From 5480488c91e1b491433994f3f3ccd4d70958fc34 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Thu, 31 Aug 2023 05:21:14 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 83075 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 83470 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..f06e0e16 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-08-23T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.12284v1","updated":"2023-08-23T17:58:14Z","published":"2023-08-23T17:58:14Z","title":"D4: Improving LLM Pretraining via Document De-Duplication and\n Diversification","summary":" Over recent years, an increasing amount of compute and data has been poured\ninto training large language models (LLMs), usually by doing one-pass learning\non as many tokens as possible randomly selected from large-scale web corpora.\nWhile training on ever-larger portions of the internet leads to consistent\nperformance improvements, the size of these improvements diminishes with scale,\nand there has been little work exploring the effect of data selection on\npre-training and downstream performance beyond simple de-duplication methods\nsuch as MinHash. Here, we show that careful data selection (on top of\nde-duplicated data) via pre-trained model embeddings can speed up training (20%\nefficiency gains) and improves average downstream accuracy on 16 NLP tasks (up\nto 2%) at the 6.7B model scale. Furthermore, we show that repeating data\nintelligently consistently outperforms baseline training (while repeating\nrandom data performs worse than baseline training). Our results indicate that\nclever data selection can significantly improve LLM pre-training, calls into\nquestion the common practice of training for a single epoch on as much data as\npossible, and demonstrates a path to keep improving our models past the limits\nof randomly sampling web data.\n","authors":["Kushal Tirumala","Daniel Simig","Armen Aghajanyan","Ari S. Morcos"],"pdf_url":"https://arxiv.org/pdf/2308.12284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12272v1","updated":"2023-08-23T17:40:35Z","published":"2023-08-23T17:40:35Z","title":"Simple is Better and Large is Not Enough: Towards Ensembling of\n Foundational Language Models","summary":" Foundational Language Models (FLMs) have advanced natural language processing\n(NLP) research. Current researchers are developing larger FLMs (e.g., XLNet,\nT5) to enable contextualized language representation, classification, and\ngeneration. While developing larger FLMs has been of significant advantage, it\nis also a liability concerning hallucination and predictive uncertainty.\nFundamentally, larger FLMs are built on the same foundations as smaller FLMs\n(e.g., BERT); hence, one must recognize the potential of smaller FLMs which can\nbe realized through an ensemble. In the current research, we perform a reality\ncheck on FLMs and their ensemble on benchmark and real-world datasets. We\nhypothesize that the ensembling of FLMs can influence the individualistic\nattention of FLMs and unravel the strength of coordination and cooperation of\ndifferent FLMs. We utilize BERT and define three other ensemble techniques:\n{Shallow, Semi, and Deep}, wherein the Deep-Ensemble introduces a\nknowledge-guided reinforcement learning approach. We discovered that the\nsuggested Deep-Ensemble BERT outperforms its large variation i.e. BERTlarge, by\na factor of many times using datasets that show the usefulness of NLP in\nsensitive fields, such as mental health.\n","authors":["Nancy Tyagi","Aidin Shiri","Surjodeep Sarkar","Abhishek Kumar Umrawal","Manas Gaur"],"pdf_url":"https://arxiv.org/pdf/2308.12272v1.pdf","comment":"Accepted at the 10th Mid-Atlantic Student Colloquium on Speech,\n Language and Learning (MASC-SLL 2023)"},{"id":"http://arxiv.org/abs/2308.11601v2","updated":"2023-08-23T17:34:17Z","published":"2023-08-22T17:48:24Z","title":"Tryage: Real-time, intelligent Routing of User Prompts to Large Language\n Models","summary":" The introduction of the transformer architecture and the self-attention\nmechanism has led to an explosive production of language models trained on\nspecific downstream tasks and data domains. With over 200, 000 models in the\nHugging Face ecosystem, users grapple with selecting and optimizing models to\nsuit multifaceted workflows and data domains while addressing computational,\nsecurity, and recency concerns. There is an urgent need for machine learning\nframeworks that can eliminate the burden of model selection and customization\nand unleash the incredible power of the vast emerging model library for end\nusers. Here, we propose a context-aware routing system, Tryage, that leverages\na language model router for optimal selection of expert models from a model\nlibrary based on analysis of individual input prompts. Inspired by the thalamic\nrouter in the brain, Tryage employs a perceptive router to predict down-stream\nmodel performance on prompts and, then, makes a routing decision using an\nobjective function that integrates performance predictions with user goals and\nconstraints that are incorporated through flags (e.g., model size, model\nrecency). Tryage allows users to explore a Pareto front and automatically\ntrade-off between task accuracy and secondary goals including minimization of\nmodel size, recency, security, verbosity, and readability. Across heterogeneous\ndata sets that include code, text, clinical data, and patents, the Tryage\nframework surpasses Gorilla and GPT3.5 turbo in dynamic model selection\nidentifying the optimal model with an accuracy of 50.9% , compared to 23.6% by\nGPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how\nrouting models can be applied to program and control the behavior of\nmulti-model LLM systems to maximize efficient use of the expanding and evolving\nlanguage model ecosystem.\n","authors":["Surya Narayanan Hari","Matt Thomson"],"pdf_url":"https://arxiv.org/pdf/2308.11601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12261v1","updated":"2023-08-23T17:28:21Z","published":"2023-08-23T17:28:21Z","title":"Prompt2Model: Generating Deployable Models from Natural Language\n Instructions","summary":" Large language models (LLMs) enable system builders today to create competent\nNLP systems through prompting, where they only need to describe the task in\nnatural language and provide a few examples. However, in other ways, LLMs are a\nstep backward from traditional special-purpose NLP models; they require\nextensive computational resources for deployment and can be gated behind APIs.\nIn this paper, we propose Prompt2Model, a general-purpose method that takes a\nnatural language task description like the prompts provided to LLMs, and uses\nit to train a special-purpose model that is conducive to deployment. This is\ndone through a multi-step process of retrieval of existing datasets and\npretrained models, dataset generation using LLMs, and supervised fine-tuning on\nthese retrieved and generated datasets. Over three tasks, we demonstrate that\ngiven the same few-shot prompt as input, Prompt2Model trains models that\noutperform the results of a strong LLM, gpt-3.5-turbo, by an average of 20%\nwhile being up to 700 times smaller. We also show that this data can be used to\nobtain reliable performance estimates of model performance, enabling model\ndevelopers to assess model reliability before deployment. Prompt2Model is\navailable open-source at https://github.com/neulab/prompt2model.\n","authors":["Vijay Viswanathan","Chenyang Zhao","Amanda Bertsch","Tongshuang Wu","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2308.12261v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.10261v2","updated":"2023-08-23T16:49:33Z","published":"2023-08-20T13:15:18Z","title":"How Good Are Large Language Models at Out-of-Distribution Detection?","summary":" Out-of-distribution (OOD) detection plays a vital role in enhancing the\nreliability of machine learning (ML) models. The emergence of large language\nmodels (LLMs) has catalyzed a paradigm shift within the ML community,\nshowcasing their exceptional capabilities across diverse natural language\nprocessing tasks. While existing research has probed OOD detection with\nrelative small-scale Transformers like BERT, RoBERTa and GPT-2, the stark\ndifferences in scales, pre-training objectives, and inference paradigms call\ninto question the applicability of these findings to LLMs. This paper embarks\non a pioneering empirical investigation of OOD detection in the domain of LLMs,\nfocusing on LLaMA series ranging from 7B to 65B in size. We thoroughly evaluate\ncommonly-used OOD detectors, scrutinizing their performance in both zero-grad\nand fine-tuning scenarios. Notably, we alter previous discriminative\nin-distribution fine-tuning into generative fine-tuning, aligning the\npre-training objective of LLMs with downstream tasks. Our findings unveil that\na simple cosine distance OOD detector demonstrates superior efficacy,\noutperforming other OOD detectors. We provide an intriguing explanation for\nthis phenomenon by highlighting the isotropic nature of the embedding spaces of\nLLMs, which distinctly contrasts with the anisotropic property observed in\nsmaller BERT family models. The new insight enhances our understanding of how\nLLMs detect OOD data, thereby enhancing their adaptability and reliability in\ndynamic environments.\n","authors":["Bo Liu","Liming Zhan","Zexin Lu","Yujie Feng","Lei Xue","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.10261v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.12247v1","updated":"2023-08-23T16:48:04Z","published":"2023-08-23T16:48:04Z","title":"How to Protect Copyright Data in Optimization of Large Language Models?","summary":" Large language models (LLMs) and generative AI have played a transformative\nrole in computer research and applications. Controversy has arisen as to\nwhether these models output copyrighted data, which can occur if the data the\nmodels are trained on is copyrighted. LLMs are built on the transformer neural\nnetwork architecture, which in turn relies on a mathematical computation called\nAttention that uses the softmax function.\n In this paper, we show that large language model training and optimization\ncan be seen as a softmax regression problem. We then establish a method of\nefficiently performing softmax regression, in a way that prevents the\nregression function from generating copyright data. This establishes a\ntheoretical method of training large language models in a way that avoids\ngenerating copyright data.\n","authors":["Timothy Chu","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.12247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08986v2","updated":"2023-08-23T16:31:34Z","published":"2022-12-18T01:57:30Z","title":"Low-Resource Authorship Style Transfer: Can Non-Famous Authors Be\n Imitated?","summary":" Authorship style transfer involves altering text to match the style of a\ntarget author whilst preserving the original meaning. Existing unsupervised\napproaches like STRAP have largely focused on style transfer to target authors\nwith many examples of their writing style in books, speeches, or other\npublished works. This high-resource training data requirement (often greater\nthan 100,000 words) makes these approaches primarily useful for style transfer\nto published authors, politicians, or other well-known figures and authorship\nstyles, while style transfer to non-famous authors has not been well-studied.\nWe introduce the \\textit{low-resource authorship style transfer} task, a more\nchallenging class of authorship style transfer where only a limited amount of\ntext in the target author's style may exist. In our experiments, we\nspecifically choose source and target authors from Reddit and style transfer\ntheir Reddit posts, limiting ourselves to just 16 posts (on average ~500 words)\nof the target author's style. Style transfer accuracy is typically measured by\nhow often a classifier or human judge will classify an output as written by the\ntarget author. Recent authorship representations models excel at authorship\nidentification even with just a few writing samples, making automatic\nevaluation of this task possible for the first time through evaluation metrics\nwe propose. Our results establish an in-context learning technique we develop\nas the strongest baseline, though we find current approaches do not yet achieve\nmastery of this challenging task. We release our data and implementations to\nencourage further investigation.\n","authors":["Ajay Patel","Nicholas Andrews","Chris Callison-Burch"],"pdf_url":"https://arxiv.org/pdf/2212.08986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12219v1","updated":"2023-08-23T16:01:12Z","published":"2023-08-23T16:01:12Z","title":"Diffusion Language Models Can Perform Many Tasks with Scaling and\n Instruction-Finetuning","summary":" The recent surge of generative AI has been fueled by the generative power of\ndiffusion probabilistic models and the scalable capabilities of large language\nmodels. Despite their potential, it remains elusive whether diffusion language\nmodels can solve general language tasks comparable to their autoregressive\ncounterparts. This paper demonstrates that scaling diffusion models w.r.t.\ndata, sizes, and tasks can effectively make them strong language learners. We\nbuild competent diffusion language models at scale by first acquiring knowledge\nfrom massive data via masked language modeling pretraining thanks to their\nintrinsic connections. We then reprogram pretrained masked language models into\ndiffusion language models via diffusive adaptation, wherein task-specific\nfinetuning and instruction finetuning are explored to unlock their versatility\nin solving general language tasks. Experiments show that scaling diffusion\nlanguage models consistently improves performance across downstream language\ntasks. We further discover that instruction finetuning can elicit zero-shot and\nfew-shot in-context learning abilities that help tackle many unseen tasks by\nfollowing natural language instructions, and show promise in advanced and\nchallenging abilities such as reasoning\n","authors":["Jiasheng Ye","Zaixiang Zheng","Yu Bao","Lihua Qian","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.12219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12215v1","updated":"2023-08-23T15:52:20Z","published":"2023-08-23T15:52:20Z","title":"The Challenges of Machine Learning for Trust and Safety: A Case Study on\n Misinformation Detection","summary":" We examine the disconnect between scholarship and practice in applying\nmachine learning to trust and safety problems, using misinformation detection\nas a case study. We systematize literature on automated detection of\nmisinformation across a corpus of 270 well-cited papers in the field. We then\nexamine subsets of papers for data and code availability, design missteps,\nreproducibility, and generalizability. We find significant shortcomings in the\nliterature that call into question claimed performance and practicality.\nDetection tasks are often meaningfully distinct from the challenges that online\nservices actually face. Datasets and model evaluation are often\nnon-representative of real-world contexts, and evaluation frequently is not\nindependent of model training. Data and code availability is poor. Models do\nnot generalize well to out-of-domain data. Based on these results, we offer\nrecommendations for evaluating machine learning applications to trust and\nsafety problems. Our aim is for future work to avoid the pitfalls that we\nidentify.\n","authors":["Madelyne Xiao","Jonathan Mayer"],"pdf_url":"https://arxiv.org/pdf/2308.12215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12202v1","updated":"2023-08-23T15:39:42Z","published":"2023-08-23T15:39:42Z","title":"Curriculum Learning with Adam: The Devil Is in the Wrong Details","summary":" Curriculum learning (CL) posits that machine learning models -- similar to\nhumans -- may learn more efficiently from data that match their current\nlearning progress. However, CL methods are still poorly understood and, in\nparticular for natural language processing (NLP), have achieved only limited\nsuccess. In this paper, we explore why. Starting from an attempt to replicate\nand extend a number of recent curriculum methods, we find that their results\nare surprisingly brittle when applied to NLP. A deep dive into the\n(in)effectiveness of the curricula in some scenarios shows us why: when\ncurricula are employed in combination with the popular Adam optimisation\nalgorithm, they oftentimes learn to adapt to suboptimally chosen optimisation\nparameters for this algorithm. We present a number of different case studies\nwith different common hand-crafted and automated CL approaches to illustrate\nthis phenomenon, and we find that none of them outperforms optimisation with\nonly Adam with well-chosen hyperparameters. As such, our results contribute to\nunderstanding why CL methods work, but at the same time urge caution when\nclaiming positive results.\n","authors":["Lucas Weber","Jaap Jumelet","Paul Michel","Elia Bruni","Dieuwke Hupkes"],"pdf_url":"https://arxiv.org/pdf/2308.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11534v2","updated":"2023-08-23T14:33:53Z","published":"2023-08-21T06:51:56Z","title":"Large Language Model as a User Simulator","summary":" The unparalleled performance of closed-sourced ChatGPT has sparked efforts\ntowards its democratization, with notable strides made by leveraging real user\nand ChatGPT conversations, as evidenced by Vicuna. However, while current\nendeavors like Baize and UltraChat aim to auto-generate conversational data due\nto challenges in gathering human participation, they primarily rely on ChatGPT\nto simulate human behaviors based on directives rather than genuine human\nlearning. This results in a limited scope, diminished diversity, and an absence\nof genuine multi-round conversational dynamics. To address the above issues, we\ninnovatively target human questions extracted from genuine human-machine\nconversations as a learning goal and train a user simulator, UserGPT, to\nproduce a high-quality human-centric synthetic conversation dataset, RealChat.\nSubsequently, this dataset trains our assistant model, ReaLM. Experimentally,\nReaLM outpaces baseline models in both Vicuna-Bench and MT-Bench by pairwise\ncomparison when considering equivalent training set sizes, and manual\nevaluation also shows that our model is highly competitive. Impressively, when\nfine-tuned with the latest LLaMA 2 model, ReaLM secured a leading score of 6.33\nin the MT-Bench, outshining the contemporary same-scale models, including the\nLLaMA-2-7B-chat model. Further in-depth analysis demonstrates the scalability\nand transferability of our approach. A preliminary exploration into the\ninterplay between training set data quality and resultant model performance is\nalso undertaken, laying a robust groundwork for future investigations. The code\nis available at https://github.com/FreedomIntelligence/ReaLM.\n","authors":["Chuyi Kong","Yaxin Fan","Xiang Wan","Feng Jiang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02206v2","updated":"2023-08-23T14:23:48Z","published":"2023-03-03T20:35:38Z","title":"Domain Specific Question Answering Over Knowledge Graphs Using Logical\n Programming and Large Language Models","summary":" Answering questions over domain-specific graphs requires a tailored approach\ndue to the limited number of relations and the specific nature of the domain.\nOur approach integrates classic logical programming languages into large\nlanguage models (LLMs), enabling the utilization of logical reasoning\ncapabilities to tackle the KGQA task. By representing the questions as Prolog\nqueries, which are readable and near close to natural language in\nrepresentation, we facilitate the generation of programmatically derived\nanswers. To validate the effectiveness of our approach, we evaluate it using a\nwell-known benchmark dataset, MetaQA. Our experimental results demonstrate that\nour method achieves accurate identification of correct answer entities for all\ntest questions, even when trained on a small fraction of annotated data.\nOverall, our work presents a promising approach to addressing question\nanswering over domain-specific graphs, offering an explainable and robust\nsolution by incorporating logical programming languages.\n","authors":["Navid Madani","Rohini K. Srihari","Kenneth Joseph"],"pdf_url":"https://arxiv.org/pdf/2303.02206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12157v1","updated":"2023-08-23T14:18:44Z","published":"2023-08-23T14:18:44Z","title":"Evaluation of Faithfulness Using the Longest Supported Subsequence","summary":" As increasingly sophisticated language models emerge, their trustworthiness\nbecomes a pivotal issue, especially in tasks such as summarization and\nquestion-answering. Ensuring their responses are contextually grounded and\nfaithful is challenging due to the linguistic diversity and the myriad of\npossible answers. In this paper, we introduce a novel approach to evaluate\nfaithfulness of machine-generated text by computing the longest noncontinuous\nsubstring of the claim that is supported by the context, which we refer to as\nthe Longest Supported Subsequence (LSS). Using a new human-annotated dataset,\nwe finetune a model to generate LSS. We introduce a new method of evaluation\nand demonstrate that these metrics correlate better with human ratings when LSS\nis employed, as opposed to when it is not. Our proposed metric demonstrates an\n18% enhancement over the prevailing state-of-the-art metric for faithfulness on\nour dataset. Our metric consistently outperforms other metrics on a\nsummarization dataset across six different models. Finally, we compare several\npopular Large Language Models (LLMs) for faithfulness using this metric. We\nrelease the human-annotated dataset built for predicting LSS and our fine-tuned\nmodel for evaluating faithfulness.\n","authors":["Anirudh Mittal","Timo Schick","Mikel Artetxe","Jane Dwivedi-Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10652v4","updated":"2023-08-23T14:00:36Z","published":"2023-07-20T07:33:30Z","title":"Exploring the Landscape of Natural Language Processing Research","summary":" As an efficient approach to understand, generate, and process natural\nlanguage texts, research in natural language processing (NLP) has exhibited a\nrapid spread and wide adoption in recent years. Given the increasing research\nwork in this area, several NLP-related approaches have been surveyed in the\nresearch community. However, a comprehensive study that categorizes established\ntopics, identifies trends, and outlines areas for future research remains\nabsent. Contributing to closing this gap, we have systematically classified and\nanalyzed research papers in the ACL Anthology. As a result, we present a\nstructured overview of the research landscape, provide a taxonomy of fields of\nstudy in NLP, analyze recent developments in NLP, summarize our findings, and\nhighlight directions for future work.\n","authors":["Tim Schopf","Karim Arabi","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.10652v4.pdf","comment":"Extended version of the paper published at the 14th International\n Conference on Recent Advances in Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2308.12131v1","updated":"2023-08-23T13:37:02Z","published":"2023-08-23T13:37:02Z","title":"Semantic Change Detection for the Romanian Language","summary":" Automatic semantic change methods try to identify the changes that appear\nover time in the meaning of words by analyzing their usage in diachronic\ncorpora. In this paper, we analyze different strategies to create static and\ncontextual word embedding models, i.e., Word2Vec and ELMo, on real-world\nEnglish and Romanian datasets. To test our pipeline and determine the\nperformance of our models, we first evaluate both word embedding models on an\nEnglish dataset (SEMEVAL-CCOHA). Afterward, we focus our experiments on a\nRomanian dataset, and we underline different aspects of semantic changes in\nthis low-resource language, such as meaning acquisition and loss. The\nexperimental results show that, depending on the corpus, the most important\nfactors to consider are the choice of model and the distance to calculate a\nscore for detecting semantic change.\n","authors":["Ciprian-Octavian Truică","Victor Tudose","Elena-Simona Apostol"],"pdf_url":"https://arxiv.org/pdf/2308.12131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12097v1","updated":"2023-08-23T12:36:57Z","published":"2023-08-23T12:36:57Z","title":"Instruction Position Matters in Sequence Generation with Large Language\n Models","summary":" Large language models (LLMs) are capable of performing conditional sequence\ngeneration tasks, such as translation or summarization, through instruction\nfine-tuning. The fine-tuning data is generally sequentially concatenated from a\nspecific task instruction, an input sentence, and the corresponding response.\nConsidering the locality modeled by the self-attention mechanism of LLMs, these\nmodels face the risk of instruction forgetting when generating responses for\nlong input sentences. To mitigate this issue, we propose enhancing the\ninstruction-following capability of LLMs by shifting the position of task\ninstructions after the input sentences. Theoretical analysis suggests that our\nstraightforward method can alter the model's learning focus, thereby\nemphasizing the training of instruction-following capabilities. Concurrently,\nexperimental results demonstrate that our approach consistently outperforms\ntraditional settings across various model scales (1B / 7B / 13B) and different\nsequence generation tasks (translation and summarization), without any\nadditional data or annotation costs. Notably, our method significantly improves\nthe zero-shot performance on conditional sequence generation, e.g., up to 9.7\nBLEU points on WMT zero-shot translation tasks.\n","authors":["Yijin Liu","Xianfeng Zeng","Fandong Meng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12097v1.pdf","comment":"Codes and results are at\n https://github.com/Adaxry/Post-Instruction/tree/main"},{"id":"http://arxiv.org/abs/2307.00009v2","updated":"2023-08-23T12:24:02Z","published":"2023-06-18T20:06:58Z","title":"Comparison of Machine Learning Methods for Assigning Software Issues to\n Team Members","summary":" Software issues contain units of work to fix, improve, or create new threads\nduring the development and facilitate communication among the team members.\nAssigning an issue to the most relevant team member and determining a category\nof an issue is a tedious and challenging task. Wrong classifications cause\ndelays and rework in the project and trouble among the team members. This paper\nproposes a set of carefully curated linguistic features for shallow machine\nlearning methods and compares the performance of shallow and ensemble methods\nwith deep language models. Unlike the state-of-the-art, we assign issues to\nfour roles (designer, developer, tester, and leader) rather than to specific\nindividuals or teams to contribute to the generality of our solution. We also\nconsider the level of experience of the developers to reflect the industrial\npractices in our solution formulation. We collect and annotate five industrial\ndata sets from one of the top three global television producers to evaluate our\nproposal and compare it with deep language models. Our data sets contain 5324\nissues in total. We show that an ensemble classifier of shallow techniques\nachieves 0.92 for issue assignment in accuracy which is statistically\ncomparable to the state-of-the-art deep language models. The contributions\ninclude the public sharing of five annotated industrial issue data sets, the\ndevelopment of a clear and comprehensive feature set, the introduction of a\nnovel label set, and the validation of the efficacy of an ensemble classifier\nof shallow machine learning techniques.\n","authors":["Büşra Tabak","Fatma Başak Aydemir"],"pdf_url":"https://arxiv.org/pdf/2307.00009v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12086v1","updated":"2023-08-23T12:11:27Z","published":"2023-08-23T12:11:27Z","title":"Out of the Cage: How Stochastic Parrots Win in Cyber Security\n Environments","summary":" Large Language Models (LLMs) have gained widespread popularity across diverse\ndomains involving text generation, summarization, and various natural language\nprocessing tasks. Despite their inherent limitations, LLM-based designs have\nshown promising capabilities in planning and navigating open-world scenarios.\nThis paper introduces a novel application of pre-trained LLMs as agents within\ncybersecurity network environments, focusing on their utility for sequential\ndecision-making processes.\n We present an approach wherein pre-trained LLMs are leveraged as attacking\nagents in two reinforcement learning environments. Our proposed agents\ndemonstrate similar or better performance against state-of-the-art agents\ntrained for thousands of episodes in most scenarios and configurations. In\naddition, the best LLM agents perform similarly to human testers of the\nenvironment without any additional training process. This design highlights the\npotential of LLMs to efficiently address complex decision-making tasks within\ncybersecurity.\n Furthermore, we introduce a new network security environment named\nNetSecGame. The environment is designed to eventually support complex\nmulti-agent scenarios within the network security domain. The proposed\nenvironment mimics real network attacks and is designed to be highly modular\nand adaptable for various scenarios.\n","authors":["Maria Rigaki","Ondřej Lukáš","Carlos A. Catania","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2308.12086v1.pdf","comment":"Under review. 10 pages plus appendices, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.12067v1","updated":"2023-08-23T11:27:30Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models acquire their instruction-following\ncapabilities through a two-stage training process: pre-training on image-text\npairs and fine-tuning on supervised vision-language instruction data. Recent\nstudies have shown that large language models can achieve satisfactory results\neven with a limited amount of high-quality instruction-following data. In this\npaper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset\ncomprising only 200 examples, amounting to approximately 6% of the\ninstruction-following data used in the alignment dataset for MiniGPT-4. We\nfirst propose several metrics to access the quality of multimodal instruction\ndata. Based on these metrics, we present a simple and effective data selector\nto automatically identify and filter low-quality vision-language data. By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations (e.g., visual question answering, GPT-4 preference).\nOverall, our findings demonstrate that less but high-quality instruction tuning\ndata is efficient to enable multimodal large language models to generate better\noutput.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10592v2","updated":"2023-08-23T11:01:21Z","published":"2023-08-21T09:47:31Z","title":"BAN-PL: a Novel Polish Dataset of Banned Harmful and Offensive Content\n from Wykop.pl web service","summary":" Advances in automated detection of offensive language online, including hate\nspeech and cyberbullying, require improved access to publicly available\ndatasets comprising social media content. In this paper, we introduce BAN-PL,\nthe first open dataset in the Polish language that encompasses texts flagged as\nharmful and subsequently removed by professional moderators. The dataset\nencompasses a total of 691,662 pieces of content from a popular social\nnetworking service, Wykop, often referred to as the \"Polish Reddit\", including\nboth posts and comments, and is evenly distributed into two distinct classes:\n\"harmful\" and \"neutral\". We provide a comprehensive description of the data\ncollection and preprocessing procedures, as well as highlight the linguistic\nspecificity of the data. The BAN-PL dataset, along with advanced preprocessing\nscripts for, i.a., unmasking profanities, will be publicly available.\n","authors":["Inez Okulska","Kinga Głąbińska","Anna Kołos","Agnieszka Karlińska","Emilia Wiśnios","Adam Nowakowski","Paweł Ellerik","Andrzej Prałat"],"pdf_url":"https://arxiv.org/pdf/2308.10592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12060v1","updated":"2023-08-23T11:00:36Z","published":"2023-08-23T11:00:36Z","title":"FlexKBQA: A Flexible LLM-Powered Framework for Few-Shot Knowledge Base\n Question Answering","summary":" Knowledge base question answering (KBQA) is a critical yet challenging task\ndue to the vast number of entities within knowledge bases and the diversity of\nnatural language questions posed by users. Unfortunately, the performance of\nmost KBQA models tends to decline significantly in real-world scenarios where\nhigh-quality annotated data is insufficient. To mitigate the burden associated\nwith manual annotation, we introduce FlexKBQA by utilizing Large Language\nModels (LLMs) as program translators for addressing the challenges inherent in\nthe few-shot KBQA task. Specifically, FlexKBQA leverages automated algorithms\nto sample diverse programs, such as SPARQL queries, from the knowledge base,\nwhich are subsequently converted into natural language questions via LLMs. This\nsynthetic dataset facilitates training a specialized lightweight model for the\nKB. Additionally, to reduce the barriers of distribution shift between\nsynthetic data and real user questions, FlexKBQA introduces an executionguided\nself-training method to iterative leverage unlabeled user questions.\nFurthermore, we explore harnessing the inherent reasoning capability of LLMs to\nenhance the entire framework. Consequently, FlexKBQA delivers substantial\nflexibility, encompassing data annotation, deployment, and being domain\nagnostic. Through extensive experiments on GrailQA, WebQSP, and KQA Pro, we\nobserve that under the few-shot even the more challenging zero-shot scenarios,\nFlexKBQA achieves impressive results with a few annotations, surpassing all\nprevious baselines and even approaching the performance of supervised models,\nachieving a remarkable 93% performance relative to the fully-supervised models.\nWe posit that FlexKBQA represents a significant advancement towards exploring\nbetter integration of large and lightweight models. The code is open-sourced.\n","authors":["Zhenyu Li","Sunqi Fan","Yu Gu","Xiuxing Li","Zhichao Duan","Bowen Dong","Ning Liu","Jianyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11466v2","updated":"2023-08-23T10:46:16Z","published":"2023-08-22T14:25:15Z","title":"SONAR: Sentence-Level Multimodal and Language-Agnostic Representations","summary":" We introduce SONAR, a new multilingual and multimodal fixed-size sentence\nembedding space. Our single text encoder, covering 200 languages, substantially\noutperforms existing sentence embeddings such as LASER3 and LabSE on the xsim\nand xsim++ multilingual similarity search tasks. Speech segments can be\nembedded in the same SONAR embedding space using language-specific speech\nencoders trained in a teacher-student setting on speech transcription data. Our\nencoders outperform existing speech encoders on similarity search tasks. We\nalso provide a text decoder for 200 languages, which allows us to perform\ntext-to-text and speech-to-text machine translation, including for zero-shot\nlanguage and modality combinations. Our text-to-text results are competitive\ncompared to the state-of-the-art NLLB~1B model, despite the fixed-size\nbottleneck representation. Our zero-shot speech-to-text translation results\ncompare favorably with strong supervised baselines such as Whisper.\n","authors":["Paul-Ambroise Duquenne","Holger Schwenk","Benoît Sagot"],"pdf_url":"https://arxiv.org/pdf/2308.11466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12050v1","updated":"2023-08-23T10:41:07Z","published":"2023-08-23T10:41:07Z","title":"Aligning Language Models with Offline Reinforcement Learning from Human\n Feedback","summary":" Learning from human preferences is crucial for language models (LMs) to\neffectively cater to human needs and societal values. Previous research has\nmade notable progress by leveraging human feedback to follow instructions.\nHowever, these approaches rely primarily on online reinforcement learning (RL)\ntechniques like Proximal Policy Optimization (PPO), which have been proven\nunstable and challenging to tune for language models. Moreover, PPO requires\ncomplex distributed system implementation, hindering the efficiency of\nlarge-scale distributed training. In this study, we propose an offline\nreinforcement learning from human feedback (RLHF) framework to align LMs using\npre-generated samples without interacting with RL environments. Specifically,\nwe explore maximum likelihood estimation (MLE) with filtering, reward-weighted\nregression (RWR), and Decision Transformer (DT) to align language models to\nhuman preferences. By employing a loss function similar to supervised\nfine-tuning, our methods ensure more stable model training than PPO with a\nsimple machine learning system~(MLSys) and much fewer (around 12.3\\%) computing\nresources. Experimental results demonstrate the DT alignment outperforms other\nOffline RLHF methods and is better than PPO.\n","authors":["Jian Hu","Li Tao","June Yang","Chandler Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12045v1","updated":"2023-08-23T10:25:37Z","published":"2023-08-23T10:25:37Z","title":"CgT-GAN: CLIP-guided Text GAN for Image Captioning","summary":" The large-scale visual-language pre-trained model, Contrastive Language-Image\nPre-training (CLIP), has significantly improved image captioning for scenarios\nwithout human-annotated image-caption pairs. Recent advanced CLIP-based image\ncaptioning without human annotations follows a text-only training paradigm,\ni.e., reconstructing text from shared embedding space. Nevertheless, these\napproaches are limited by the training/inference gap or huge storage\nrequirements for text embeddings. Given that it is trivial to obtain images in\nthe real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates\nimages into the training process to enable the model to \"see\" real visual\nmodality. Particularly, we use adversarial training to teach CgT-GAN to mimic\nthe phrases of an external text corpus and CLIP-based reward to provide\nsemantic guidance. The caption generator is jointly rewarded based on the\ncaption naturalness to human language calculated from the GAN's discriminator\nand the semantic guidance reward computed by the CLIP-based reward module. In\naddition to the cosine similarity as the semantic guidance reward (i.e.,\nCLIP-cos), we further introduce a novel semantic guidance reward called\nCLIP-agg, which aligns the generated caption with a weighted text embedding by\nattentively aggregating the entire corpus. Experimental results on three\nsubtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms\nstate-of-the-art methods significantly across all metrics. Code is available at\nhttps://github.com/Lihr747/CgtGAN.\n","authors":["Jiarui Yu","Haoran Li","Yanbin Hao","Bin Zhu","Tong Xu","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2308.12045v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.12043v1","updated":"2023-08-23T10:08:10Z","published":"2023-08-23T10:08:10Z","title":"IncreLoRA: Incremental Parameter Allocation Method for\n Parameter-Efficient Fine-tuning","summary":" With the increasing size of pre-trained language models (PLMs), fine-tuning\nall the parameters in the model is not efficient, especially when there are a\nlarge number of downstream tasks, which incur significant training and storage\ncosts. Many parameter-efficient fine-tuning (PEFT) approaches have been\nproposed, among which, Low-Rank Adaptation (LoRA) is a representative approach\nthat injects trainable rank decomposition matrices into every target module.\nYet LoRA ignores the importance of parameters in different modules. To address\nthis problem, many works have been proposed to prune the parameters of LoRA.\nHowever, under limited training conditions, the upper bound of the rank of the\npruned parameter matrix is still affected by the preset values. We, therefore,\npropose IncreLoRA, an incremental parameter allocation method that adaptively\nadds trainable parameters during training based on the importance scores of\neach module. This approach is different from the pruning method as it is not\nlimited by the initial number of training parameters, and each parameter matrix\nhas a higher rank upper bound for the same training overhead. We conduct\nextensive experiments on GLUE to demonstrate the effectiveness of IncreLoRA.\nThe results show that our method owns higher parameter efficiency, especially\nwhen under the low-resource settings where our method significantly outperforms\nthe baselines. Our code is publicly available.\n","authors":["Feiyu Zhang","Liangzhi Li","Junhao Chen","Zhouqiang Jiang","Bowen Wang","Yiming Qian"],"pdf_url":"https://arxiv.org/pdf/2308.12043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12039v1","updated":"2023-08-23T09:56:59Z","published":"2023-08-23T09:56:59Z","title":"Hybrid Retrieval and Multi-stage Text Ranking Solution at TREC 2022 Deep\n Learning Track","summary":" Large-scale text retrieval technology has been widely used in various\npractical business scenarios. This paper presents our systems for the TREC 2022\nDeep Learning Track. We explain the hybrid text retrieval and multi-stage text\nranking method adopted in our solution. The retrieval stage combined the two\nstructures of traditional sparse retrieval and neural dense retrieval. In the\nranking stage, in addition to the full interaction-based ranking model built on\nlarge pre-trained language model, we also proposes a lightweight sub-ranking\nmodule to further enhance the final text ranking performance. Evaluation\nresults demonstrate the effectiveness of our proposed approach. Our models\nachieve the 1st and 4th rank on the test set of passage ranking and document\nranking respectively.\n","authors":["Guangwei Xu","Yangzhao Zhang","Longhui Zhang","Dingkun Long","Pengjun Xie","Ruijie Guo"],"pdf_url":"https://arxiv.org/pdf/2308.12039v1.pdf","comment":"TREC 2022 Deep Learning Track"},{"id":"http://arxiv.org/abs/2308.12038v1","updated":"2023-08-23T09:55:41Z","published":"2023-08-23T09:55:41Z","title":"Large Multilingual Models Pivot Zero-Shot Multimodal Learning across\n Languages","summary":" Recently there has been a significant surge in multimodal learning in terms\nof both image-to-text and text-to-image generation. However, the success is\ntypically limited to English, leaving other languages largely behind. Building\na competitive counterpart in other languages is highly challenging due to the\nlow-resource nature of non-English multimodal data (i.e., lack of large-scale,\nhigh-quality image-text data). In this work, we propose MPM, an effective\ntraining paradigm for training large multimodal models in low-resource\nlanguages. MPM demonstrates that Multilingual language models can Pivot\nzero-shot Multimodal learning across languages. Specifically, based on a strong\nmultilingual large language model, multimodal models pretrained on English-only\nimage-text data can well generalize to other languages in a zero-shot manner\nfor both image-to-text and text-to-image generation, even surpassing models\ntrained on image-text data in native languages. Taking Chinese as a practice of\nMPM, we build large multimodal models VisCPM in image-to-text and text-to-image\ngeneration, which achieve state-of-the-art (open-source) performance in\nChinese. To facilitate future research, we open-source codes and model weights\nat https://github.com/OpenBMB/VisCPM.git.\n","authors":["Jinyi Hu","Yuan Yao","Chongyi Wang","Shan Wang","Yinxu Pan","Qianyu Chen","Tianyu Yu","Hanghao Wu","Yue Zhao","Haoye Zhang","Xu Han","Yankai Lin","Jiao Xue","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12038v1.pdf","comment":"https://github.com/OpenBMB/VisCPM.git"},{"id":"http://arxiv.org/abs/2308.12033v1","updated":"2023-08-23T09:46:37Z","published":"2023-08-23T09:46:37Z","title":"PREFER: Prompt Ensemble Learning via Feedback-Reflect-Refine","summary":" As an effective tool for eliciting the power of Large Language Models (LLMs),\nprompting has recently demonstrated unprecedented abilities across a variety of\ncomplex tasks. To further improve the performance, prompt ensemble has\nattracted substantial interest for tackling the hallucination and instability\nof LLMs. However, existing methods usually adopt a two-stage paradigm, which\nrequires a pre-prepared set of prompts with substantial manual effort, and is\nunable to perform directed optimization for different weak learners. In this\npaper, we propose a simple, universal, and automatic method named PREFER (Pompt\nEnsemble learning via Feedback-Reflect-Refine) to address the stated\nlimitations. Specifically, given the fact that weak learners are supposed to\nfocus on hard examples during boosting, PREFER builds a feedback mechanism for\nreflecting on the inadequacies of existing weak learners. Based on this, the\nLLM is required to automatically synthesize new prompts for iterative\nrefinement. Moreover, to enhance stability of the prompt effect evaluation, we\npropose a novel prompt bagging method involving forward and backward thinking,\nwhich is superior to majority voting and is beneficial for both feedback and\nweight calculation in boosting. Extensive experiments demonstrate that our\nPREFER achieves state-of-the-art performance in multiple types of tasks by a\nsignificant margin. We have made our code publicly available.\n","authors":["Chenrui Zhang","Lin Liu","Jinpeng Wang","Chuyuan Wang","Xiao Sun","Hongyu Wang","Mingchen Cai"],"pdf_url":"https://arxiv.org/pdf/2308.12033v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.12032v1","updated":"2023-08-23T09:45:29Z","published":"2023-08-23T09:45:29Z","title":"From Quantity to Quality: Boosting LLM Performance with Self-Guided Data\n Selection for Instruction Tuning","summary":" In the realm of Large Language Models, the balance between instruction data\nquality and quantity has become a focal point. Recognizing this, we introduce a\nself-guided methodology for LLMs to autonomously discern and select cherry\nsamples from vast open-source datasets, effectively minimizing manual curation\nand potential cost for instruction tuning an LLM. Our key innovation, the\nInstruction-Following Difficulty (IFD) metric, emerges as a pivotal tool to\nidentify discrepancies between a model's expected responses and its autonomous\ngeneration prowess. Through the adept application of IFD, cherry samples are\npinpointed, leading to a marked uptick in model training efficiency. Empirical\nvalidations on renowned datasets like Alpaca and WizardLM underpin our\nfindings; with a mere 10% of conventional data input, our strategy showcases\nimproved results. This synthesis of self-guided cherry-picking and the IFD\nmetric signifies a transformative leap in the optimization of LLMs, promising\nboth efficiency and resource-conscious advancements.\n","authors":["Ming Li","Yong Zhang","Zhitao Li","Jiuhai Chen","Lichang Chen","Ning Cheng","Jianzong Wang","Tianyi Zhou","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.12032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12030v1","updated":"2023-08-23T09:43:10Z","published":"2023-08-23T09:43:10Z","title":"Prompt-Based Length Controlled Generation with Reinforcement Learning","summary":" Recently, large language models (LLMs) like ChatGPT and GPT-4 have attracted\ngreat attention given their surprising improvement and performance. Length\ncontrolled generation of LLMs emerges as an important topic, which also enables\nusers to fully leverage the capability of LLMs in more real-world scenarios\nlike generating a proper answer or essay of a desired length. In addition, the\nautoregressive generation in LLMs is extremely time-consuming, while the\nability of controlling this generated length can arbitrarily reduce the\ninference cost by limiting the length, and thus satisfy different needs.\nTherefore, we aim to propose a prompt-based length control method to achieve\nthis length controlled generation, which can also be widely applied in\nGPT-style LLMs. In particular, we adopt reinforcement learning with the reward\nsignal given by either trainable or rule-based reward model, which further\naffects the generation of LLMs via rewarding a pre-defined target length.\nExperiments show that our method significantly improves the accuracy of\nprompt-based length control for summarization task on popular datasets like\nCNNDM and NYT. We believe this length-controllable ability can provide more\npotentials towards the era of LLMs.\n","authors":["Renlong Jie","Xiaojun Meng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07758v3","updated":"2023-08-23T09:35:33Z","published":"2023-08-15T13:19:59Z","title":"Forward-Backward Reasoning in Large Language Models for Verification","summary":" Chain-of-Though (CoT) prompting has shown promising performance in various\nreasoning tasks. Recently, Self-Consistency \\citep{wang2023selfconsistency}\nproposes to sample a diverse set of reasoning chains which may lead to\ndifferent answers while the answer that receives the most votes is selected. In\nthis paper, we propose a novel method to use backward reasoning in verifying\ncandidate answers. We mask a token in the question by ${\\bf x}$ and ask the LLM\nto predict the masked token when a candidate answer is provided by \\textit{a\nsimple template}, i.e., \"\\textit{\\textbf{If we know the answer of the above\nquestion is \\{a candidate answer\\}, what is the value of unknown variable ${\\bf\nx}$?}}\" Intuitively, the LLM is expected to predict the masked token\nsuccessfully if the provided candidate answer is correct. We further propose\nFOBAR to combine forward and backward reasoning for estimating the probability\nof candidate answers. We conduct extensive experiments on six data sets and\nthree LLMs. Experimental results demonstrate that FOBAR achieves\nstate-of-the-art performance on various reasoning benchmarks.\n","authors":["Weisen Jiang","Han Shi","Longhui Yu","Zhengying Liu","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2308.07758v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.12025v1","updated":"2023-08-23T09:32:40Z","published":"2023-08-23T09:32:40Z","title":"Knowledge-injected Prompt Learning for Chinese Biomedical Entity\n Normalization","summary":" The Biomedical Entity Normalization (BEN) task aims to align raw,\nunstructured medical entities to standard entities, thus promoting data\ncoherence and facilitating better downstream medical applications. Recently,\nprompt learning methods have shown promising results in this task. However,\nexisting research falls short in tackling the more complex Chinese BEN task,\nespecially in the few-shot scenario with limited medical data, and the vast\npotential of the external medical knowledge base has yet to be fully harnessed.\nTo address these challenges, we propose a novel Knowledge-injected Prompt\nLearning (PL-Knowledge) method. Specifically, our approach consists of five\nstages: candidate entity matching, knowledge extraction, knowledge encoding,\nknowledge injection, and prediction output. By effectively encoding the\nknowledge items contained in medical entities and incorporating them into our\ntailor-made knowledge-injected templates, the additional knowledge enhances the\nmodel's ability to capture latent relationships between medical entities, thus\nachieving a better match with the standard entities. We extensively evaluate\nour model on a benchmark dataset in both few-shot and full-scale scenarios. Our\nmethod outperforms existing baselines, with an average accuracy boost of\n12.96\\% in few-shot and 0.94\\% in full-data cases, showcasing its excellence in\nthe BEN task.\n","authors":["Songhua Yang","Chenghao Zhang","Hongfei Xu","Yuxiang Jia"],"pdf_url":"https://arxiv.org/pdf/2308.12025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12022v1","updated":"2023-08-23T09:29:29Z","published":"2023-08-23T09:29:29Z","title":"Reranking Passages with Coarse-to-Fine Neural Retriever using\n List-Context Information","summary":" Passage reranking is a crucial task in many applications, particularly when\ndealing with large-scale documents. Traditional neural architectures are\nlimited in retrieving the best passage for a question because they usually\nmatch the question to each passage separately, seldom considering contextual\ninformation in other passages that can provide comparison and reference\ninformation. This paper presents a list-context attention mechanism to augment\nthe passage representation by incorporating the list-context information from\nother candidates. The proposed coarse-to-fine (C2F) neural retriever addresses\nthe out-of-memory limitation of the passage attention mechanism by dividing the\nlist-context modeling process into two sub-processes, allowing for efficient\nencoding of context information from a large number of candidate answers. This\nmethod can be generally used to encode context information from any number of\ncandidate answers in one pass. Different from most multi-stage information\nretrieval architectures, this model integrates the coarse and fine rankers into\nthe joint optimization process, allowing for feedback between the two layers to\nupdate the model simultaneously. Experiments demonstrate the effectiveness of\nthe proposed approach.\n","authors":["Hongyin Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.12022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12014v1","updated":"2023-08-23T09:11:13Z","published":"2023-08-23T09:11:13Z","title":"From Instructions to Intrinsic Human Values -- A Survey of Alignment\n Goals for Big Models","summary":" Big models, exemplified by Large Language Models (LLMs), are models typically\npre-trained on massive data and comprised of enormous parameters, which not\nonly obtain significantly improved performance across diverse tasks but also\npresent emergent capabilities absent in smaller models. However, the growing\nintertwining of big models with everyday human lives poses potential risks and\nmight cause serious social harm. Therefore, many efforts have been made to\nalign LLMs with humans to make them better follow user instructions and satisfy\nhuman preferences. Nevertheless, `what to align with' has not been fully\ndiscussed, and inappropriate alignment goals might even backfire. In this\npaper, we conduct a comprehensive survey of different alignment goals in\nexisting work and trace their evolution paths to help identify the most\nessential goal. Particularly, we investigate related works from two\nperspectives: the definition of alignment goals and alignment evaluation. Our\nanalysis encompasses three distinct levels of alignment goals and reveals a\ngoal transformation from fundamental abilities to value orientation, indicating\nthe potential of intrinsic human values as the alignment goal for enhanced\nLLMs. Based on such results, we further discuss the challenges of achieving\nsuch intrinsic value alignment and provide a collection of available resources\nfor future research on the alignment of big models.\n","authors":["Jing Yao","Xiaoyuan Yi","Xiting Wang","Jindong Wang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.12014v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12008v1","updated":"2023-08-23T08:54:05Z","published":"2023-08-23T08:54:05Z","title":"Graecia capta ferum victorem cepit. Detecting Latin Allusions to Ancient\n Greek Literature","summary":" Intertextual allusions hold a pivotal role in Classical Philology, with Latin\nauthors frequently referencing Ancient Greek texts. Until now, the automatic\nidentification of these intertextual references has been constrained to\nmonolingual approaches, seeking parallels solely within Latin or Greek texts.\nIn this study, we introduce SPhilBERTa, a trilingual Sentence-RoBERTa model\ntailored for Classical Philology, which excels at cross-lingual semantic\ncomprehension and identification of identical sentences across Ancient Greek,\nLatin, and English. We generate new training data by automatically translating\nEnglish texts into Ancient Greek. Further, we present a case study,\ndemonstrating SPhilBERTa's capability to facilitate automated detection of\nintertextual parallels. Our models and resources are available at\nhttps://github.com/Heidelberg-NLP/ancient-language-models.\n","authors":["Frederick Riemenschneider","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2308.12008v1.pdf","comment":"Paper accepted for publication at the First Workshop on Ancient\n Language Processing (ALP) 2023; 9 pages, 5 tables"},{"id":"http://arxiv.org/abs/2308.11995v1","updated":"2023-08-23T08:33:14Z","published":"2023-08-23T08:33:14Z","title":"Topical-Chat: Towards Knowledge-Grounded Open-Domain Conversations","summary":" Building socialbots that can have deep, engaging open-domain conversations\nwith humans is one of the grand challenges of artificial intelligence (AI). To\nthis end, bots need to be able to leverage world knowledge spanning several\ndomains effectively when conversing with humans who have their own world\nknowledge. Existing knowledge-grounded conversation datasets are primarily\nstylized with explicit roles for conversation partners. These datasets also do\nnot explore depth or breadth of topical coverage with transitions in\nconversations. We introduce Topical-Chat, a knowledge-grounded human-human\nconversation dataset where the underlying knowledge spans 8 broad topics and\nconversation partners don't have explicitly defined roles, to help further\nresearch in open-domain conversational AI. We also train several\nstate-of-the-art encoder-decoder conversational models on Topical-Chat and\nperform automated and human evaluation for benchmarking.\n","authors":["Karthik Gopalakrishnan","Behnam Hedayatnia","Qinlang Chen","Anna Gottardi","Sanjeev Kwatra","Anu Venkatesh","Raefer Gabriel","Dilek Hakkani-Tur"],"pdf_url":"https://arxiv.org/pdf/2308.11995v1.pdf","comment":"arXiving an old paper accepted at INTERSPEECH 2019"},{"id":"http://arxiv.org/abs/2308.11971v1","updated":"2023-08-23T07:36:30Z","published":"2023-08-23T07:36:30Z","title":"EVE: Efficient Vision-Language Pre-training with Masked Prediction and\n Modality-Aware MoE","summary":" Building scalable vision-language models to learn from diverse, multimodal\ndata remains an open challenge. In this paper, we introduce an Efficient\nVision-languagE foundation model, namely EVE, which is one unified multimodal\nTransformer pre-trained solely by one unified pre-training task. Specifically,\nEVE encodes both vision and language within a shared Transformer network\nintegrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which\ncapture modality-specific information by selectively switching to different\nexperts. To unify pre-training tasks of vision and language, EVE performs\nmasked signal modeling on image-text pairs to reconstruct masked signals, i.e.,\nimage pixels and text tokens, given visible signals. This simple yet effective\npre-training objective accelerates training by 3.5x compared to the model\npre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing\nto the combination of the unified architecture and pre-training task, EVE is\neasy to scale up, enabling better downstream performance with fewer resources\nand faster training speed. Despite its simplicity, EVE achieves\nstate-of-the-art performance on various vision-language downstream tasks,\nincluding visual question answering, visual reasoning, and image-text\nretrieval.\n","authors":["Junyi Chen","Longteng Guo","Jia Sun","Shuai Shao","Zehuan Yuan","Liang Lin","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06857v2","updated":"2023-08-23T07:06:53Z","published":"2023-07-11T17:51:48Z","title":"Self-consistency for open-ended generations","summary":" Large Language Models (LLMs) can exhibit considerable variation in the\nquality of their sampled outputs. Reranking and selecting the best generation\nfrom the sampled set is a popular way of obtaining strong gains in generation\nquality. In this paper, we present a novel approach for reranking LLM\ngenerations. Unlike other techniques that might involve additional inferences\nor training a specialized reranker, our approach relies on easy to compute\npairwise statistics between the generations that have minimal compute overhead.\nWe show that our approach can be formalized as an extension of self-consistency\nand analyze its performance in that framework, theoretically as well as via\nsimulations. We show strong improvements for selecting the best $k$ generations\nfor code generation tasks as well as robust improvements for best generation\nfor the tasks of autoformalization, and summarization. While our approach only\nassumes black-box access to LLMs, we show that additional access to token\nprobabilities can improve performance even further.\n","authors":["Siddhartha Jain","Xiaofei Ma","Anoop Deoras","Bing Xiang"],"pdf_url":"https://arxiv.org/pdf/2307.06857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11940v1","updated":"2023-08-23T06:21:46Z","published":"2023-08-23T06:21:46Z","title":"Audio Generation with Multiple Conditional Diffusion Model","summary":" Text-based audio generation models have limitations as they cannot encompass\nall the information in audio, leading to restricted controllability when\nrelying solely on text. To address this issue, we propose a novel model that\nenhances the controllability of existing pre-trained text-to-audio models by\nincorporating additional conditions including content (timestamp) and style\n(pitch contour and energy contour) as supplements to the text. This approach\nachieves fine-grained control over the temporal order, pitch, and energy of\ngenerated audio. To preserve the diversity of generation, we employ a trainable\ncontrol condition encoder that is enhanced by a large language model and a\ntrainable Fusion-Net to encode and fuse the additional conditions while keeping\nthe weights of the pre-trained text-to-audio model frozen. Due to the lack of\nsuitable datasets and evaluation metrics, we consolidate existing datasets into\na new dataset comprising the audio and corresponding conditions and use a\nseries of evaluation metrics to evaluate the controllability performance.\nExperimental results demonstrate that our model successfully achieves\nfine-grained control to accomplish controllable audio generation. Audio samples\nand our dataset are publicly available at\nhttps://conditionaudiogen.github.io/conditionaudiogen/\n","authors":["Zhifang Guo","Jianguo Mao","Rui Tao","Long Yan","Kazushige Ouchi","Hong Liu","Xiangdong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11940v1.pdf","comment":"Submitted to AAAI 2024"},{"id":"http://arxiv.org/abs/2206.08955v3","updated":"2023-08-23T05:34:42Z","published":"2022-06-17T18:11:34Z","title":"Making first order linear logic a generating grammar","summary":" It is known that different categorial grammars have surface representation in\na fragment of first order multiplicative linear logic (MLL1). We show that the\nfragment of interest is equivalent to the recently introduced extended tensor\ntype calculus (ETTC). ETTC is a calculus of specific typed terms, which\nrepresent tuples of strings, more precisely bipartite graphs decorated with\nstrings. Types are derived from linear logic formulas, and rules correspond to\nconcrete operations on these string-labeled graphs, so that they can be\nconveniently visualized. This provides the above mentioned fragment of MLL1\nthat is relevant for language modeling not only with some alternative syntax\nand intuitive geometric representation, but also with an intrinsic deductive\nsystem, which has been absent.\n In this work we consider a non-trivial notationally enriched variation of the\npreviously introduced {\\bf ETTC}, which allows more concise and transparent\ncomputations. We present both a cut-free sequent calculus and a natural\ndeduction formalism.\n","authors":["Sergey Slavnov"],"pdf_url":"https://arxiv.org/pdf/2206.08955v3.pdf","comment":"Revised and extended version with detailed proofs. arXiv admin note:\n substantial text overlap with arXiv:2112.15253"},{"id":"http://arxiv.org/abs/2205.03977v3","updated":"2023-08-23T05:18:04Z","published":"2022-05-08T23:58:40Z","title":"A Structured Span Selector","summary":" Many natural language processing tasks, e.g., coreference resolution and\nsemantic role labeling, require selecting text spans and making decisions about\nthem. A typical approach to such tasks is to score all possible spans and\ngreedily select spans for task-specific downstream processing. This approach,\nhowever, does not incorporate any inductive bias about what sort of spans ought\nto be selected, e.g., that selected spans tend to be syntactic constituents. In\nthis paper, we propose a novel grammar-based structured span selection model\nwhich learns to make use of the partial span-level annotation provided for such\nproblems. Compared to previous approaches, our approach gets rid of the\nheuristic greedy span selection scheme, allowing us to model the downstream\ntask on an optimal set of spans. We evaluate our model on two popular span\nprediction tasks: coreference resolution and semantic role labeling. We show\nempirical improvements on both.\n","authors":["Tianyu Liu","Yuchen Eleanor Jiang","Ryan Cotterell","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2205.03977v3.pdf","comment":"NAACL 2022 camera-ready"},{"id":"http://arxiv.org/abs/2308.11923v1","updated":"2023-08-23T05:13:25Z","published":"2023-08-23T05:13:25Z","title":"Audio Difference Captioning Utilizing Similarity-Discrepancy\n Disentanglement","summary":" We proposed Audio Difference Captioning (ADC) as a new extension task of\naudio captioning for describing the semantic differences between input pairs of\nsimilar but slightly different audio clips. The ADC solves the problem that\nconventional audio captioning sometimes generates similar captions for similar\naudio clips, failing to describe the difference in content. We also propose a\ncross-attention-concentrated transformer encoder to extract differences by\ncomparing a pair of audio clips and a similarity-discrepancy disentanglement to\nemphasize the difference in the latent space. To evaluate the proposed methods,\nwe built an AudioDiffCaps dataset consisting of pairs of similar but slightly\ndifferent audio clips with human-annotated descriptions of their differences.\nThe experiment with the AudioDiffCaps dataset showed that the proposed methods\nsolve the ADC task effectively and improve the attention weights to extract the\ndifference by visualizing them in the transformer encoder.\n","authors":["Daiki Takeuchi","Yasunori Ohishi","Daisuke Niizumi","Noboru Harada","Kunio Kashino"],"pdf_url":"https://arxiv.org/pdf/2308.11923v1.pdf","comment":"Accepted to DCASE2023 Workshop"},{"id":"http://arxiv.org/abs/2306.14122v3","updated":"2023-08-23T05:04:58Z","published":"2023-06-25T04:33:56Z","title":"Chain-of-Thought Prompt Distillation for Multimodal Named Entity\n Recognition and Multimodal Relation Extraction","summary":" Multimodal Named Entity Recognition (MNER) and Multimodal Relation Extraction\n(MRE) necessitate the fundamental reasoning capacity for intricate linguistic\nand multimodal comprehension. In this study, we explore distilling the\nreasoning ability of large language models (LLMs) into a more compact student\nmodel by generating a \\textit{chain of thought} (CoT) -- a sequence of\nintermediate reasoning steps. Specifically, we commence by exemplifying the\nelicitation of such reasoning ability from LLMs through CoT prompts covering\nmulti-grain (noun, sentence, multimodality) and data-augmentation (style,\nentity, image) dimensions. Subsequently, we present a novel conditional prompt\ndistillation method to assimilate the commonsense reasoning ability from LLMs,\nthereby enhancing the utility of the student model in addressing text-only\ninputs without the requisite addition of image and CoT knowledge. Extensive\nexperiments reveal that our approach attains state-of-the-art accuracy and\nmanifests a plethora of advantages concerning interpretability, data\nefficiency, and cross-domain generalization on MNER and MRE datasets.\n","authors":["Feng Chen","Yujian Feng"],"pdf_url":"https://arxiv.org/pdf/2306.14122v3.pdf","comment":"modification"},{"id":"http://arxiv.org/abs/2308.11891v1","updated":"2023-08-23T03:38:21Z","published":"2023-08-23T03:38:21Z","title":"Bridging the Gap: Deciphering Tabular Data Using Large Language Model","summary":" In the realm of natural language processing, the understanding of tabular\ndata has perpetually stood as a focal point of scholarly inquiry. The emergence\nof expansive language models, exemplified by the likes of ChatGPT, has ushered\nin a wave of endeavors wherein researchers aim to harness these models for\ntasks related to table-based question answering. Central to our investigative\npursuits is the elucidation of methodologies that amplify the aptitude of such\nlarge language models in discerning both the structural intricacies and\ninherent content of tables, ultimately facilitating their capacity to provide\ninformed responses to pertinent queries. To this end, we have architected a\ndistinctive module dedicated to the serialization of tables for seamless\nintegration with expansive language models. Additionally, we've instituted a\ncorrective mechanism within the model to rectify potential inaccuracies.\nExperimental results indicate that, although our proposed method trails the\nSOTA by approximately 11.7% in overall metrics, it surpasses the SOTA by about\n1.2% in tests on specific datasets. This research marks the first application\nof large language models to table-based question answering tasks, enhancing the\nmodel's comprehension of both table structures and content.\n","authors":["Hengyuan Zhang","Peng Chang","Zongcheng Ji"],"pdf_url":"https://arxiv.org/pdf/2308.11891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08239v2","updated":"2023-08-23T03:37:04Z","published":"2023-08-16T09:15:18Z","title":"MemoChat: Tuning LLMs to Use Memos for Consistent Long-Range Open-Domain\n Conversation","summary":" We propose MemoChat, a pipeline for refining instructions that enables large\nlanguage models (LLMs) to effectively employ self-composed memos for\nmaintaining consistent long-range open-domain conversations. We demonstrate a\nlong-range open-domain conversation through iterative\n\"memorization-retrieval-response\" cycles. This requires us to carefully design\ntailored tuning instructions for each distinct stage. The instructions are\nreconstructed from a collection of public datasets to teach the LLMs to\nmemorize and retrieve past dialogues with structured memos, leading to enhanced\nconsistency when participating in future conversations. We invite experts to\nmanually annotate a test set designed to evaluate the consistency of long-range\nconversations questions. Experiments on three testing scenarios involving both\nopen-source and API-accessible chatbots at scale verify the efficacy of\nMemoChat, which outperforms strong baselines. Our codes, data and models are\navailable here: https://github.com/LuJunru/MemoChat.\n","authors":["Junru Lu","Siyu An","Mingbao Lin","Gabriele Pergola","Yulan He","Di Yin","Xing Sun","Yunsheng Wu"],"pdf_url":"https://arxiv.org/pdf/2308.08239v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16680v4","updated":"2023-08-23T03:28:30Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v4.pdf","comment":"Draft Version"},{"id":"http://arxiv.org/abs/2308.11878v1","updated":"2023-08-23T02:49:35Z","published":"2023-08-23T02:49:35Z","title":"Cabrita: closing the gap for foreign languages","summary":" The strategy of training the model from scratch in a specific language or\ndomain serves two essential purposes: i) enhancing performance in the\nparticular linguistic or domain context, and ii) ensuring effective\ntokenization. The main limitation inherent to this approach lies in the\nassociated cost, which can reach six to seven-digit dollar values, depending on\nthe model size and the number of parameters involved.\n The main solution to overcome the cost challenge is to rely on available\npre-trained models, which, despite recent advancements such as the LLaMA and\nLLaMA-2 models, still demonstrate inefficiency for certain specific domain\nproblems or prove ineffective in scenarios involving conversational memory\nresources, given the large number of tokens required to represent text.\n To overcome this issue, we present a methodology named Cabrita, which, as our\nresearch demonstrates, successfully addresses the performance and efficient\ntokenization problem, all at an affordable cost. We believe that this\nmethodology can be applied to any transformer-like architecture model. To\nvalidate the study, we conducted continuous pre-training exclusively using\nPortuguese text on a 3-billion-parameter model known as OpenLLaMA, resulting in\na model named openCabrita 3B. The openCabrita 3B also features a new tokenizer\nthat results in a significant reduction in the number of tokens required to\nrepresent the text. In our assessment, for few-shot learning tasks, we achieved\nsimilar results with this 3B model compared to a traditional continuous\npre-training approach as well as to 7B models English pre-trained models.\n","authors":["Celio Larcher","Marcos Piau","Paulo Finardi","Pedro Gengo","Piero Esposito","Vinicius Caridá"],"pdf_url":"https://arxiv.org/pdf/2308.11878v1.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2301.11004v5","updated":"2023-08-23T01:15:04Z","published":"2023-01-26T09:26:01Z","title":"NLP as a Lens for Causal Analysis and Perception Mining to Infer Mental\n Health on Social Media","summary":" Interactions among humans on social media often convey intentions behind\ntheir actions, yielding a psychological language resource for Mental Health\nAnalysis (MHA) of online users. The success of Computational Intelligence\nTechniques (CIT) for inferring mental illness from such social media resources\npoints to NLP as a lens for causal analysis and perception mining. However, we\nargue that more consequential and explainable research is required for optimal\nimpact on clinical psychology practice and personalized mental healthcare. To\nbridge this gap, we posit two significant dimensions: (1) Causal analysis to\nillustrate a cause and effect relationship in the user generated text; (2)\nPerception mining to infer psychological perspectives of social effects on\nonline users intentions. Within the scope of Natural Language Processing (NLP),\nwe further explore critical areas of inquiry associated with these two\ndimensions, specifically through recent advancements in discourse analysis.\nThis position paper guides the community to explore solutions in this space and\nadvance the state of practice in developing conversational agents for inferring\nmental health from social media. We advocate for a more explainable approach\ntoward modeling computational psychology problems through the lens of language\nas we observe an increased number of research contributions in dataset and\nproblem formulation for causal relation extraction and perception enhancements\nwhile inferring mental states.\n","authors":["Muskan Garg","Chandni Saxena","Usman Naseem","Bonnie J Dorr"],"pdf_url":"https://arxiv.org/pdf/2301.11004v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10380v2","updated":"2023-08-23T00:52:13Z","published":"2023-08-20T22:42:04Z","title":"A Human-on-the-Loop Optimization Autoformalism Approach for\n Sustainability","summary":" This paper outlines a natural conversational approach to solving personalized\nenergy-related problems using large language models (LLMs). We focus on\ncustomizable optimization problems that necessitate repeated solving with\nslight variations in modeling and are user-specific, hence posing a challenge\nto devising a one-size-fits-all model. We put forward a strategy that augments\nan LLM with an optimization solver, enhancing its proficiency in understanding\nand responding to user specifications and preferences while providing nonlinear\nreasoning capabilities. Our approach pioneers the novel concept of human-guided\noptimization autoformalism, translating a natural language task specification\nautomatically into an optimization instance. This enables LLMs to analyze,\nexplain, and tackle a variety of instance-specific energy-related problems,\npushing beyond the limits of current prompt-based techniques.\n Our research encompasses various commonplace tasks in the energy sector, from\nelectric vehicle charging and Heating, Ventilation, and Air Conditioning (HVAC)\ncontrol to long-term planning problems such as cost-benefit evaluations for\ninstalling rooftop solar photovoltaics (PVs) or heat pumps. This pilot study\nmarks an essential stride towards the context-based formulation of optimization\nusing LLMs, with the potential to democratize optimization processes. As a\nresult, stakeholders are empowered to optimize their energy consumption,\npromoting sustainable energy practices customized to personal needs and\npreferences.\n","authors":["Ming Jin","Bilgehan Sel","Fnu Hardeep","Wotao Yin"],"pdf_url":"https://arxiv.org/pdf/2308.10380v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12466v1","updated":"2023-08-23T23:16:35Z","published":"2023-08-23T23:16:35Z","title":"Are ChatGPT and GPT-4 Good Poker Players? -- A Pre-Flop Analysis","summary":" Since the introduction of ChatGPT and GPT-4, these models have been tested\nacross a large number of tasks. Their adeptness across domains is evident, but\ntheir aptitude in playing games and specifically their aptitude in the realm of\npoker has remained unexplored. Poker is a game that requires decision making\nunder uncertainty and incomplete information. In this paper, we put ChatGPT and\nGPT-4 through the poker test and evaluate their poker skills. Our findings\nreveal that while both models display an advanced understanding of poker,\nencompassing concepts like the valuation of starting hands, playing positions\nand other intricacies of game theory optimal (GTO) poker, both ChatGPT and\nGPT-4 are NOT game theory optimal poker players.\n Through a series of experiments, we first discover the characteristics of\noptimal prompts and model parameters for playing poker with these models. Our\nobservations then unveil the distinct playing personas of the two models. We\nfirst conclude that GPT-4 is a more advanced poker player than ChatGPT. This\nexploration then sheds light on the divergent poker tactics of the two models:\nChatGPT's conservativeness juxtaposed against GPT-4's aggression. In poker\nvernacular, when tasked to play GTO poker, ChatGPT plays like a Nit, which\nmeans that it has a propensity to only engage with premium hands and folds a\nmajority of hands. When subjected to the same directive, GPT-4 plays like a\nmaniac, showcasing a loose and aggressive style of play. Both strategies,\nalthough relatively advanced, are not game theory optimal.\n","authors":["Akshat Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.12466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11596v2","updated":"2023-08-23T21:02:01Z","published":"2023-08-22T17:44:18Z","title":"SeamlessM4T-Massively Multilingual & Multimodal Machine Translation","summary":" What does it take to create the Babel Fish, a tool that can help individuals\ntranslate speech between any two languages? While recent breakthroughs in\ntext-based models have pushed machine translation coverage beyond 200\nlanguages, unified speech-to-speech translation models have yet to achieve\nsimilar strides. More specifically, conventional speech-to-speech translation\nsystems rely on cascaded systems that perform translation progressively,\nputting high-performing unified systems out of reach. To address these gaps, we\nintroduce SeamlessM4T, a single model that supports speech-to-speech\ntranslation, speech-to-text translation, text-to-speech translation,\ntext-to-text translation, and automatic speech recognition for up to 100\nlanguages. To build this, we used 1 million hours of open speech audio data to\nlearn self-supervised speech representations with w2v-BERT 2.0. Subsequently,\nwe created a multimodal corpus of automatically aligned speech translations.\nFiltered and combined with human-labeled and pseudo-labeled data, we developed\nthe first multilingual system capable of translating from and into English for\nboth speech and text. On FLEURS, SeamlessM4T sets a new standard for\ntranslations into multiple target languages, achieving an improvement of 20%\nBLEU over the previous SOTA in direct speech-to-text translation. Compared to\nstrong cascaded models, SeamlessM4T improves the quality of into-English\ntranslation by 1.3 BLEU points in speech-to-text and by 2.6 ASR-BLEU points in\nspeech-to-speech. Tested for robustness, our system performs better against\nbackground noises and speaker variations in speech-to-text tasks compared to\nthe current SOTA model. Critically, we evaluated SeamlessM4T on gender bias and\nadded toxicity to assess translation safety. Finally, all contributions in this\nwork are open-sourced and accessible at\nhttps://github.com/facebookresearch/seamless_communication\n","authors":["Seamless Communication","Loïc Barrault","Yu-An Chung","Mariano Cora Meglioli","David Dale","Ning Dong","Paul-Ambroise Duquenne","Hady Elsahar","Hongyu Gong","Kevin Heffernan","John Hoffman","Christopher Klaiber","Pengwei Li","Daniel Licht","Jean Maillard","Alice Rakotoarison","Kaushik Ram Sadagopan","Guillaume Wenzek","Ethan Ye","Bapi Akula","Peng-Jen Chen","Naji El Hachem","Brian Ellis","Gabriel Mejia Gonzalez","Justin Haaheim","Prangthip Hansanti","Russ Howes","Bernie Huang","Min-Jae Hwang","Hirofumi Inaguma","Somya Jain","Elahe Kalbassi","Amanda Kallet","Ilia Kulikov","Janice Lam","Daniel Li","Xutai Ma","Ruslan Mavlyutov","Benjamin Peloquin","Mohamed Ramadan","Abinesh Ramakrishnan","Anna Sun","Kevin Tran","Tuan Tran","Igor Tufanov","Vish Vogeti","Carleigh Wood","Yilin Yang","Bokai Yu","Pierre Andrews","Can Balioglu","Marta R. Costa-jussà","Onur Celebi","Maha Elbayad","Cynthia Gao","Francisco Guzmán","Justine Kao","Ann Lee","Alexandre Mourachko","Juan Pino","Sravya Popuri","Christophe Ropers","Safiyyah Saleem","Holger Schwenk","Paden Tomasello","Changhan Wang","Jeff Wang","Skyler Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12420v1","updated":"2023-08-23T20:42:32Z","published":"2023-08-23T20:42:32Z","title":"Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature","summary":" Distributed Ledger Technologies (DLTs) have rapidly evolved, necessitating\ncomprehensive insights into their diverse components. However, a systematic\nliterature review that emphasizes the Environmental, Sustainability, and\nGovernance (ESG) components of DLT remains lacking. To bridge this gap, we\nselected 107 seed papers to build a citation network of 63,083 references and\nrefined it to a corpus of 24,539 publications for analysis. Then, we labeled\nthe named entities in 46 papers according to twelve top-level categories\nderived from an established technology taxonomy and enhanced the taxonomy by\npinpointing DLT's ESG elements. Leveraging transformer-based language models,\nwe fine-tuned a pre-trained language model for a Named Entity Recognition (NER)\ntask using our labeled dataset. We used our fine-tuned language model to\ndistill the corpus to 505 key papers, facilitating a literature review via\nnamed entities and temporal graph analysis on DLT evolution in the context of\nESG. Our contributions are a methodology to conduct a machine learning-driven\nsystematic literature review in the DLT field, placing a special emphasis on\nESG aspects. Furthermore, we present a first-of-its-kind NER dataset, composed\nof 54,808 named entities, designed for DLT and ESG-related explorations.\n","authors":["Walter Hernandez","Kamil Tylinski","Alastair Moore","Niall Roche","Nikhil Vadgama","Horst Treiblmaier","Jiangbo Shangguan","Paolo Tasca","Jiahua Xu"],"pdf_url":"https://arxiv.org/pdf/2308.12420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12419v1","updated":"2023-08-23T20:38:19Z","published":"2023-08-23T20:38:19Z","title":"Toward American Sign Language Processing in the Real World: Data, Tasks,\n and Methods","summary":" Sign language, which conveys meaning through gestures, is the chief means of\ncommunication among deaf people. Recognizing sign language in natural settings\npresents significant challenges due to factors such as lighting, background\nclutter, and variations in signer characteristics. In this thesis, I study\nautomatic sign language processing in the wild, using signing videos collected\nfrom the Internet. This thesis contributes new datasets, tasks, and methods.\nMost chapters of this thesis address tasks related to fingerspelling, an\nimportant component of sign language and yet has not been studied widely by\nprior work. I present three new large-scale ASL datasets in the wild:\nChicagoFSWild, ChicagoFSWild+, and OpenASL. Using ChicagoFSWild and\nChicagoFSWild+, I address fingerspelling recognition, which consists of\ntranscribing fingerspelling sequences into text. I propose an end-to-end\napproach based on iterative attention that allows recognition from a raw video\nwithout explicit hand detection. I further show that using a Conformer-based\nnetwork jointly modeling handshape and mouthing can bring performance close to\nthat of humans. Next, I propose two tasks for building real-world\nfingerspelling-based applications: fingerspelling detection and search. For\nfingerspelling detection, I introduce a suite of evaluation metrics and a new\ndetection model via multi-task training. To address the problem of searching\nfor fingerspelled keywords in raw sign language videos, we propose a novel\nmethod that jointly localizes and matches fingerspelling segments to text.\nFinally, I will describe a benchmark for large-vocabulary open-domain sign\nlanguage translation based on OpenASL. To address the challenges of sign\nlanguage translation in realistic settings, we propose a set of techniques\nincluding sign search as a pretext task for pre-training and fusion of mouthing\nand handshape features.\n","authors":["Bowen Shi"],"pdf_url":"https://arxiv.org/pdf/2308.12419v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2308.12383v1","updated":"2023-08-23T18:53:00Z","published":"2023-08-23T18:53:00Z","title":"With a Little Help from your own Past: Prototypical Memory Networks for\n Image Captioning","summary":" Image captioning, like many tasks involving vision and language, currently\nrelies on Transformer-based architectures for extracting the semantics in an\nimage and translating it into linguistically coherent descriptions. Although\nsuccessful, the attention operator only considers a weighted summation of\nprojections of the current input sample, therefore ignoring the relevant\nsemantic information which can come from the joint observation of other\nsamples. In this paper, we devise a network which can perform attention over\nactivations obtained while processing other training samples, through a\nprototypical memory model. Our memory models the distribution of past keys and\nvalues through the definition of prototype vectors which are both\ndiscriminative and compact. Experimentally, we assess the performance of the\nproposed model on the COCO dataset, in comparison with carefully designed\nbaselines and state-of-the-art approaches, and by investigating the role of\neach of the proposed components. We demonstrate that our proposal can increase\nthe performance of an encoder-decoder Transformer by 3.7 CIDEr points both when\ntraining in cross-entropy only and when fine-tuning with self-critical sequence\ntraining. Source code and trained models are available at:\nhttps://github.com/aimagelab/PMA-Net.\n","authors":["Manuele Barraco","Sara Sarto","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2308.12383v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12372v1","updated":"2023-08-23T18:40:48Z","published":"2023-08-23T18:40:48Z","title":"Vision Transformer Adapters for Generalizable Multitask Learning","summary":" We introduce the first multitasking vision transformer adapters that learn\ngeneralizable task affinities which can be applied to novel tasks and domains.\nIntegrated into an off-the-shelf vision transformer backbone, our adapters can\nsimultaneously solve multiple dense vision tasks in a parameter-efficient\nmanner, unlike existing multitasking transformers that are parametrically\nexpensive. In contrast to concurrent methods, we do not require retraining or\nfine-tuning whenever a new task or domain is added. We introduce a task-adapted\nattention mechanism within our adapter framework that combines gradient-based\ntask similarities with attention-based ones. The learned task affinities\ngeneralize to the following settings: zero-shot task transfer, unsupervised\ndomain adaptation, and generalization without fine-tuning to novel domains. We\ndemonstrate that our approach outperforms not only the existing convolutional\nneural network-based multitasking methods but also the vision transformer-based\nones. Our project page is at \\url{https://ivrl.github.io/VTAGML}.\n","authors":["Deblina Bhattacharjee","Sabine Süsstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2308.12372v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12923v1","updated":"2023-08-23T04:34:05Z","published":"2023-08-23T04:34:05Z","title":"Diagnosing Infeasible Optimization Problems Using Large Language Models","summary":" Decision-making problems can be represented as mathematical optimization\nmodels, finding wide applications in fields such as economics, engineering and\nmanufacturing, transportation, and health care. Optimization models are\nmathematical abstractions of the problem of making the best decision while\nsatisfying a set of requirements or constraints. One of the primary barriers to\ndeploying these models in practice is the challenge of helping practitioners\nunderstand and interpret such models, particularly when they are infeasible,\nmeaning no decision satisfies all the constraints. Existing methods for\ndiagnosing infeasible optimization models often rely on expert systems,\nnecessitating significant background knowledge in optimization. In this paper,\nwe introduce OptiChat, a first-of-its-kind natural language-based system\nequipped with a chatbot GUI for engaging in interactive conversations about\ninfeasible optimization models. OptiChat can provide natural language\ndescriptions of the optimization model itself, identify potential sources of\ninfeasibility, and offer suggestions to make the model feasible. The\nimplementation of OptiChat is built on GPT-4, which interfaces with an\noptimization solver to identify the minimal subset of constraints that render\nthe entire optimization problem infeasible, also known as the Irreducible\nInfeasible Subset (IIS). We utilize few-shot learning, expert chain-of-thought,\nkey-retrieve, and sentiment prompts to enhance OptiChat's reliability. Our\nexperiments demonstrate that OptiChat assists both expert and non-expert users\nin improving their understanding of the optimization models, enabling them to\nquickly identify the sources of infeasibility.\n","authors":["Hao Chen","Gonzalo E. Constante-Flores","Can Li"],"pdf_url":"https://arxiv.org/pdf/2308.12923v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2212.08663v2","updated":"2023-08-23T17:59:57Z","published":"2022-12-19T18:59:57Z","title":"Randomized Quantization: A Generic Augmentation for Data Agnostic\n Self-supervised Learning","summary":" Self-supervised representation learning follows a paradigm of withholding\nsome part of the data and tasking the network to predict it from the remaining\npart. Among many techniques, data augmentation lies at the core for creating\nthe information gap. Towards this end, masking has emerged as a generic and\npowerful tool where content is withheld along the sequential dimension, e.g.,\nspatial in images, temporal in audio, and syntactic in language. In this paper,\nwe explore the orthogonal channel dimension for generic data augmentation by\nexploiting precision redundancy. The data for each channel is quantized through\na non-uniform quantizer, with the quantized value sampled randomly within\nrandomly sampled quantization bins. From another perspective, quantization is\nanalogous to channel-wise masking, as it removes the information within each\nbin, but preserves the information across bins. Our approach significantly\nsurpasses existing generic data augmentation methods, while showing on par\nperformance against modality-specific augmentations. We comprehensively\nevaluate our approach on vision, audio, 3D point clouds, as well as the DABS\nbenchmark which is comprised of various data modalities. The code is available\nat https: //github.com/microsoft/random_quantize.\n","authors":["Huimin Wu","Chenyang Lei","Xiao Sun","Peng-Shuai Wang","Qifeng Chen","Kwang-Ting Cheng","Stephen Lin","Zhirong Wu"],"pdf_url":"https://arxiv.org/pdf/2212.08663v2.pdf","comment":"Accepted by ICCV 2023. The code is available at https:\n //github.com/microsoft/random_quantize"},{"id":"http://arxiv.org/abs/2308.12288v1","updated":"2023-08-23T17:59:11Z","published":"2023-08-23T17:59:11Z","title":"CHORUS: Learning Canonicalized 3D Human-Object Spatial Relations from\n Unbounded Synthesized Images","summary":" We present a method for teaching machines to understand and model the\nunderlying spatial common sense of diverse human-object interactions in 3D in a\nself-supervised way. This is a challenging task, as there exist specific\nmanifolds of the interactions that can be considered human-like and natural,\nbut the human pose and the geometry of objects can vary even for similar\ninteractions. Such diversity makes the annotating task of 3D interactions\ndifficult and hard to scale, which limits the potential to reason about that in\na supervised way. One way of learning the 3D spatial relationship between\nhumans and objects during interaction is by showing multiple 2D images captured\nfrom different viewpoints when humans interact with the same type of objects.\nThe core idea of our method is to leverage a generative model that produces\nhigh-quality 2D images from an arbitrary text prompt input as an \"unbounded\"\ndata generator with effective controllability and view diversity. Despite its\nimperfection of the image quality over real images, we demonstrate that the\nsynthesized images are sufficient to learn the 3D human-object spatial\nrelations. We present multiple strategies to leverage the synthesized images,\nincluding (1) the first method to leverage a generative image model for 3D\nhuman-object spatial relation learning; (2) a framework to reason about the 3D\nspatial relations from inconsistent 2D cues in a self-supervised manner via 3D\noccupancy reasoning with pose canonicalization; (3) semantic clustering to\ndisambiguate different types of interactions with the same object types; and\n(4) a novel metric to assess the quality of 3D spatial learning of interaction.\nProject Page: https://jellyheadandrew.github.io/projects/chorus\n","authors":["Sookwan Han","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2308.12288v1.pdf","comment":"Accepted to ICCV 2023 (Oral Presentation). Project Page:\n https://jellyheadandrew.github.io/projects/chorus"},{"id":"http://arxiv.org/abs/2307.03833v2","updated":"2023-08-23T17:40:11Z","published":"2023-07-07T21:03:18Z","title":"Back to Optimization: Diffusion-based Zero-Shot 3D Human Pose Estimation","summary":" Learning-based methods have dominated the 3D human pose estimation (HPE)\ntasks with significantly better performance in most benchmarks than traditional\noptimization-based methods. Nonetheless, 3D HPE in the wild is still the\nbiggest challenge of learning-based models, whether with 2D-3D lifting,\nimage-to-3D, or diffusion-based methods, since the trained networks implicitly\nlearn camera intrinsic parameters and domain-based 3D human pose distributions\nand estimate poses by statistical average. On the other hand, the\noptimization-based methods estimate results case-by-case, which can predict\nmore diverse and sophisticated human poses in the wild. By combining the\nadvantages of optimization-based and learning-based methods, we propose the\nZero-shot Diffusion-based Optimization (ZeDO) pipeline for 3D HPE to solve the\nproblem of cross-domain and in-the-wild 3D HPE. Our multi-hypothesis ZeDO\nachieves state-of-the-art (SOTA) performance on Human3.6M as minMPJPE $51.4$mm\nwithout training with any 2D-3D or image-3D pairs. Moreover, our\nsingle-hypothesis ZeDO achieves SOTA performance on 3DPW dataset with PA-MPJPE\n$42.6$mm on cross-dataset evaluation, which even outperforms learning-based\nmethods trained on 3DPW.\n","authors":["Zhongyu Jiang","Zhuoran Zhou","Lei Li","Wenhao Chai","Cheng-Yen Yang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2307.03833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12271v1","updated":"2023-08-23T17:39:58Z","published":"2023-08-23T17:39:58Z","title":"A Generative Approach for Image Registration of Visible-Thermal (VT)\n Cancer Faces","summary":" Since thermal imagery offers a unique modality to investigate pain, the U.S.\nNational Institutes of Health (NIH) has collected a large and diverse set of\ncancer patient facial thermograms for AI-based pain research. However,\ndiffering angles from camera capture between thermal and visible sensors has\nled to misalignment between Visible-Thermal (VT) images. We modernize the\nclassic computer vision task of image registration by applying and modifying a\ngenerative alignment algorithm to register VT cancer faces, without the need\nfor a reference or alignment parameters. By registering VT faces, we\ndemonstrate that the quality of thermal images produced in the generative AI\ndownstream task of Visible-to-Thermal (V2T) image translation significantly\nimproves up to 52.5\\%, than without registration. Images in this paper have\nbeen approved by the NIH NCI for public dissemination.\n","authors":["Catherine Ordun","Alexandra Cha","Edward Raff","Sanjay Purushotham","Karen Kwok","Mason Rule","James Gulley"],"pdf_url":"https://arxiv.org/pdf/2308.12271v1.pdf","comment":"2nd Annual Artificial Intelligence over Infrared Images for Medical\n Applications Workshop (AIIIMA) at the 26th International Conference on\n Medical Image Computing and Computer Assisted Intervention (MICCAI 2023)"},{"id":"http://arxiv.org/abs/2110.03006v4","updated":"2023-08-23T16:47:25Z","published":"2021-10-06T18:25:50Z","title":"Unsupervised Selective Labeling for More Effective Semi-Supervised\n Learning","summary":" Given an unlabeled dataset and an annotation budget, we study how to\nselectively label a fixed number of instances so that semi-supervised learning\n(SSL) on such a partially labeled dataset is most effective. We focus on\nselecting the right data to label, in addition to usual SSL's propagating\nlabels from labeled data to the rest unlabeled data. This instance selection\ntask is challenging, as without any labeled data we do not know what the\nobjective of learning should be. Intuitively, no matter what the downstream\ntask is, instances to be labeled must be representative and diverse: The former\nwould facilitate label propagation to unlabeled data, whereas the latter would\nensure coverage of the entire dataset. We capture this idea by selecting\ncluster prototypes, either in a pretrained feature space, or along with feature\noptimization, both without labels. Our unsupervised selective labeling\nconsistently improves SSL methods over state-of-the-art active learning given\nlabeled data, by 8 to 25 times in label efficiency. For example, it boosts\nFixMatch by 10% (14%) in accuracy on CIFAR-10 (ImageNet-1K) with 0.08% (0.2%)\nlabeled data, demonstrating that small computation spent on selecting what data\nto label brings significant gain especially under a low annotation budget. Our\nwork sets a new standard for practical and efficient SSL.\n","authors":["Xudong Wang","Long Lian","Stella X. Yu"],"pdf_url":"https://arxiv.org/pdf/2110.03006v4.pdf","comment":"Accepted by ECCV 2022; Fixed a few typos"},{"id":"http://arxiv.org/abs/2304.13014v2","updated":"2023-08-23T16:28:52Z","published":"2023-04-25T17:38:41Z","title":"Methods and datasets for segmentation of minimally invasive surgical\n instruments in endoscopic images and videos: A review of the state of the art","summary":" In the field of computer- and robot-assisted minimally invasive surgery,\nenormous progress has been made in recent years based on the recognition of\nsurgical instruments in endoscopic images and videos. In particular, the\ndetermination of the position and type of instruments is of great interest.\nCurrent work involves both spatial and temporal information, with the idea that\npredicting the movement of surgical tools over time may improve the quality of\nthe final segmentations. The provision of publicly available datasets has\nrecently encouraged the development of new methods, mainly based on deep\nlearning. In this review, we identify and characterize datasets used for method\ndevelopment and evaluation and quantify their frequency of use in the\nliterature. We further present an overview of the current state of research\nregarding the segmentation and tracking of minimally invasive surgical\ninstruments in endoscopic images and videos. The paper focuses on methods that\nwork purely visually, without markers of any kind attached to the instruments,\nconsidering both single-frame semantic and instance segmentation approaches, as\nwell as those that incorporate temporal information. The publications analyzed\nwere identified through the platforms Google Scholar, Web of Science, and\nPubMed. The search terms used were \"instrument segmentation\", \"instrument\ntracking\", \"surgical tool segmentation\", and \"surgical tool tracking\",\nresulting in a total of 741 articles published between 01/2015 and 07/2023, of\nwhich 123 were included using systematic selection criteria. A discussion of\nthe reviewed literature is provided, highlighting existing shortcomings and\nemphasizing the available potential for future developments.\n","authors":["Tobias Rueckert","Daniel Rueckert","Christoph Palm"],"pdf_url":"https://arxiv.org/pdf/2304.13014v2.pdf","comment":"29 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.11489v2","updated":"2023-08-23T16:16:44Z","published":"2023-08-22T15:10:42Z","title":"Learning from Semantic Alignment between Unpaired Multiviews for\n Egocentric Video Recognition","summary":" We are concerned with a challenging scenario in unpaired multiview video\nlearning. In this case, the model aims to learn comprehensive multiview\nrepresentations while the cross-view semantic information exhibits variations.\nWe propose Semantics-based Unpaired Multiview Learning (SUM-L) to tackle this\nunpaired multiview learning problem. The key idea is to build cross-view\npseudo-pairs and do view-invariant alignment by leveraging the semantic\ninformation of videos. To facilitate the data efficiency of multiview learning,\nwe further perform video-text alignment for first-person and third-person\nvideos, to fully leverage the semantic knowledge to improve video\nrepresentations. Extensive experiments on multiple benchmark datasets verify\nthe effectiveness of our framework. Our method also outperforms multiple\nexisting view-alignment methods, under the more challenging scenario than\ntypical paired or unpaired multimodal or multiview learning. Our code is\navailable at https://github.com/wqtwjt1996/SUM-L.\n","authors":["Qitong Wang","Long Zhao","Liangzhe Yuan","Ting Liu","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2308.11489v2.pdf","comment":"Proceedings of IEEE International Conference on Computer Vision\n (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.12234v1","updated":"2023-08-23T16:16:11Z","published":"2023-08-23T16:16:11Z","title":"MolGrapher: Graph-based Visual Recognition of Chemical Structures","summary":" The automatic analysis of chemical literature has immense potential to\naccelerate the discovery of new materials and drugs. Much of the critical\ninformation in patent documents and scientific articles is contained in\nfigures, depicting the molecule structures. However, automatically parsing the\nexact chemical structure is a formidable challenge, due to the amount of\ndetailed information, the diversity of drawing styles, and the need for\ntraining data. In this work, we introduce MolGrapher to recognize chemical\nstructures visually. First, a deep keypoint detector detects the atoms. Second,\nwe treat all candidate atoms and bonds as nodes and put them in a graph. This\nconstruct allows a natural graph representation of the molecule. Last, we\nclassify atom and bond nodes in the graph with a Graph Neural Network. To\naddress the lack of real training data, we propose a synthetic data generation\npipeline producing diverse and realistic results. In addition, we introduce a\nlarge-scale benchmark of annotated real molecule images, USPTO-30K, to spur\nresearch on this critical topic. Extensive experiments on five datasets show\nthat our approach significantly outperforms classical and learning-based\nmethods in most settings. Code, models, and datasets are available.\n","authors":["Lucas Morin","Martin Danelljan","Maria Isabel Agea","Ahmed Nassar","Valery Weber","Ingmar Meijer","Peter Staar","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12231v1","updated":"2023-08-23T16:13:58Z","published":"2023-08-23T16:13:58Z","title":"SPPNet: A Single-Point Prompt Network for Nuclei Image Segmentation","summary":" Image segmentation plays an essential role in nuclei image analysis.\nRecently, the segment anything model has made a significant breakthrough in\nsuch tasks. However, the current model exists two major issues for cell\nsegmentation: (1) the image encoder of the segment anything model involves a\nlarge number of parameters. Retraining or even fine-tuning the model still\nrequires expensive computational resources. (2) in point prompt mode, points\nare sampled from the center of the ground truth and more than one set of points\nis expected to achieve reliable performance, which is not efficient for\npractical applications. In this paper, a single-point prompt network is\nproposed for nuclei image segmentation, called SPPNet. We replace the original\nimage encoder with a lightweight vision transformer. Also, an effective\nconvolutional block is added in parallel to extract the low-level semantic\ninformation from the image and compensate for the performance degradation due\nto the small image encoder. We propose a new point-sampling method based on the\nGaussian kernel. The proposed model is evaluated on the MoNuSeg-2018 dataset.\nThe result demonstrated that SPPNet outperforms existing U-shape architectures\nand shows faster convergence in training. Compared to the segment anything\nmodel, SPPNet shows roughly 20 times faster inference, with 1/70 parameters and\ncomputational cost. Particularly, only one set of points is required in both\nthe training and inference phases, which is more reasonable for clinical\napplications. The code for our work and more technical details can be found at\nhttps://github.com/xq141839/SPPNet.\n","authors":["Qing Xu","Wenwei Kuang","Zeyu Zhang","Xueyao Bao","Haoran Chen","Wenting Duan"],"pdf_url":"https://arxiv.org/pdf/2308.12231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08913v2","updated":"2023-08-23T16:07:52Z","published":"2023-06-15T07:32:10Z","title":"Advancing Volumetric Medical Image Segmentation via Global-Local Masked\n Autoencoder","summary":" Masked autoencoder (MAE) is a promising self-supervised pre-training\ntechnique that can improve the representation learning of a neural network\nwithout human intervention. However, applying MAE directly to volumetric\nmedical images poses two challenges: (i) a lack of global information that is\ncrucial for understanding the clinical context of the holistic data, (ii) no\nguarantee of stabilizing the representations learned from randomly masked\ninputs. To address these limitations, we propose the\n\\textbf{G}lobal-\\textbf{L}ocal \\textbf{M}asked \\textbf{A}uto\\textbf{E}ncoder\n(GL-MAE), a simple yet effective self-supervised pre-training strategy. In\naddition to reconstructing masked local views, as in previous methods, GL-MAE\nincorporates global context learning by reconstructing masked global views.\nFurthermore, a complete global view is integrated as an anchor to guide the\nreconstruction and stabilize the learning process through global-to-global\nconsistency learning and global-to-local consistency learning. Finetuning\nresults on multiple datasets demonstrate the superiority of our method over\nother state-of-the-art self-supervised algorithms, highlighting its\neffectiveness on versatile volumetric medical image segmentation tasks, even\nwhen annotations are scarce. Our codes and models will be released upon\nacceptance.\n","authors":["Jia-Xin Zhuang","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12218v1","updated":"2023-08-23T15:56:26Z","published":"2023-08-23T15:56:26Z","title":"CIParsing: Unifying Causality Properties into Multiple Human Parsing","summary":" Existing methods of multiple human parsing (MHP) apply statistical models to\nacquire underlying associations between images and labeled body parts. However,\nacquired associations often contain many spurious correlations that degrade\nmodel generalization, leading statistical models to be vulnerable to visually\ncontextual variations in images (e.g., unseen image styles/external\ninterventions). To tackle this, we present a causality inspired parsing\nparadigm termed CIParsing, which follows fundamental causal principles\ninvolving two causal properties for human parsing (i.e., the causal diversity\nand the causal invariance). Specifically, we assume that an input image is\nconstructed by a mix of causal factors (the characteristics of body parts) and\nnon-causal factors (external contexts), where only the former ones cause the\ngeneration process of human parsing.Since causal/non-causal factors are\nunobservable, a human parser in proposed CIParsing is required to construct\nlatent representations of causal factors and learns to enforce representations\nto satisfy the causal properties. In this way, the human parser is able to rely\non causal factors w.r.t relevant evidence rather than non-causal factors w.r.t\nspurious correlations, thus alleviating model degradation and yielding improved\nparsing ability. Notably, the CIParsing is designed in a plug-and-play fashion\nand can be integrated into any existing MHP models. Extensive experiments\nconducted on two widely used benchmarks demonstrate the effectiveness and\ngeneralizability of our method.\n","authors":["Xiaojia Chen","Xuanhan Wang","Lianli Gao","Beitao Chen","Jingkuan Song","HenTao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.12218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12216v1","updated":"2023-08-23T15:52:45Z","published":"2023-08-23T15:52:45Z","title":"SG-Former: Self-guided Transformer with Evolving Token Reallocation","summary":" Vision Transformer has demonstrated impressive success across various vision\ntasks. However, its heavy computation cost, which grows quadratically with\nrespect to the token sequence length, largely limits its power in handling\nlarge feature maps. To alleviate the computation cost, previous works rely on\neither fine-grained self-attentions restricted to local small regions, or\nglobal self-attentions but to shorten the sequence length resulting in coarse\ngranularity. In this paper, we propose a novel model, termed as Self-guided\nTransformer~(SG-Former), towards effective global self-attention with adaptive\nfine granularity. At the heart of our approach is to utilize a significance\nmap, which is estimated through hybrid-scale self-attention and evolves itself\nduring training, to reallocate tokens based on the significance of each region.\nIntuitively, we assign more tokens to the salient regions for achieving\nfine-grained attention, while allocating fewer tokens to the minor regions in\nexchange for efficiency and global receptive fields. The proposed SG-Former\nachieves performance superior to state of the art: our base size model achieves\n\\textbf{84.7\\%} Top-1 accuracy on ImageNet-1K, \\textbf{51.2mAP} bbAP on CoCo,\n\\textbf{52.7mIoU} on ADE20K surpassing the Swin Transformer by \\textbf{+1.3\\% /\n+2.7 mAP/ +3 mIoU}, with lower computation costs and fewer parameters. The code\nis available at\n\\href{https://github.com/OliverRensu/SG-Former}{https://github.com/OliverRensu/SG-Former}\n","authors":["Sucheng Ren","Xingyi Yang","Songhua Liu","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12216v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12213v1","updated":"2023-08-23T15:51:36Z","published":"2023-08-23T15:51:36Z","title":"CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No","summary":" Out-of-distribution (OOD) detection refers to training the model on an\nin-distribution (ID) dataset to classify whether the input images come from\nunknown classes. Considerable effort has been invested in designing various OOD\ndetection methods based on either convolutional neural networks or\ntransformers. However, zero-shot OOD detection methods driven by CLIP, which\nonly require class names for ID, have received less attention. This paper\npresents a novel method, namely CLIP saying \"no\" (\\textbf{CLIPN}), which\nempowers the logic of saying \"no\" within CLIP. Our key motivation is to equip\nCLIP with the capability of distinguishing OOD and ID samples using\npositive-semantic prompts and negation-semantic prompts. Specifically, we\ndesign a novel learnable \"no\" prompt and a \"no\" text encoder to capture\nnegation semantics within images. Subsequently, we introduce two loss\nfunctions: the image-text binary-opposite loss and the text semantic-opposite\nloss, which we use to teach CLIPN to associate images with \"no\" prompts,\nthereby enabling it to identify unknown samples. Furthermore, we propose two\nthreshold-free inference algorithms to perform OOD detection by utilizing\nnegation semantics from \"no\" prompts and the text encoder. Experimental results\non 9 benchmark datasets (3 ID datasets and 6 OOD datasets) for the OOD\ndetection task demonstrate that CLIPN, based on ViT-B-16, outperforms 7\nwell-used algorithms by at least 2.34\\% and 11.64\\% in terms of AUROC and FPR95\nfor zero-shot OOD detection on ImageNet-1K. Our CLIPN can serve as a solid\nfoundation for effectively leveraging CLIP in downstream OOD tasks. The code is\navailable on\nhttps://github.com/xmed-lab/CLIPN}{https://github.com/xmed-lab/CLIPN.\n","authors":["Hualiang Wang","Yi Li","Huifeng Yao","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2308.12213v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2206.00309v2","updated":"2023-08-23T15:51:28Z","published":"2022-06-01T08:22:34Z","title":"Label-Efficient Online Continual Object Detection in Streaming Video","summary":" Humans can watch a continuous video stream and effortlessly perform continual\nacquisition and transfer of new knowledge with minimal supervision yet\nretaining previously learnt experiences. In contrast, existing continual\nlearning (CL) methods require fully annotated labels to effectively learn from\nindividual frames in a video stream. Here, we examine a more realistic and\nchallenging problem$\\unicode{x2014}$Label-Efficient Online Continual Object\nDetection (LEOCOD) in streaming video. We propose a plug-and-play module,\nEfficient-CLS, that can be easily inserted into and improve existing continual\nlearners for object detection in video streams with reduced data annotation\ncosts and model retraining time. We show that our method has achieved\nsignificant improvement with minimal forgetting across all supervision levels\non two challenging CL benchmarks for streaming real-world videos. Remarkably,\nwith only 25% annotated video frames, our method still outperforms the base CL\nlearners, which are trained with 100% annotations on all video frames. The data\nand source code will be publicly available at\nhttps://github.com/showlab/Efficient-CLS.\n","authors":["Jay Zhangjie Wu","David Junhao Zhang","Wynne Hsu","Mengmi Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2206.00309v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12199v1","updated":"2023-08-23T15:38:26Z","published":"2023-08-23T15:38:26Z","title":"Towards Real-Time Analysis of Broadcast Badminton Videos","summary":" Analysis of player movements is a crucial subset of sports analysis. Existing\nplayer movement analysis methods use recorded videos after the match is over.\nIn this work, we propose an end-to-end framework for player movement analysis\nfor badminton matches on live broadcast match videos. We only use the visual\ninputs from the match and, unlike other approaches which use multi-modal sensor\ndata, our approach uses only visual cues. We propose a method to calculate the\non-court distance covered by both the players from the video feed of a live\nbroadcast badminton match. To perform this analysis, we focus on the gameplay\nby removing replays and other redundant parts of the broadcast match. We then\nperform player tracking to identify and track the movements of both players in\neach frame. Finally, we calculate the distance covered by each player and the\naverage speed with which they move on the court. We further show a heatmap of\nthe areas covered by the player on the court which is useful for analyzing the\ngameplay of the player. Our proposed framework was successfully used to analyze\nlive broadcast matches in real-time during the Premier Badminton League 2019\n(PBL 2019), with commentators and broadcasters appreciating the utility.\n","authors":["Nitin Nilesh","Tushar Sharma","Anurag Ghosh","C. V. Jawahar"],"pdf_url":"https://arxiv.org/pdf/2308.12199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12191v1","updated":"2023-08-23T15:27:50Z","published":"2023-08-23T15:27:50Z","title":"Sign Language Translation with Iterative Prototype","summary":" This paper presents IP-SLT, a simple yet effective framework for sign\nlanguage translation (SLT). Our IP-SLT adopts a recurrent structure and\nenhances the semantic representation (prototype) of the input sign language\nvideo via an iterative refinement manner. Our idea mimics the behavior of human\nreading, where a sentence can be digested repeatedly, till reaching accurate\nunderstanding. Technically, IP-SLT consists of feature extraction, prototype\ninitialization, and iterative prototype refinement. The initialization module\ngenerates the initial prototype based on the visual feature extracted by the\nfeature extraction module. Then, the iterative refinement module leverages the\ncross-attention mechanism to polish the previous prototype by aggregating it\nwith the original video feature. Through repeated refinement, the prototype\nfinally converges to a more stable and accurate state, leading to a fluent and\nappropriate translation. In addition, to leverage the sequential dependence of\nprototypes, we further propose an iterative distillation loss to compress the\nknowledge of the final iteration into previous ones. As the autoregressive\ndecoding process is executed only once in inference, our IP-SLT is ready to\nimprove various SLT systems with acceptable overhead. Extensive experiments are\nconducted on public benchmarks to demonstrate the effectiveness of the IP-SLT.\n","authors":["Huijie Yao","Wengang Zhou","Hao Feng","Hezhen Hu","Hao Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.12191v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.10891v2","updated":"2023-08-23T15:16:42Z","published":"2023-03-20T06:16:22Z","title":"Non-Exemplar Online Class-incremental Continual Learning via\n Dual-prototype Self-augment and Refinement","summary":" This paper investigates a new, practical, but challenging problem named\nNon-exemplar Online Class-incremental continual Learning (NO-CL), which aims to\npreserve the discernibility of base classes without buffering data examples and\nefficiently learn novel classes continuously in a single-pass (i.e., online)\ndata stream. The challenges of this task are mainly two-fold: (1) Both base and\nnovel classes suffer from severe catastrophic forgetting as no previous samples\nare available for replay. (2) As the online data can only be observed once,\nthere is no way to fully re-train the whole model, e.g., re-calibrate the\ndecision boundaries via prototype alignment or feature distillation. In this\npaper, we propose a novel Dual-prototype Self-augment and Refinement method\n(DSR) for NO-CL problem, which consists of two strategies: 1) Dual class\nprototypes: vanilla and high-dimensional prototypes are exploited to utilize\nthe pre-trained information and obtain robust quasi-orthogonal representations\nrather than example buffers for both privacy preservation and memory reduction.\n2) Self-augment and refinement: Instead of updating the whole network, we\noptimize high-dimensional prototypes alternatively with the extra projection\nmodule based on self-augment vanilla prototypes, through a bi-level\noptimization problem. Extensive experiments demonstrate the effectiveness and\nsuperiority of the proposed DSR in NO-CL.\n","authors":["Fushuo Huo","Wenchao Xu","Jingcai Guo","Haozhao Wang","Yunfeng Fan","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2303.10891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10410v2","updated":"2023-08-23T15:15:59Z","published":"2023-04-20T15:48:50Z","title":"Radar-Camera Fusion for Object Detection and Semantic Segmentation in\n Autonomous Driving: A Comprehensive Review","summary":" Driven by deep learning techniques, perception technology in autonomous\ndriving has developed rapidly in recent years, enabling vehicles to accurately\ndetect and interpret surrounding environment for safe and efficient navigation.\nTo achieve accurate and robust perception capabilities, autonomous vehicles are\noften equipped with multiple sensors, making sensor fusion a crucial part of\nthe perception system. Among these fused sensors, radars and cameras enable a\ncomplementary and cost-effective perception of the surrounding environment\nregardless of lighting and weather conditions. This review aims to provide a\ncomprehensive guideline for radar-camera fusion, particularly concentrating on\nperception tasks related to object detection and semantic segmentation.Based on\nthe principles of the radar and camera sensors, we delve into the data\nprocessing process and representations, followed by an in-depth analysis and\nsummary of radar-camera fusion datasets. In the review of methodologies in\nradar-camera fusion, we address interrogative questions, including \"why to\nfuse\", \"what to fuse\", \"where to fuse\", \"when to fuse\", and \"how to fuse\",\nsubsequently discussing various challenges and potential research directions\nwithin this domain. To ease the retrieval and comparison of datasets and fusion\nmethods, we also provide an interactive website:\nhttps://radar-camera-fusion.github.io.\n","authors":["Shanliang Yao","Runwei Guan","Xiaoyu Huang","Zhuoxiao Li","Xiangyu Sha","Yong Yue","Eng Gee Lim","Hyungjoon Seo","Ka Lok Man","Xiaohui Zhu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2304.10410v2.pdf","comment":"Accepted by IEEE Transactions on Intelligent Vehicles (T-IV)"},{"id":"http://arxiv.org/abs/2305.07881v3","updated":"2023-08-23T14:53:59Z","published":"2023-05-13T10:00:24Z","title":"Black-box Source-free Domain Adaptation via Two-stage Knowledge\n Distillation","summary":" Source-free domain adaptation aims to adapt deep neural networks using only\npre-trained source models and target data. However, accessing the source model\nstill has a potential concern about leaking the source data, which reveals the\npatient's privacy. In this paper, we study the challenging but practical\nproblem: black-box source-free domain adaptation where only the outputs of the\nsource model and target data are available. We propose a simple but effective\ntwo-stage knowledge distillation method. In Stage\n\\uppercase\\expandafter{\\romannumeral1}, we train the target model from scratch\nwith soft pseudo-labels generated by the source model in a knowledge\ndistillation manner. In Stage \\uppercase\\expandafter{\\romannumeral2}, we\ninitialize another model as the new student model to avoid the error\naccumulation caused by noisy pseudo-labels. We feed the images with weak\naugmentation to the teacher model to guide the learning of the student model.\nOur method is simple and flexible, and achieves surprising results on three\ncross-domain segmentation tasks.\n","authors":["Shuai Wang","Daoan Zhang","Zipei Yan","Shitong Shao","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2305.07881v3.pdf","comment":"The short version is accepted by IJCAI 1st International Workshop on\n Generalizing from Limited Resources in the Open World. (This version is long\n version)"},{"id":"http://arxiv.org/abs/2209.11355v3","updated":"2023-08-23T14:51:47Z","published":"2022-09-23T00:35:22Z","title":"Learning Interpretable Dynamics from Images of a Freely Rotating 3D\n Rigid Body","summary":" In many real-world settings, image observations of freely rotating 3D rigid\nbodies, such as satellites, may be available when low-dimensional measurements\nare not. However, the high-dimensionality of image data precludes the use of\nclassical estimation techniques to learn the dynamics and a lack of\ninterpretability reduces the usefulness of standard deep learning methods. In\nthis work, we present a physics-informed neural network model to estimate and\npredict 3D rotational dynamics from image sequences. We achieve this using a\nmulti-stage prediction pipeline that maps individual images to a latent\nrepresentation homeomorphic to $\\mathbf{SO}(3)$, computes angular velocities\nfrom latent pairs, and predicts future latent states using the Hamiltonian\nequations of motion with a learned representation of the Hamiltonian. We\ndemonstrate the efficacy of our approach on a new rotating rigid-body dataset\nwith sequences of rotating cubes and rectangular prisms with uniform and\nnon-uniform density.\n","authors":["Justice Mason","Christine Allen-Blanchette","Nicholas Zolman","Elizabeth Davison","Naomi Leonard"],"pdf_url":"https://arxiv.org/pdf/2209.11355v3.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.12168v1","updated":"2023-08-23T14:35:03Z","published":"2023-08-23T14:35:03Z","title":"Tumor-Centered Patching for Enhanced Medical Image Segmentation","summary":" The realm of medical image diagnosis has advanced significantly with the\nintegration of computer-aided diagnosis and surgical systems. However,\nchallenges persist, particularly in achieving precise image segmentation. While\ndeep learning techniques show potential, obstacles like limited resources, slow\nconvergence, and class imbalance impede their effectiveness. Traditional\npatch-based methods, though common, struggle to capture intricate tumor\nboundaries and often lead to redundant samples, compromising computational\nefficiency and feature quality. To tackle these issues, this research\nintroduces an innovative approach centered on the tumor itself for patch-based\nimage analysis. This novel tumor-centered patching method aims to address the\nclass imbalance and boundary deficiencies, enabling focused and accurate tumor\nsegmentation. By aligning patches with the tumor's anatomical context, this\ntechnique enhances feature extraction accuracy and reduces computational load.\nExperimental results demonstrate improved class imbalance, with segmentation\nscores of 0.78, 0.76, and 0.71 for whole, core, and enhancing tumors,\nrespectively using a lightweight simple U-Net. This approach shows potential\nfor enhancing medical image segmentation and improving computer-aided diagnosis\nsystems.\n","authors":["Mutyyba Asghar","Ahmad Raza Shahid","Akhtar Jamil","Kiran Aftab","Syed Ather Enam"],"pdf_url":"https://arxiv.org/pdf/2308.12168v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.12163v1","updated":"2023-08-23T14:25:22Z","published":"2023-08-23T14:25:22Z","title":"NPF-200: A Multi-Modal Eye Fixation Dataset and Method for\n Non-Photorealistic Videos","summary":" Non-photorealistic videos are in demand with the wave of the metaverse, but\nlack of sufficient research studies. This work aims to take a step forward to\nunderstand how humans perceive non-photorealistic videos with eye fixation\n(\\ie, saliency detection), which is critical for enhancing media production,\nartistic design, and game user experience. To fill in the gap of missing a\nsuitable dataset for this research line, we present NPF-200, the first\nlarge-scale multi-modal dataset of purely non-photorealistic videos with eye\nfixations. Our dataset has three characteristics: 1) it contains soundtracks\nthat are essential according to vision and psychological studies; 2) it\nincludes diverse semantic content and videos are of high-quality; 3) it has\nrich motions across and within videos. We conduct a series of analyses to gain\ndeeper insights into this task and compare several state-of-the-art methods to\nexplore the gap between natural images and non-photorealistic data.\nAdditionally, as the human attention system tends to extract visual and audio\nfeatures with different frequencies, we propose a universal frequency-aware\nmulti-modal non-photorealistic saliency detection model called NPSNet,\ndemonstrating the state-of-the-art performance of our task. The results uncover\nstrengths and weaknesses of multi-modal network design and multi-domain\ntraining, opening up promising directions for future works. {Our dataset and\ncode can be found at \\url{https://github.com/Yangziyu/NPF200}}.\n","authors":["Ziyu Yang","Sucheng Ren","Zongwei Wu","Nanxuan Zhao","Junle Wang","Jing Qin","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2308.12163v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2303.11702v4","updated":"2023-08-23T14:18:30Z","published":"2023-03-21T09:42:27Z","title":"On the link between generative semi-supervised learning and generative\n open-set recognition","summary":" This study investigates the relationship between semi-supervised learning\n(SSL, which is training off partially labelled datasets) and open-set\nrecognition (OSR, which is classification with simultaneous novelty detection)\nunder the context of generative adversarial networks (GANs). Although no\nprevious study has formally linked SSL and OSR, their respective methods share\nstriking similarities. Specifically, SSL-GANs and OSR-GANs require their\ngenerators to produce 'bad-looking' samples which are used to regularise their\nclassifier networks. We hypothesise that the definitions of bad-looking samples\nin SSL and OSR represents the same concept and realises the same goal. More\nformally, bad-looking samples lie in the complementary space, which is the area\nbetween and around the boundaries of the labelled categories within the\nclassifier's embedding space. By regularising a classifier with samples in the\ncomplementary space, classifiers achieve improved generalisation for SSL and\nalso generalise the open space for OSR. To test this hypothesis, we compare a\nfoundational SSL-GAN with the state-of-the-art OSR-GAN under the same SSL-OSR\nexperimental conditions. Our results find that SSL-GANs achieve near identical\nresults to OSR-GANs, proving the SSL-OSR link. Subsequently, to further this\nnew research path, we compare several SSL-GANs various SSL-OSR setups which\nthis first benchmark results. A combined framework of SSL-OSR certainly\nimproves the practicality and cost-efficiency of classifier training, and so\nfurther theoretical and application studies are also discussed.\n","authors":["Emile Reyn Engelbrecht","Johan du Preez"],"pdf_url":"https://arxiv.org/pdf/2303.11702v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12156v1","updated":"2023-08-23T14:17:44Z","published":"2023-08-23T14:17:44Z","title":"Multimodal Latent Emotion Recognition from Micro-expression and\n Physiological Signals","summary":" This paper discusses the benefits of incorporating multimodal data for\nimproving latent emotion recognition accuracy, focusing on micro-expression\n(ME) and physiological signals (PS). The proposed approach presents a novel\nmultimodal learning framework that combines ME and PS, including a 1D separable\nand mixable depthwise inception network, a standardised normal distribution\nweighted feature fusion method, and depth/physiology guided attention modules\nfor multimodal learning. Experimental results show that the proposed approach\noutperforms the benchmark method, with the weighted fusion method and guided\nattention modules both contributing to enhanced performance.\n","authors":["Liangfei Zhang","Yifei Qian","Ognjen Arandjelovic","Anthony Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.12156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12143v1","updated":"2023-08-23T14:00:58Z","published":"2023-08-23T14:00:58Z","title":"A Probabilistic Fluctuation based Membership Inference Attack for\n Generative Models","summary":" Membership Inference Attack (MIA) identifies whether a record exists in a\nmachine learning model's training set by querying the model. MIAs on the\nclassic classification models have been well-studied, and recent works have\nstarted to explore how to transplant MIA onto generative models. Our\ninvestigation indicates that existing MIAs designed for generative models\nmainly depend on the overfitting in target models. However, overfitting can be\navoided by employing various regularization techniques, whereas existing MIAs\ndemonstrate poor performance in practice. Unlike overfitting, memorization is\nessential for deep learning models to attain optimal performance, making it a\nmore prevalent phenomenon. Memorization in generative models leads to an\nincreasing trend in the probability distribution of generating records around\nthe member record. Therefore, we propose a Probabilistic Fluctuation Assessing\nMembership Inference Attack (PFAMI), a black-box MIA that infers memberships by\ndetecting these trends via analyzing the overall probabilistic fluctuations\naround given records. We conduct extensive experiments across multiple\ngenerative models and datasets, which demonstrate PFAMI can improve the attack\nsuccess rate (ASR) by about 27.9% when compared with the best baseline.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.12143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10718v2","updated":"2023-08-23T13:56:52Z","published":"2023-08-21T13:39:04Z","title":"Backdooring Textual Inversion for Concept Censorship","summary":" Recent years have witnessed success in AIGC (AI Generated Content). People\ncan make use of a pre-trained diffusion model to generate images of high\nquality or freely modify existing pictures with only prompts in nature\nlanguage. More excitingly, the emerging personalization techniques make it\nfeasible to create specific-desired images with only a few images as\nreferences. However, this induces severe threats if such advanced techniques\nare misused by malicious users, such as spreading fake news or defaming\nindividual reputations. Thus, it is necessary to regulate personalization\nmodels (i.e., concept censorship) for their development and advancement.\n In this paper, we focus on the personalization technique dubbed Textual\nInversion (TI), which is becoming prevailing for its lightweight nature and\nexcellent performance. TI crafts the word embedding that contains detailed\ninformation about a specific object. Users can easily download the word\nembedding from public websites like Civitai and add it to their own stable\ndiffusion model without fine-tuning for personalization. To achieve the concept\ncensorship of a TI model, we propose leveraging the backdoor technique for good\nby injecting backdoors into the Textual Inversion embeddings. Briefly, we\nselect some sensitive words as triggers during the training of TI, which will\nbe censored for normal use. In the subsequent generation stage, if the triggers\nare combined with personalized embeddings as final prompts, the model will\noutput a pre-defined target image rather than images including the desired\nmalicious concept.\n To demonstrate the effectiveness of our approach, we conduct extensive\nexperiments on Stable Diffusion, a prevailing open-sourced text-to-image model.\nOur code, data, and results are available at\nhttps://concept-censorship.github.io.\n","authors":["Yutong Wu","Jie Zhang","Florian Kerschbaum","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10718v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12139v1","updated":"2023-08-23T13:54:15Z","published":"2023-08-23T13:54:15Z","title":"Mesh Conflation of Oblique Photogrammetric Models using Virtual Cameras\n and Truncated Signed Distance Field","summary":" Conflating/stitching 2.5D raster digital surface models (DSM) into a large\none has been a running practice in geoscience applications, however, conflating\nfull-3D mesh models, such as those from oblique photogrammetry, is extremely\nchallenging. In this letter, we propose a novel approach to address this\nchallenge by conflating multiple full-3D oblique photogrammetric models into a\nsingle, and seamless mesh for high-resolution site modeling. Given two or more\nindividually collected and created photogrammetric meshes, we first propose to\ncreate a virtual camera field (with a panoramic field of view) to incubate\nvirtual spaces represented by Truncated Signed Distance Field (TSDF), an\nimplicit volumetric field friendly for linear 3D fusion; then we adaptively\nleverage the truncated bound of meshes in TSDF to conflate them into a single\nand accurate full 3D site model. With drone-based 3D meshes, we show that our\napproach significantly improves upon traditional methods for model conflations,\nto drive new potentials to create excessively large and accurate full 3D mesh\nmodels in support of geoscience and environmental applications.\n","authors":["Shuang Song","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2308.12139v1.pdf","comment":"5 Figures"},{"id":"http://arxiv.org/abs/2308.12138v1","updated":"2023-08-23T13:51:54Z","published":"2023-08-23T13:51:54Z","title":"Select-and-Combine (SAC): A Novel Multi-Stereo Depth Fusion Algorithm\n for Point Cloud Generation via Efficient Local Markov Netlets","summary":" Many practical systems for image-based surface reconstruction employ a\nstereo/multi-stereo paradigm, due to its ability to scale for large scenes and\nits ease of implementation for out-of-core operations. In this process,\nmultiple and abundant depth maps from stereo matching must be combined and\nfused into a single, consistent, and clean point cloud. However, the noises and\noutliers caused by stereo matching and the heterogenous geometric errors of the\nposes present a challenge for existing fusion algorithms, since they mostly\nassume Gaussian errors and predict fused results based on data from local\nspatial neighborhoods, which may inherit uncertainties from multiple depths\nresulting in lowered accuracy. In this paper, we propose a novel depth fusion\nparadigm, that instead of numerically fusing points from multiple depth maps,\nselects the best depth map per point, and combines them into a single and clean\npoint cloud. This paradigm, called select-and-combine (SAC), is achieved\nthrough modeling the point level fusion using local Markov Netlets, a\nmicro-network over point across neighboring views for depth/view selection,\nfollowed by a Netlets collapse process for point combination. The Markov\nNetlets are optimized such that they can inherently leverage spatial\nconsistencies among depth maps of neighboring views, thus they can address\nerrors beyond Gaussian ones. Our experiment results show that our approach\noutperforms existing depth fusion approaches by increasing the F1 score that\nconsiders both accuracy and completeness by 2.07% compared to the best existing\nmethod. Finally, our approach generates clearer point clouds that are 18% less\nredundant while with a higher accuracy before fusion\n","authors":["Mostafa Elhashash","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2308.12138v1.pdf","comment":"6 Figures"},{"id":"http://arxiv.org/abs/2308.12133v1","updated":"2023-08-23T13:43:42Z","published":"2023-08-23T13:43:42Z","title":"Lite-HRNet Plus: Fast and Accurate Facial Landmark Detection","summary":" Facial landmark detection is an essential technology for driver status\ntracking and has been in demand for real-time estimations. As a landmark\ncoordinate prediction, heatmap-based methods are known to achieve a high\naccuracy, and Lite-HRNet can achieve a fast estimation. However, with\nLite-HRNet, the problem of a heavy computational cost of the fusion block,\nwhich connects feature maps with different resolutions, has yet to be solved.\nIn addition, the strong output module used in HRNetV2 is not applied to\nLite-HRNet. Given these problems, we propose a novel architecture called\nLite-HRNet Plus. Lite-HRNet Plus achieves two improvements: a novel fusion\nblock based on a channel attention and a novel output module with less\ncomputational intensity using multi-resolution feature maps. Through\nexperiments conducted on two facial landmark datasets, we confirmed that\nLite-HRNet Plus further improved the accuracy in comparison with conventional\nmethods, and achieved a state-of-the-art accuracy with a computational\ncomplexity with the range of 10M FLOPs.\n","authors":["Sota Kato","Kazuhiro Hotta","Yuhki Hatakeyama","Yoshinori Konishi"],"pdf_url":"https://arxiv.org/pdf/2308.12133v1.pdf","comment":"Accepted at ICIP2023"},{"id":"http://arxiv.org/abs/2308.12127v1","updated":"2023-08-23T13:33:39Z","published":"2023-08-23T13:33:39Z","title":"Masking Strategies for Background Bias Removal in Computer Vision Models","summary":" Models for fine-grained image classification tasks, where the difference\nbetween some classes can be extremely subtle and the number of samples per\nclass tends to be low, are particularly prone to picking up background-related\nbiases and demand robust methods to handle potential examples with\nout-of-distribution (OOD) backgrounds. To gain deeper insights into this\ncritical problem, our research investigates the impact of background-induced\nbias on fine-grained image classification, evaluating standard backbone models\nsuch as Convolutional Neural Network (CNN) and Vision Transformers (ViT). We\nexplore two masking strategies to mitigate background-induced bias: Early\nmasking, which removes background information at the (input) image level, and\nlate masking, which selectively masks high-level spatial features corresponding\nto the background. Extensive experiments assess the behavior of CNN and ViT\nmodels under different masking strategies, with a focus on their generalization\nto OOD backgrounds. The obtained findings demonstrate that both proposed\nstrategies enhance OOD performance compared to the baseline models, with early\nmasking consistently exhibiting the best OOD performance. Notably, a ViT\nvariant employing GAP-Pooled Patch token-based classification combined with\nearly masking achieves the highest OOD robustness.\n","authors":["Ananthu Aniraj","Cassio F. Dantas","Dino Ienco","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2308.12127v1.pdf","comment":"Accepted at the 2023 IEEE/CVF International Conference on Computer\n Vision Workshop (ICCVW) on Out Of Distribution Generalization in Computer\n Vision (OOD-CV)"},{"id":"http://arxiv.org/abs/2211.13579v2","updated":"2023-08-23T13:20:44Z","published":"2022-11-24T13:08:43Z","title":"Knowledge-Aware Federated Active Learning with Non-IID Data","summary":" Federated learning enables multiple decentralized clients to learn\ncollaboratively without sharing the local training data. However, the expensive\nannotation cost to acquire data labels on local clients remains an obstacle in\nutilizing local data. In this paper, we propose a federated active learning\nparadigm to efficiently learn a global model with limited annotation budget\nwhile protecting data privacy in a decentralized learning way. The main\nchallenge faced by federated active learning is the mismatch between the active\nsampling goal of the global model on the server and that of the asynchronous\nlocal clients. This becomes even more significant when data is distributed\nnon-IID across local clients. To address the aforementioned challenge, we\npropose Knowledge-Aware Federated Active Learning (KAFAL), which consists of\nKnowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory\nFederated Update (KCFU). KSAS is a novel active sampling method tailored for\nthe federated active learning problem. It deals with the mismatch challenge by\nsampling actively based on the discrepancies between local and global models.\nKSAS intensifies specialized knowledge in local clients, ensuring the sampled\ndata to be informative for both the local clients and the global model. KCFU,\nin the meantime, deals with the client heterogeneity caused by limited data and\nnon-IID data distributions. It compensates for each client's ability in weak\nclasses by the assistance of the global model. Extensive experiments and\nanalyses are conducted to show the superiority of KSAS over the\nstate-of-the-art active learning methods and the efficiency of KCFU under the\nfederated active learning framework.\n","authors":["Yu-Tong Cao","Ye Shi","Baosheng Yu","Jingya Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2211.13579v2.pdf","comment":"14 pages, 12 figures, ICCV23"},{"id":"http://arxiv.org/abs/2304.04521v3","updated":"2023-08-23T13:11:20Z","published":"2023-04-10T11:35:42Z","title":"Zero-Shot In-Distribution Detection in Multi-Object Settings Using\n Vision-Language Foundation Models","summary":" Extracting in-distribution (ID) images from noisy images scraped from the\nInternet is an important preprocessing for constructing datasets, which has\ntraditionally been done manually. Automating this preprocessing with deep\nlearning techniques presents two key challenges. First, images should be\ncollected using only the name of the ID class without training on the ID data.\nSecond, as we can see why COCO was created, it is crucial to identify images\ncontaining not only ID objects but also both ID and out-of-distribution (OOD)\nobjects as ID images to create robust recognizers. In this paper, we propose a\nnovel problem setting called zero-shot in-distribution (ID) detection, where we\nidentify images containing ID objects as ID images (even if they contain OOD\nobjects), and images lacking ID objects as OOD images without any training. To\nsolve this problem, we leverage the powerful zero-shot capability of CLIP and\npresent a simple and effective approach, Global-Local Maximum Concept Matching\n(GL-MCM), based on both global and local visual-text alignments of CLIP\nfeatures. Extensive experiments demonstrate that GL-MCM outperforms comparison\nmethods on both multi-object datasets and single-object ImageNet benchmarks.\nThe code will be available via https://github.com/AtsuMiyai/GL-MCM.\n","authors":["Atsuyuki Miyai","Qing Yu","Go Irie","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2304.04521v3.pdf","comment":"v3: I fixed some typos from v2"},{"id":"http://arxiv.org/abs/2308.12116v1","updated":"2023-08-23T13:10:33Z","published":"2023-08-23T13:10:33Z","title":"The TYC Dataset for Understanding Instance-Level Semantics and Motions\n of Cells in Microstructures","summary":" Segmenting cells and tracking their motion over time is a common task in\nbiomedical applications. However, predicting accurate instance-wise\nsegmentation and cell motions from microscopy imagery remains a challenging\ntask. Using microstructured environments for analyzing single cells in a\nconstant flow of media adds additional complexity. While large-scale labeled\nmicroscopy datasets are available, we are not aware of any large-scale dataset,\nincluding both cells and microstructures. In this paper, we introduce the\ntrapped yeast cell (TYC) dataset, a novel dataset for understanding\ninstance-level semantics and motions of cells in microstructures. We release\n$105$ dense annotated high-resolution brightfield microscopy images, including\nabout $19$k instance masks. We also release $261$ curated video clips composed\nof $1293$ high-resolution microscopy images to facilitate unsupervised\nunderstanding of cell motions and morphology. TYC offers ten times more\ninstance annotations than the previously largest dataset, including cells and\nmicrostructures. Our effort also exceeds previous attempts in terms of\nmicrostructure variability, resolution, complexity, and capturing device\n(microscopy) variability. We facilitate a unified comparison on our novel\ndataset by introducing a standardized evaluation strategy. TYC and evaluation\ncode are publicly available under CC BY 4.0 license.\n","authors":["Christoph Reich","Tim Prangemeier","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2308.12116v1.pdf","comment":"Accepted at ICCV 2023 Workshop on BioImage Computing. Project page\n (with links to the dataset and code):\n https://christophreich1996.github.io/tyc_dataset/"},{"id":"http://arxiv.org/abs/2308.12114v1","updated":"2023-08-23T13:09:03Z","published":"2023-08-23T13:09:03Z","title":"Less is More -- Towards parsimonious multi-task models using structured\n sparsity","summary":" Group sparsity in Machine Learning (ML) encourages simpler, more\ninterpretable models with fewer active parameter groups. This work aims to\nincorporate structured group sparsity into the shared parameters of a\nMulti-Task Learning (MTL) framework, to develop parsimonious models that can\neffectively address multiple tasks with fewer parameters while maintaining\ncomparable or superior performance to a dense model. Sparsifying the model\nduring training helps decrease the model's memory footprint, computation\nrequirements, and prediction time during inference. We use channel-wise l1/l2\ngroup sparsity in the shared layers of the Convolutional Neural Network (CNN).\nThis approach not only facilitates the elimination of extraneous groups\n(channels) but also imposes a penalty on the weights, thereby enhancing the\nlearning of all tasks. We compare the outcomes of single-task and multi-task\nexperiments under group sparsity on two publicly available MTL datasets, NYU-v2\nand CelebAMask-HQ. We also investigate how changing the sparsification degree\nimpacts both the performance of the model and the sparsity of groups.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.12114v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.12113v1","updated":"2023-08-23T13:06:59Z","published":"2023-08-23T13:06:59Z","title":"Advancements in Point Cloud Data Augmentation for Deep Learning: A\n Survey","summary":" Point cloud has a wide range of applications in areas such as autonomous\ndriving, mapping, navigation, scene reconstruction, and medical imaging. Due to\nits great potentials in these applications, point cloud processing has gained\ngreat attention in the field of computer vision. Among various point cloud\nprocessing techniques, deep learning (DL) has become one of the mainstream and\neffective methods for tasks such as detection, segmentation and classification.\nTo reduce overfitting during training DL models and improve model performance\nespecially when the amount and/or diversity of training data are limited,\naugmentation is often crucial. Although various point cloud data augmentation\nmethods have been widely used in different point cloud processing tasks, there\nare currently no published systematic surveys or reviews of these methods.\nTherefore, this article surveys and discusses these methods and categorizes\nthem into a taxonomy framework. Through the comprehensive evaluation and\ncomparison of the augmentation methods, this article identifies their\npotentials and limitations and suggests possible future research directions.\nThis work helps researchers gain a holistic understanding of the current status\nof point cloud data augmentation and promotes its wider application and\ndevelopment.\n","authors":["Qinfeng Zhu","Lei Fan","Ningxin Weng"],"pdf_url":"https://arxiv.org/pdf/2308.12113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12112v1","updated":"2023-08-23T13:02:52Z","published":"2023-08-23T13:02:52Z","title":"Generalized Continual Category Discovery","summary":" Most of Continual Learning (CL) methods push the limit of supervised learning\nsettings, where an agent is expected to learn new labeled tasks and not forget\nprevious knowledge. However, these settings are not well aligned with real-life\nscenarios, where a learning agent has access to a vast amount of unlabeled data\nencompassing both novel (entirely unlabeled) classes and examples from known\nclasses. Drawing inspiration from Generalized Category Discovery (GCD), we\nintroduce a novel framework that relaxes this assumption. Precisely, in any\ntask, we allow for the existence of novel and known classes, and one must use\ncontinual version of unsupervised learning methods to discover them. We call\nthis setting Generalized Continual Category Discovery (GCCD). It unifies CL and\nGCD, bridging the gap between synthetic benchmarks and real-life scenarios.\nWith a series of experiments, we present that existing methods fail to\naccumulate knowledge from subsequent tasks in which unlabeled samples of novel\nclasses are present. In light of these limitations, we propose a method that\nincorporates both supervised and unsupervised signals and mitigates the\nforgetting through the use of centroid adaptation. Our method surpasses strong\nCL methods adopted for GCD techniques and presents a superior representation\nlearning performance.\n","authors":["Daniel Marczak","Grzegorz Rypeść","Sebastian Cygert","Tomasz Trzciński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2308.12112v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12111v1","updated":"2023-08-23T12:58:51Z","published":"2023-08-23T12:58:51Z","title":"Cross-Modality Proposal-guided Feature Mining for Unregistered\n RGB-Thermal Pedestrian Detection","summary":" RGB-Thermal (RGB-T) pedestrian detection aims to locate the pedestrians in\nRGB-T image pairs to exploit the complementation between the two modalities for\nimproving detection robustness in extreme conditions. Most existing algorithms\nassume that the RGB-T image pairs are well registered, while in the real world\nthey are not aligned ideally due to parallax or different field-of-view of the\ncameras. The pedestrians in misaligned image pairs may locate at different\npositions in two images, which results in two challenges: 1) how to achieve\ninter-modality complementation using spatially misaligned RGB-T pedestrian\npatches, and 2) how to recognize the unpaired pedestrians at the boundary. To\ndeal with these issues, we propose a new paradigm for unregistered RGB-T\npedestrian detection, which predicts two separate pedestrian locations in the\nRGB and thermal images, respectively. Specifically, we propose a cross-modality\nproposal-guided feature mining (CPFM) mechanism to extract the two precise\nfusion features for representing the pedestrian in the two modalities, even if\nthe RGB-T image pair is unaligned. It enables us to effectively exploit the\ncomplementation between the two modalities. With the CPFM mechanism, we build a\ntwo-stream dense detector; it predicts the two pedestrian locations in the two\nmodalities based on the corresponding fusion feature mined by the CPFM\nmechanism. Besides, we design a data augmentation method, named Homography, to\nsimulate the discrepancy in scales and views between images. We also\ninvestigate two non-maximum suppression (NMS) methods for post-processing.\nFavorable experimental results demonstrate the effectiveness and robustness of\nour method in dealing with unregistered pedestrians with different shifts.\n","authors":["Chao Tian","Zikun Zhou","Yuqing Huang","Gaojun Li","Zhenyu He"],"pdf_url":"https://arxiv.org/pdf/2308.12111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02051v2","updated":"2023-08-23T12:45:27Z","published":"2023-04-04T18:03:04Z","title":"Multimodal Garment Designer: Human-Centric Latent Diffusion Models for\n Fashion Image Editing","summary":" Fashion illustration is used by designers to communicate their vision and to\nbring the design idea from conceptualization to realization, showing how\nclothes interact with the human body. In this context, computer vision can thus\nbe used to improve the fashion design process. Differently from previous works\nthat mainly focused on the virtual try-on of garments, we propose the task of\nmultimodal-conditioned fashion image editing, guiding the generation of\nhuman-centric fashion images by following multimodal prompts, such as text,\nhuman body poses, and garment sketches. We tackle this problem by proposing a\nnew architecture based on latent diffusion models, an approach that has not\nbeen used before in the fashion domain. Given the lack of existing datasets\nsuitable for the task, we also extend two existing fashion datasets, namely\nDress Code and VITON-HD, with multimodal annotations collected in a\nsemi-automatic manner. Experimental results on these new datasets demonstrate\nthe effectiveness of our proposal, both in terms of realism and coherence with\nthe given multimodal inputs. Source code and collected multimodal annotations\nare publicly available at:\nhttps://github.com/aimagelab/multimodal-garment-designer.\n","authors":["Alberto Baldrati","Davide Morelli","Giuseppe Cartella","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2304.02051v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12084v1","updated":"2023-08-23T12:07:39Z","published":"2023-08-23T12:07:39Z","title":"DISGAN: Wavelet-informed Discriminator Guides GAN to MRI\n Super-resolution with Noise Cleaning","summary":" MRI super-resolution (SR) and denoising tasks are fundamental challenges in\nthe field of deep learning, which have traditionally been treated as distinct\ntasks with separate paired training data. In this paper, we propose an\ninnovative method that addresses both tasks simultaneously using a single deep\nlearning model, eliminating the need for explicitly paired noisy and clean\nimages during training. Our proposed model is primarily trained for SR, but\nalso exhibits remarkable noise-cleaning capabilities in the super-resolved\nimages. Instead of conventional approaches that introduce frequency-related\noperations into the generative process, our novel approach involves the use of\na GAN model guided by a frequency-informed discriminator. To achieve this, we\nharness the power of the 3D Discrete Wavelet Transform (DWT) operation as a\nfrequency constraint within the GAN framework for the SR task on magnetic\nresonance imaging (MRI) data. Specifically, our contributions include: 1) a 3D\ngenerator based on residual-in-residual connected blocks; 2) the integration of\nthe 3D DWT with $1\\times 1$ convolution into a DWT+conv unit within a 3D Unet\nfor the discriminator; 3) the use of the trained model for high-quality image\nSR, accompanied by an intrinsic denoising process. We dub the model \"Denoising\nInduced Super-resolution GAN (DISGAN)\" due to its dual effects of SR image\ngeneration and simultaneous denoising. Departing from the traditional approach\nof training SR and denoising tasks as separate models, our proposed DISGAN is\ntrained only on the SR task, but also achieves exceptional performance in\ndenoising. The model is trained on 3D MRI data from dozens of subjects from the\nHuman Connectome Project (HCP) and further evaluated on previously unseen MRI\ndata from subjects with brain tumours and epilepsy to assess its denoising and\nSR performance.\n","authors":["Qi Wang","Lucas Mahler","Julius Steiglechner","Florian Birk","Klaus Scheffler","Gabriele Lohmann"],"pdf_url":"https://arxiv.org/pdf/2308.12084v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2306.14538v3","updated":"2023-08-23T12:03:04Z","published":"2023-06-26T09:21:13Z","title":"Learnable Differencing Center for Nighttime Depth Perception","summary":" Depth completion is the task of recovering dense depth maps from sparse ones,\nusually with the help of color images. Existing image-guided methods perform\nwell on daytime depth perception self-driving benchmarks, but struggle in\nnighttime scenarios with poor visibility and complex illumination. To address\nthese challenges, we propose a simple yet effective framework called LDCNet.\nOur key idea is to use Recurrent Inter-Convolution Differencing (RICD) and\nIllumination-Affinitive Intra-Convolution Differencing (IAICD) to enhance the\nnighttime color images and reduce the negative effects of the varying\nillumination, respectively. RICD explicitly estimates global illumination by\ndifferencing two convolutions with different kernels, treating the\nsmall-kernel-convolution feature as the center of the large-kernel-convolution\nfeature in a new perspective. IAICD softly alleviates local relative light\nintensity by differencing a single convolution, where the center is dynamically\naggregated based on neighboring pixels and the estimated illumination map in\nRICD. On both nighttime depth completion and depth estimation tasks, extensive\nexperiments demonstrate the effectiveness of our LDCNet, reaching the state of\nthe art.\n","authors":["Zhiqiang Yan","Yupeng Zheng","Chongyi Li","Jun Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2306.14538v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2303.11225v2","updated":"2023-08-23T11:46:57Z","published":"2023-03-20T16:07:02Z","title":"HiFace: High-Fidelity 3D Face Reconstruction by Learning Static and\n Dynamic Details","summary":" 3D Morphable Models (3DMMs) demonstrate great potential for reconstructing\nfaithful and animatable 3D facial surfaces from a single image. The facial\nsurface is influenced by the coarse shape, as well as the static detail (e,g.,\nperson-specific appearance) and dynamic detail (e.g., expression-driven\nwrinkles). Previous work struggles to decouple the static and dynamic details\nthrough image-level supervision, leading to reconstructions that are not\nrealistic. In this paper, we aim at high-fidelity 3D face reconstruction and\npropose HiFace to explicitly model the static and dynamic details.\nSpecifically, the static detail is modeled as the linear combination of a\ndisplacement basis, while the dynamic detail is modeled as the linear\ninterpolation of two displacement maps with polarized expressions. We exploit\nseveral loss functions to jointly learn the coarse shape and fine details with\nboth synthetic and real-world datasets, which enable HiFace to reconstruct\nhigh-fidelity 3D shapes with animatable details. Extensive quantitative and\nqualitative experiments demonstrate that HiFace presents state-of-the-art\nreconstruction quality and faithfully recovers both the static and dynamic\ndetails. Our project page can be found at https://project-hiface.github.io.\n","authors":["Zenghao Chai","Tianke Zhang","Tianyu He","Xu Tan","Tadas Baltrušaitis","HsiangTao Wu","Runnan Li","Sheng Zhao","Chun Yuan","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2303.11225v2.pdf","comment":"Accepted to ICCV 2023, camera-ready version; Project page:\n https://project-hiface.github.io/"},{"id":"http://arxiv.org/abs/2308.12067v1","updated":"2023-08-23T11:27:30Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models acquire their instruction-following\ncapabilities through a two-stage training process: pre-training on image-text\npairs and fine-tuning on supervised vision-language instruction data. Recent\nstudies have shown that large language models can achieve satisfactory results\neven with a limited amount of high-quality instruction-following data. In this\npaper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset\ncomprising only 200 examples, amounting to approximately 6% of the\ninstruction-following data used in the alignment dataset for MiniGPT-4. We\nfirst propose several metrics to access the quality of multimodal instruction\ndata. Based on these metrics, we present a simple and effective data selector\nto automatically identify and filter low-quality vision-language data. By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations (e.g., visual question answering, GPT-4 preference).\nOverall, our findings demonstrate that less but high-quality instruction tuning\ndata is efficient to enable multimodal large language models to generate better\noutput.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10762v3","updated":"2023-08-23T11:21:29Z","published":"2023-03-19T20:31:38Z","title":"Deep Image Fingerprint: Towards Low Budget Synthetic Image Detection and\n Model Lineage Analysis","summary":" The generation of high-quality images has become widely accessible and is a\nrapidly evolving process. As a result, anyone can generate images that are\nindistinguishable from real ones. This leads to a wide range of applications,\nincluding malicious usage with deceptive intentions. Despite advances in\ndetection techniques for generated images, a robust detection method still\neludes us. Furthermore, model personalization techniques might affect the\ndetection capabilities of existing methods. In this work, we utilize the\narchitectural properties of convolutional neural networks (CNNs) to develop a\nnew detection method. Our method can detect images from a known generative\nmodel and enable us to establish relationships between fine-tuned generative\nmodels. We tested the method on images produced by both Generative Adversarial\nNetworks (GANs) and recent large text-to-image models (LTIMs) that rely on\nDiffusion Models. Our approach outperforms others trained under identical\nconditions and achieves comparable performance to state-of-the-art pre-trained\ndetection methods on images generated by Stable Diffusion and MidJourney, with\nsignificantly fewer required train samples.\n","authors":["Sergey Sinitsa","Ohad Fried"],"pdf_url":"https://arxiv.org/pdf/2303.10762v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12064v1","updated":"2023-08-23T11:16:36Z","published":"2023-08-23T11:16:36Z","title":"SILT: Shadow-aware Iterative Label Tuning for Learning to Detect Shadows\n from Noisy Labels","summary":" Existing shadow detection datasets often contain missing or mislabeled\nshadows, which can hinder the performance of deep learning models trained\ndirectly on such data. To address this issue, we propose SILT, the Shadow-aware\nIterative Label Tuning framework, which explicitly considers noise in shadow\nlabels and trains the deep model in a self-training manner. Specifically, we\nincorporate strong data augmentations with shadow counterfeiting to help the\nnetwork better recognize non-shadow regions and alleviate overfitting. We also\ndevise a simple yet effective label tuning strategy with global-local fusion\nand shadow-aware filtering to encourage the network to make significant\nrefinements on the noisy labels. We evaluate the performance of SILT by\nrelabeling the test set of the SBU dataset and conducting various experiments.\nOur results show that even a simple U-Net trained with SILT can outperform all\nstate-of-the-art methods by a large margin. When trained on SBU / UCF / ISTD,\nour network can successfully reduce the Balanced Error Rate by 25.2% / 36.9% /\n21.3% over the best state-of-the-art method.\n","authors":["Han Yang","Tianyu Wang","Xiaowei Hu","Chi-Wing Fu"],"pdf_url":"https://arxiv.org/pdf/2308.12064v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12061v1","updated":"2023-08-23T11:03:28Z","published":"2023-08-23T11:03:28Z","title":"HarvestNet: A Dataset for Detecting Smallholder Farming Activity Using\n Harvest Piles and Remote Sensing","summary":" Small farms contribute to a large share of the productive land in developing\ncountries. In regions such as sub-Saharan Africa, where 80% of farms are small\n(under 2 ha in size), the task of mapping smallholder cropland is an important\npart of tracking sustainability measures such as crop productivity. However,\nthe visually diverse and nuanced appearance of small farms has limited the\neffectiveness of traditional approaches to cropland mapping. Here we introduce\na new approach based on the detection of harvest piles characteristic of many\nsmallholder systems throughout the world. We present HarvestNet, a dataset for\nmapping the presence of farms in the Ethiopian regions of Tigray and Amhara\nduring 2020-2023, collected using expert knowledge and satellite images,\ntotaling 7k hand-labeled images and 2k ground collected labels. We also\nbenchmark a set of baselines including SOTA models in remote sensing with our\nbest models having around 80% classification performance on hand labelled data\nand 90%, 98% accuracy on ground truth data for Tigray, Amhara respectively. We\nalso perform a visual comparison with a widely used pre-existing coverage map\nand show that our model detects an extra 56,621 hectares of cropland in Tigray.\nWe conclude that remote sensing of harvest piles can contribute to more timely\nand accurate cropland assessments in food insecure region.\n","authors":["Jonathan Xu","Amna Elmustafa","Liya Weldegebriel","Emnet Negash","Richard Lee","Chenlin Meng","Stefano Ermon","David Lobell"],"pdf_url":"https://arxiv.org/pdf/2308.12061v1.pdf","comment":"18 pages, 22 figures"},{"id":"http://arxiv.org/abs/2308.12059v1","updated":"2023-08-23T10:59:41Z","published":"2023-08-23T10:59:41Z","title":"Manipulating Embeddings of Stable Diffusion Prompts","summary":" Generative text-to-image models such as Stable Diffusion allow users to\ngenerate images based on a textual description, the prompt. Changing the prompt\nis still the primary means for the user to change a generated image as desired.\nHowever, changing the image by reformulating the prompt remains a difficult\nprocess of trial and error, which has led to the emergence of prompt\nengineering as a new field of research. We propose and analyze methods to\nchange the embedding of a prompt directly instead of the prompt text. It allows\nfor more fine-grained and targeted control that takes into account user\nintentions. Our approach treats the generative text-to-image model as a\ncontinuous function and passes gradients between the image space and the prompt\nembedding space. By addressing different user interaction problems, we can\napply this idea in three scenarios: (1) Optimization of a metric defined in\nimage space that could measure, for example, image style. (2) Assistance of\nusers in creative tasks by enabling them to navigate the image space along a\nselection of directions of \"near\" prompt embeddings. (3) Changing the embedding\nof the prompt to include information that the user has seen in a particular\nseed but finds difficult to describe in the prompt. Our experiments demonstrate\nthe feasibility of the described methods.\n","authors":["Niklas Deckers","Julia Peters","Martin Potthast"],"pdf_url":"https://arxiv.org/pdf/2308.12059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12058v1","updated":"2023-08-23T10:59:20Z","published":"2023-08-23T10:59:20Z","title":"DR-Tune: Improving Fine-tuning of Pretrained Visual Models by\n Distribution Regularization with Semantic Calibration","summary":" The visual models pretrained on large-scale benchmarks encode general\nknowledge and prove effective in building more powerful representations for\ndownstream tasks. Most existing approaches follow the fine-tuning paradigm,\neither by initializing or regularizing the downstream model based on the\npretrained one. The former fails to retain the knowledge in the successive\nfine-tuning phase, thereby prone to be over-fitting, and the latter imposes\nstrong constraints to the weights or feature maps of the downstream model\nwithout considering semantic drift, often incurring insufficient optimization.\nTo deal with these issues, we propose a novel fine-tuning framework, namely\ndistribution regularization with semantic calibration (DR-Tune). It employs\ndistribution regularization by enforcing the downstream task head to decrease\nits classification error on the pretrained feature distribution, which prevents\nit from over-fitting while enabling sufficient training of downstream encoders.\nFurthermore, to alleviate the interference by semantic drift, we develop the\nsemantic calibration (SC) module to align the global shape and class centers of\nthe pretrained and downstream feature distributions. Extensive experiments on\nwidely used image classification datasets show that DR-Tune consistently\nimproves the performance when combing with various backbones under different\npretraining strategies. Code is available at:\nhttps://github.com/weeknan/DR-Tune.\n","authors":["Nan Zhou","Jiaxin Chen","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2308.12058v1.pdf","comment":"Accepted by ICCV'2023"},{"id":"http://arxiv.org/abs/2307.12907v3","updated":"2023-08-23T10:37:21Z","published":"2023-07-24T16:02:42Z","title":"GridMM: Grid Memory Map for Vision-and-Language Navigation","summary":" Vision-and-language navigation (VLN) enables the agent to navigate to a\nremote location following the natural language instruction in 3D environments.\nTo represent the previously visited environment, most approaches for VLN\nimplement memory using recurrent states, topological maps, or top-down semantic\nmaps. In contrast to these approaches, we build the top-down egocentric and\ndynamically growing Grid Memory Map (i.e., GridMM) to structure the visited\nenvironment. From a global perspective, historical observations are projected\ninto a unified grid map in a top-down view, which can better represent the\nspatial relations of the environment. From a local perspective, we further\npropose an instruction relevance aggregation method to capture fine-grained\nvisual clues in each grid region. Extensive experiments are conducted on both\nthe REVERIE, R2R, SOON datasets in the discrete environments, and the R2R-CE\ndataset in the continuous environments, showing the superiority of our proposed\nmethod.\n","authors":["Zihan Wang","Xiangyang Li","Jiahao Yang","Yeqi Liu","Shuqiang Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.12907v3.pdf","comment":"Accepted by ICCV 2023. The code is available at\n https://github.com/MrZihan/GridMM"},{"id":"http://arxiv.org/abs/2308.12049v1","updated":"2023-08-23T10:35:37Z","published":"2023-08-23T10:35:37Z","title":"Towards Privacy-Supporting Fall Detection via Deep Unsupervised\n RGB2Depth Adaptation","summary":" Fall detection is a vital task in health monitoring, as it allows the system\nto trigger an alert and therefore enabling faster interventions when a person\nexperiences a fall. Although most previous approaches rely on standard RGB\nvideo data, such detailed appearance-aware monitoring poses significant privacy\nconcerns. Depth sensors, on the other hand, are better at preserving privacy as\nthey merely capture the distance of objects from the sensor or camera, omitting\ncolor and texture information. In this paper, we introduce a privacy-supporting\nsolution that makes the RGB-trained model applicable in depth domain and\nutilizes depth data at test time for fall detection. To achieve cross-modal\nfall detection, we present an unsupervised RGB to Depth (RGB2Depth) cross-modal\ndomain adaptation approach that leverages labelled RGB data and unlabelled\ndepth data during training. Our proposed pipeline incorporates an intermediate\ndomain module for feature bridging, modality adversarial loss for modality\ndiscrimination, classification loss for pseudo-labeled depth data and labeled\nsource data, triplet loss that considers both source and target domains, and a\nnovel adaptive loss weight adjustment method for improved coordination among\nvarious losses. Our approach achieves state-of-the-art results in the\nunsupervised RGB2Depth domain adaptation task for fall detection. Code is\navailable at https://github.com/1015206533/privacy_supporting_fall_detection.\n","authors":["Hejun Xiao","Kunyu Peng","Xiangsheng Huang","Alina Roitberg1","Hao Li","Zhaohui Wang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2308.12049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12048v1","updated":"2023-08-23T10:29:25Z","published":"2023-08-23T10:29:25Z","title":"Head-Tail Cooperative Learning Network for Unbiased Scene Graph\n Generation","summary":" Scene Graph Generation (SGG) as a critical task in image understanding,\nfacing the challenge of head-biased prediction caused by the long-tail\ndistribution of predicates. However, current unbiased SGG methods can easily\nprioritize improving the prediction of tail predicates while ignoring the\nsubstantial sacrifice in the prediction of head predicates, leading to a shift\nfrom head bias to tail bias. To address this issue, we propose a model-agnostic\nHead-Tail Collaborative Learning (HTCL) network that includes head-prefer and\ntail-prefer feature representation branches that collaborate to achieve\naccurate recognition of both head and tail predicates. We also propose a\nself-supervised learning approach to enhance the prediction ability of the\ntail-prefer feature representation branch by constraining tail-prefer predicate\nfeatures. Specifically, self-supervised learning converges head predicate\nfeatures to their class centers while dispersing tail predicate features as\nmuch as possible through contrast learning and head center loss. We demonstrate\nthe effectiveness of our HTCL by applying it to various SGG models on VG150,\nOpen Images V6 and GQA200 datasets. The results show that our method achieves\nhigher mean Recall with a minimal sacrifice in Recall and achieves a new\nstate-of-the-art overall performance. Our code is available at\nhttps://github.com/wanglei0618/HTCL.\n","authors":["Lei Wang","Zejian Yuan","Yao Lu","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.12048v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.12045v1","updated":"2023-08-23T10:25:37Z","published":"2023-08-23T10:25:37Z","title":"CgT-GAN: CLIP-guided Text GAN for Image Captioning","summary":" The large-scale visual-language pre-trained model, Contrastive Language-Image\nPre-training (CLIP), has significantly improved image captioning for scenarios\nwithout human-annotated image-caption pairs. Recent advanced CLIP-based image\ncaptioning without human annotations follows a text-only training paradigm,\ni.e., reconstructing text from shared embedding space. Nevertheless, these\napproaches are limited by the training/inference gap or huge storage\nrequirements for text embeddings. Given that it is trivial to obtain images in\nthe real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates\nimages into the training process to enable the model to \"see\" real visual\nmodality. Particularly, we use adversarial training to teach CgT-GAN to mimic\nthe phrases of an external text corpus and CLIP-based reward to provide\nsemantic guidance. The caption generator is jointly rewarded based on the\ncaption naturalness to human language calculated from the GAN's discriminator\nand the semantic guidance reward computed by the CLIP-based reward module. In\naddition to the cosine similarity as the semantic guidance reward (i.e.,\nCLIP-cos), we further introduce a novel semantic guidance reward called\nCLIP-agg, which aligns the generated caption with a weighted text embedding by\nattentively aggregating the entire corpus. Experimental results on three\nsubtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms\nstate-of-the-art methods significantly across all metrics. Code is available at\nhttps://github.com/Lihr747/CgtGAN.\n","authors":["Jiarui Yu","Haoran Li","Yanbin Hao","Bin Zhu","Tong Xu","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2308.12045v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.15782v3","updated":"2023-08-23T10:02:15Z","published":"2023-06-27T20:09:56Z","title":"UTRNet: High-Resolution Urdu Text Recognition In Printed Documents","summary":" In this paper, we propose a novel approach to address the challenges of\nprinted Urdu text recognition using high-resolution, multi-scale semantic\nfeature extraction. Our proposed UTRNet architecture, a hybrid CNN-RNN model,\ndemonstrates state-of-the-art performance on benchmark datasets. To address the\nlimitations of previous works, which struggle to generalize to the intricacies\nof the Urdu script and the lack of sufficient annotated real-world data, we\nhave introduced the UTRSet-Real, a large-scale annotated real-world dataset\ncomprising over 11,000 lines and UTRSet-Synth, a synthetic dataset with 20,000\nlines closely resembling real-world and made corrections to the ground truth of\nthe existing IIITH dataset, making it a more reliable resource for future\nresearch. We also provide UrduDoc, a benchmark dataset for Urdu text line\ndetection in scanned documents. Additionally, we have developed an online tool\nfor end-to-end Urdu OCR from printed documents by integrating UTRNet with a\ntext detection model. Our work not only addresses the current limitations of\nUrdu OCR but also paves the way for future research in this area and\nfacilitates the continued advancement of Urdu OCR technology. The project page\nwith source code, datasets, annotations, trained models, and online tool is\navailable at abdur75648.github.io/UTRNet.\n","authors":["Abdur Rahman","Arjun Ghosh","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2306.15782v3.pdf","comment":"Accepted at The 17th International Conference on Document Analysis\n and Recognition (ICDAR 2023)"},{"id":"http://arxiv.org/abs/2308.12038v1","updated":"2023-08-23T09:55:41Z","published":"2023-08-23T09:55:41Z","title":"Large Multilingual Models Pivot Zero-Shot Multimodal Learning across\n Languages","summary":" Recently there has been a significant surge in multimodal learning in terms\nof both image-to-text and text-to-image generation. However, the success is\ntypically limited to English, leaving other languages largely behind. Building\na competitive counterpart in other languages is highly challenging due to the\nlow-resource nature of non-English multimodal data (i.e., lack of large-scale,\nhigh-quality image-text data). In this work, we propose MPM, an effective\ntraining paradigm for training large multimodal models in low-resource\nlanguages. MPM demonstrates that Multilingual language models can Pivot\nzero-shot Multimodal learning across languages. Specifically, based on a strong\nmultilingual large language model, multimodal models pretrained on English-only\nimage-text data can well generalize to other languages in a zero-shot manner\nfor both image-to-text and text-to-image generation, even surpassing models\ntrained on image-text data in native languages. Taking Chinese as a practice of\nMPM, we build large multimodal models VisCPM in image-to-text and text-to-image\ngeneration, which achieve state-of-the-art (open-source) performance in\nChinese. To facilitate future research, we open-source codes and model weights\nat https://github.com/OpenBMB/VisCPM.git.\n","authors":["Jinyi Hu","Yuan Yao","Chongyi Wang","Shan Wang","Yinxu Pan","Qianyu Chen","Tianyu Yu","Hanghao Wu","Yue Zhao","Haoye Zhang","Xu Han","Yankai Lin","Jiao Xue","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12038v1.pdf","comment":"https://github.com/OpenBMB/VisCPM.git"},{"id":"http://arxiv.org/abs/2308.12035v1","updated":"2023-08-23T09:49:20Z","published":"2023-08-23T09:49:20Z","title":"RefEgo: Referring Expression Comprehension Dataset from First-Person\n Perception of Ego4D","summary":" Grounding textual expressions on scene objects from first-person views is a\ntruly demanding capability in developing agents that are aware of their\nsurroundings and behave following intuitive text instructions. Such capability\nis of necessity for glass-devices or autonomous robots to localize referred\nobjects in the real-world. In the conventional referring expression\ncomprehension tasks of images, however, datasets are mostly constructed based\non the web-crawled data and don't reflect diverse real-world structures on the\ntask of grounding textual expressions in diverse objects in the real world.\nRecently, a massive-scale egocentric video dataset of Ego4D was proposed. Ego4D\ncovers around the world diverse real-world scenes including numerous indoor and\noutdoor situations such as shopping, cooking, walking, talking, manufacturing,\netc. Based on egocentric videos of Ego4D, we constructed a broad coverage of\nthe video-based referring expression comprehension dataset: RefEgo. Our dataset\nincludes more than 12k video clips and 41 hours for video-based referring\nexpression comprehension annotation. In experiments, we combine the\nstate-of-the-art 2D referring expression comprehension models with the object\ntracking algorithm, achieving the video-wise referred object tracking even in\ndifficult conditions: the referred object becomes out-of-frame in the middle of\nthe video or multiple similar objects are presented in the video.\n","authors":["Shuhei Kurita","Naoki Katsura","Eri Onami"],"pdf_url":"https://arxiv.org/pdf/2308.12035v1.pdf","comment":"15 pages, 11 figures. ICCV2023"},{"id":"http://arxiv.org/abs/2209.08996v2","updated":"2023-08-23T09:31:26Z","published":"2022-09-19T13:20:19Z","title":"EDO-Net: Learning Elastic Properties of Deformable Objects from Graph\n Dynamics","summary":" We study the problem of learning graph dynamics of deformable objects that\ngeneralizes to unknown physical properties. Our key insight is to leverage a\nlatent representation of elastic physical properties of cloth-like deformable\nobjects that can be extracted, for example, from a pulling interaction. In this\npaper we propose EDO-Net (Elastic Deformable Object - Net), a model of graph\ndynamics trained on a large variety of samples with different elastic\nproperties that does not rely on ground-truth labels of the properties. EDO-Net\njointly learns an adaptation module, and a forward-dynamics module. The former\nis responsible for extracting a latent representation of the physical\nproperties of the object, while the latter leverages the latent representation\nto predict future states of cloth-like objects represented as graphs. We\nevaluate EDO-Net both in simulation and real world, assessing its capabilities\nof: 1) generalizing to unknown physical properties, 2) transferring the learned\nrepresentation to new downstream tasks.\n","authors":["Alberta Longhini","Marco Moletta","Alfredo Reichlin","Michael C. Welle","David Held","Zackory Erickson","Danica Kragic"],"pdf_url":"https://arxiv.org/pdf/2209.08996v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13991v2","updated":"2023-08-23T09:27:19Z","published":"2023-02-27T17:30:00Z","title":"Learning to Generalize towards Unseen Domains via a Content-Aware Style\n Invariant Model for Disease Detection from Chest X-rays","summary":" Performance degradation due to source domain mismatch is a longstanding\nchallenge in deep learning-based medical image analysis, particularly for chest\nX-rays (CXRs). Several methods (e.g., adversarial training, multi-domain\nmixups) have been proposed to extract domain-invariant high-level features to\naddress this domain shift. However, these methods do not explicitly regularize\nthe content and style characteristics of the extracted domain-invariant\nfeatures. Recent studies have demonstrated that CNN models exhibit a strong\nbias toward styles (e.g., uninformative textures) rather than content (e.g.,\nshape), in stark contrast to the human-vision system. Radiologists tend to\nlearn visual cues from CXRs and thus perform well across multiple domains.\nTherefore, in medical imaging for pathology diagnosis from CXR images, models\nshould extract domain-invariant features that are style-invariant and\ncontent-biased. Motivated by this, we employ the novel style randomization\nmodules (SRMs) at both image and feature levels that work together\nhierarchically to create rich style perturbed features on the fly while keeping\nthe content intact. In addition, we leverage consistency regularizations\nbetween global semantic features and predicted probability distributions,\nrespectively, for with and without style perturbed versions of the same CXR\nimage to tweak the model's sensitivity toward content markers for accurate\npredictions. Extensive experiments with three large-scale thoracic disease\ndatasets, i.e., CheXpert, MIMIC-CXR, and BRAX, demonstrate that our proposed\nframework is more robust in the presence of domain shift and achieves\nstate-of-the-art performance.\n","authors":["Mohammad Zunaed","Md. Aynal Haque","Taufiq Hasan"],"pdf_url":"https://arxiv.org/pdf/2302.13991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12017v1","updated":"2023-08-23T09:20:05Z","published":"2023-08-23T09:20:05Z","title":"Distribution-Aware Calibration for Object Detection with Noisy Bounding\n Boxes","summary":" Large-scale well-annotated datasets are of great importance for training an\neffective object detector. However, obtaining accurate bounding box annotations\nis laborious and demanding. Unfortunately, the resultant noisy bounding boxes\ncould cause corrupt supervision signals and thus diminish detection\nperformance. Motivated by the observation that the real ground-truth is usually\nsituated in the aggregation region of the proposals assigned to a noisy\nground-truth, we propose DIStribution-aware CalibratiOn (DISCO) to model the\nspatial distribution of proposals for calibrating supervision signals. In\nDISCO, spatial distribution modeling is performed to statistically extract the\npotential locations of objects. Based on the modeled distribution, three\ndistribution-aware techniques, i.e., distribution-aware proposal augmentation\n(DA-Aug), distribution-aware box refinement (DA-Ref), and distribution-aware\nconfidence estimation (DA-Est), are developed to improve classification,\nlocalization, and interpretability, respectively. Extensive experiments on\nlarge-scale noisy image datasets (i.e., Pascal VOC and MS-COCO) demonstrate\nthat DISCO can achieve state-of-the-art detection performance, especially at\nhigh noise levels.\n","authors":["Donghao Zhou","Jialin Li","Jinpeng Li","Jiancheng Huang","Qiang Nie","Yong Liu","Bin-Bin Gao","Qiong Wang","Pheng-Ann Heng","Guangyong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.12017v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.08210v2","updated":"2023-08-23T09:15:20Z","published":"2023-08-16T08:28:01Z","title":"Neural Spherical Harmonics for structurally coherent continuous\n representation of diffusion MRI signal","summary":" We present a novel way to model diffusion magnetic resonance imaging (dMRI)\ndatasets, that benefits from the structural coherence of the human brain while\nonly using data from a single subject. Current methods model the dMRI signal in\nindividual voxels, disregarding the intervoxel coherence that is present. We\nuse a neural network to parameterize a spherical harmonics series (NeSH) to\nrepresent the dMRI signal of a single subject from the Human Connectome Project\ndataset, continuous in both the angular and spatial domain. The reconstructed\ndMRI signal using this method shows a more structurally coherent representation\nof the data. Noise in gradient images is removed and the fiber orientation\ndistribution functions show a smooth change in direction along a fiber tract.\nWe showcase how the reconstruction can be used to calculate mean diffusivity,\nfractional anisotropy, and total apparent fiber density. These results can be\nachieved with a single model architecture, tuning only one hyperparameter. In\nthis paper we also demonstrate how upsampling in both the angular and spatial\ndomain yields reconstructions that are on par or better than existing methods.\n","authors":["Tom Hendriks","Anna Vilanova","Maxime Chamberland"],"pdf_url":"https://arxiv.org/pdf/2308.08210v2.pdf","comment":"12 pages, 6 figures, accepted for cdMRI workshop at MICCAI 2023\n Updated to fix typo in author name (Villanova -> Vilanova)"},{"id":"http://arxiv.org/abs/2308.12009v1","updated":"2023-08-23T09:02:01Z","published":"2023-08-23T09:02:01Z","title":"StofNet: Super-resolution Time of Flight Network","summary":" Time of Flight (ToF) is a prevalent depth sensing technology in the fields of\nrobotics, medical imaging, and non-destructive testing. Yet, ToF sensing faces\nchallenges from complex ambient conditions making an inverse modelling from the\nsparse temporal information intractable. This paper highlights the potential of\nmodern super-resolution techniques to learn varying surroundings for a reliable\nand accurate ToF detection. Unlike existing models, we tailor an architecture\nfor sub-sample precise semi-global signal localization by combining\nsuper-resolution with an efficient residual contraction block to balance\nbetween fine signal details and large scale contextual information. We\nconsolidate research on ToF by conducting a benchmark comparison against six\nstate-of-the-art methods for which we employ two publicly available datasets.\nThis includes the release of our SToF-Chirp dataset captured by an airborne\nultrasound transducer. Results showcase the superior performance of our\nproposed StofNet in terms of precision, reliability and model complexity. Our\ncode is available at https://github.com/hahnec/stofnet.\n","authors":["Christopher Hahne","Michel Hayoz","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2308.12009v1.pdf","comment":"pre-print"},{"id":"http://arxiv.org/abs/2308.10522v3","updated":"2023-08-23T08:49:54Z","published":"2023-08-21T07:19:47Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning aims to capture comprehensive information\nfrom multiple views of a shared context. Recent works intuitively apply\ncontrastive learning to different views in a pairwise manner, which is still\nscalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works study the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the perspective of information theory and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided hierarchical Progressive\nMulti-view Coding (IPMC). In the distribution-tier, IPMC aligns the\ndistribution between views to reduce view-specific noise. In the set-tier, IPMC\nconstructs self-adjusted contrasting pools, which are adaptively modified by a\nview filter. Lastly, in the instance-tier, we adopt a designed unified loss to\nlearn representations and reduce the gradient interference. Theoretically and\nempirically, we demonstrate the superiority of IPMC over state-of-the-art\nmethods.\n","authors":["Jiangmeng Li","Hang Gao","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10522v3.pdf","comment":"This paper is accepted by the jourcal of Neural Networks (Elsevier)\n by 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344"},{"id":"http://arxiv.org/abs/2308.12006v1","updated":"2023-08-23T08:49:43Z","published":"2023-08-23T08:49:43Z","title":"Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action\n and Gesture Recognition","summary":" RGB-D action and gesture recognition remain an interesting topic in\nhuman-centered scene understanding, primarily due to the multiple granularities\nand large variation in human motion. Although many RGB-D based action and\ngesture recognition approaches have demonstrated remarkable results by\nutilizing highly integrated spatio-temporal representations across multiple\nmodalities (i.e., RGB and depth data), they still encounter several challenges.\nFirstly, vanilla 3D convolution makes it hard to capture fine-grained motion\ndifferences between local clips under different modalities. Secondly, the\nintricate nature of highly integrated spatio-temporal modeling can lead to\noptimization difficulties. Thirdly, duplicate and unnecessary information can\nadd complexity and complicate entangled spatio-temporal modeling. To address\nthe above issues, we propose an innovative heuristic architecture called\nMulti-stage Factorized Spatio-Temporal (MFST) for RGB-D action and gesture\nrecognition. The proposed MFST model comprises a 3D Central Difference\nConvolution Stem (CDC-Stem) module and multiple factorized spatio-temporal\nstages. The CDC-Stem enriches fine-grained temporal perception, and the\nmultiple hierarchical spatio-temporal stages construct dimension-independent\nhigher-order semantic primitives. Specifically, the CDC-Stem module captures\nbottom-level spatio-temporal features and passes them successively to the\nfollowing spatio-temporal factored stages to capture the hierarchical spatial\nand temporal features through the Multi- Scale Convolution and Transformer\n(MSC-Trans) hybrid block and Weight-shared Multi-Scale Transformer (WMS-Trans)\nblock. The seamless integration of these innovative designs results in a robust\nspatio-temporal representation that outperforms state-of-the-art approaches on\nRGB-D action and gesture recognition datasets.\n","authors":["Yujun Ma","Benjia Zhou","Ruili Wang","Pichao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12006v1.pdf","comment":"ACM MM'23 has accepted this paper"},{"id":"http://arxiv.org/abs/2308.12001v1","updated":"2023-08-23T08:41:21Z","published":"2023-08-23T08:41:21Z","title":"Local Distortion Aware Efficient Transformer Adaptation for Image\n Quality Assessment","summary":" Image Quality Assessment (IQA) constitutes a fundamental task within the\nfield of computer vision, yet it remains an unresolved challenge, owing to the\nintricate distortion conditions, diverse image contents, and limited\navailability of data. Recently, the community has witnessed the emergence of\nnumerous large-scale pretrained foundation models, which greatly benefit from\ndramatically increased data and parameter capacities. However, it remains an\nopen problem whether the scaling law in high-level tasks is also applicable to\nIQA task which is closely related to low-level clues. In this paper, we\ndemonstrate that with proper injection of local distortion features, a larger\npretrained and fixed foundation model performs better in IQA tasks.\nSpecifically, for the lack of local distortion structure and inductive bias of\nvision transformer (ViT), alongside the large-scale pretrained ViT, we use\nanother pretrained convolution neural network (CNN), which is well known for\ncapturing the local structure, to extract multi-scale image features. Further,\nwe propose a local distortion extractor to obtain local distortion features\nfrom the pretrained CNN and a local distortion injector to inject the local\ndistortion features into ViT. By only training the extractor and injector, our\nmethod can benefit from the rich knowledge in the powerful foundation models\nand achieve state-of-the-art performance on popular IQA datasets, indicating\nthat IQA is not only a low-level problem but also benefits from stronger\nhigh-level features drawn from large-scale pretrained models.\n","authors":["Kangmin Xu","Liang Liao","Jing Xiao","Chaofeng Chen","Haoning Wu","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2308.12001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11994v1","updated":"2023-08-23T08:29:10Z","published":"2023-08-23T08:29:10Z","title":"Progressive Feature Mining and External Knowledge-Assisted\n Text-Pedestrian Image Retrieval","summary":" Text-Pedestrian Image Retrieval aims to use the text describing pedestrian\nappearance to retrieve the corresponding pedestrian image. This task involves\nnot only modality discrepancy, but also the challenge of the textual diversity\nof pedestrians with the same identity. At present, although existing research\nprogress has been made in text-pedestrian image retrieval, these methods do not\ncomprehensively consider the above-mentioned problems. Considering these, this\npaper proposes a progressive feature mining and external knowledge-assisted\nfeature purification method. Specifically, we use a progressive mining mode to\nenable the model to mine discriminative features from neglected information,\nthereby avoiding the loss of discriminative information and improving the\nexpression ability of features. In addition, to further reduce the negative\nimpact of modal discrepancy and text diversity on cross-modal matching, we\npropose to use other sample knowledge of the same modality, i.e., external\nknowledge to enhance identity-consistent features and weaken\nidentity-inconsistent features. This process purifies features and alleviates\nthe interference caused by textual diversity and negative sample correlation\nfeatures of the same modal. Extensive experiments on three challenging datasets\ndemonstrate the effectiveness and superiority of the proposed method, and the\nretrieval performance even surpasses that of the large-scale model-based method\non large-scale datasets.\n","authors":["Huafeng Li","Shedan Yang","Yafei Zhang","Dapeng Tao","Zhengtao Yu"],"pdf_url":"https://arxiv.org/pdf/2308.11994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11990v1","updated":"2023-08-23T08:25:30Z","published":"2023-08-23T08:25:30Z","title":"RankMixup: Ranking-Based Mixup Training for Network Calibration","summary":" Network calibration aims to accurately estimate the level of confidences,\nwhich is particularly important for employing deep neural networks in\nreal-world systems. Recent approaches leverage mixup to calibrate the network's\npredictions during training. However, they do not consider the problem that\nmixtures of labels in mixup may not accurately represent the actual\ndistribution of augmented samples. In this paper, we present RankMixup, a novel\nmixup-based framework alleviating the problem of the mixture of labels for\nnetwork calibration. To this end, we propose to use an ordinal ranking\nrelationship between raw and mixup-augmented samples as an alternative\nsupervisory signal to the label mixtures for network calibration. We\nhypothesize that the network should estimate a higher level of confidence for\nthe raw samples than the augmented ones (Fig.1). To implement this idea, we\nintroduce a mixup-based ranking loss (MRL) that encourages lower confidences\nfor augmented samples compared to raw ones, maintaining the ranking\nrelationship. We also propose to leverage the ranking relationship among\nmultiple mixup-augmented samples to further improve the calibration capability.\nAugmented samples with larger mixing coefficients are expected to have higher\nconfidences and vice versa (Fig.1). That is, the order of confidences should be\naligned with that of mixing coefficients. To this end, we introduce a novel\nloss, M-NDCG, in order to reduce the number of misaligned pairs of the\ncoefficients and confidences. Extensive experimental results on standard\nbenchmarks for network calibration demonstrate the effectiveness of RankMixup.\n","authors":["Jongyoun Noh","Hyekang Park","Junghyup Lee","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2308.11990v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11983v1","updated":"2023-08-23T08:15:15Z","published":"2023-08-23T08:15:15Z","title":"Multi-Modal Multi-Task (3MT) Road Segmentation","summary":" Multi-modal systems have the capacity of producing more reliable results than\nsystems with a single modality in road detection due to perceiving different\naspects of the scene. We focus on using raw sensor inputs instead of, as it is\ntypically done in many SOTA works, leveraging architectures that require high\npre-processing costs such as surface normals or dense depth predictions. By\nusing raw sensor inputs, we aim to utilize a low-cost model thatminimizes both\nthe pre-processing andmodel computation costs. This study presents a\ncost-effective and highly accurate solution for road segmentation by\nintegrating data from multiple sensorswithin a multi-task learning\narchitecture.Afusion architecture is proposed in which RGB and LiDAR depth\nimages constitute the inputs of the network. Another contribution of this study\nis to use IMU/GNSS (inertial measurement unit/global navigation satellite\nsystem) inertial navigation system whose data is collected synchronously and\ncalibrated with a LiDAR-camera to compute aggregated dense LiDAR depth images.\nIt has been demonstrated by experiments on the KITTI dataset that the proposed\nmethod offers fast and high-performance solutions. We have also shown the\nperformance of our method on Cityscapes where raw LiDAR data is not available.\nThe segmentation results obtained for both full and half resolution images are\ncompetitive with existing methods. Therefore, we conclude that our method is\nnot dependent only on raw LiDAR data; rather, it can be used with different\nsensor modalities. The inference times obtained in all experiments are very\npromising for real-time experiments.\n","authors":["Erkan Milli","Özgür Erkent","Asım Egemen Yılmaz"],"pdf_url":"https://arxiv.org/pdf/2308.11983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11979v1","updated":"2023-08-23T07:58:20Z","published":"2023-08-23T07:58:20Z","title":"Rotation-Invariant Completion Network","summary":" Real-world point clouds usually suffer from incompleteness and display\ndifferent poses. While current point cloud completion methods excel in\nreproducing complete point clouds with consistent poses as seen in the training\nset, their performance tends to be unsatisfactory when handling point clouds\nwith diverse poses. We propose a network named Rotation-Invariant Completion\nNetwork (RICNet), which consists of two parts: a Dual Pipeline Completion\nNetwork (DPCNet) and an enhancing module. Firstly, DPCNet generates a coarse\ncomplete point cloud. The feature extraction module of DPCNet can extract\nconsistent features, no matter if the input point cloud has undergone rotation\nor translation. Subsequently, the enhancing module refines the fine-grained\ndetails of the final generated point cloud. RICNet achieves better rotation\ninvariance in feature extraction and incorporates structural relationships in\nman-made objects. To assess the performance of RICNet and existing methods on\npoint clouds with various poses, we applied random transformations to the point\nclouds in the MVP dataset and conducted experiments on them. Our experiments\ndemonstrate that RICNet exhibits superior completion performance compared to\nexisting methods.\n","authors":["Yu Chen","Pengcheng Shi"],"pdf_url":"https://arxiv.org/pdf/2308.11979v1.pdf","comment":"12 pages, accepted to PRCV 2023 (The 6th Chinese Conference on\n Pattern Recognition and Computer Vision)"},{"id":"http://arxiv.org/abs/2308.11974v1","updated":"2023-08-23T07:46:44Z","published":"2023-08-23T07:46:44Z","title":"Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields","summary":" Text-driven localized editing of 3D objects is particularly difficult as\nlocally mixing the original 3D object with the intended new object and style\neffects without distorting the object's form is not a straightforward process.\nTo address this issue, we propose a novel NeRF-based model, Blending-NeRF,\nwhich consists of two NeRF networks: pretrained NeRF and editable NeRF.\nAdditionally, we introduce new blending operations that allow Blending-NeRF to\nproperly edit target regions which are localized by text. By using a pretrained\nvision-language aligned model, CLIP, we guide Blending-NeRF to add new objects\nwith varying colors and densities, modify textures, and remove parts of the\noriginal object. Our extensive experiments demonstrate that Blending-NeRF\nproduces naturally and locally edited 3D objects from various text prompts.\n","authors":["Hyeonseop Song","Seokhun Choi","Hoseok Do","Chul Lee","Taehyeong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11974v1.pdf","comment":"Accepted to ICCV 2023. The first two authors contributed equally to\n this work"},{"id":"http://arxiv.org/abs/2308.11971v1","updated":"2023-08-23T07:36:30Z","published":"2023-08-23T07:36:30Z","title":"EVE: Efficient Vision-Language Pre-training with Masked Prediction and\n Modality-Aware MoE","summary":" Building scalable vision-language models to learn from diverse, multimodal\ndata remains an open challenge. In this paper, we introduce an Efficient\nVision-languagE foundation model, namely EVE, which is one unified multimodal\nTransformer pre-trained solely by one unified pre-training task. Specifically,\nEVE encodes both vision and language within a shared Transformer network\nintegrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which\ncapture modality-specific information by selectively switching to different\nexperts. To unify pre-training tasks of vision and language, EVE performs\nmasked signal modeling on image-text pairs to reconstruct masked signals, i.e.,\nimage pixels and text tokens, given visible signals. This simple yet effective\npre-training objective accelerates training by 3.5x compared to the model\npre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing\nto the combination of the unified architecture and pre-training task, EVE is\neasy to scale up, enabling better downstream performance with fewer resources\nand faster training speed. Despite its simplicity, EVE achieves\nstate-of-the-art performance on various vision-language downstream tasks,\nincluding visual question answering, visual reasoning, and image-text\nretrieval.\n","authors":["Junyi Chen","Longteng Guo","Jia Sun","Shuai Shao","Zehuan Yuan","Liang Lin","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11969v1","updated":"2023-08-23T07:30:16Z","published":"2023-08-23T07:30:16Z","title":"Anisotropic Hybrid Networks for liver tumor segmentation with\n uncertainty quantification","summary":" The burden of liver tumors is important, ranking as the fourth leading cause\nof cancer mortality. In case of hepatocellular carcinoma (HCC), the delineation\nof liver and tumor on contrast-enhanced magnetic resonance imaging (CE-MRI) is\nperformed to guide the treatment strategy. As this task is time-consuming,\nneeds high expertise and could be subject to inter-observer variability there\nis a strong need for automatic tools. However, challenges arise from the lack\nof available training data, as well as the high variability in terms of image\nresolution and MRI sequence. In this work we propose to compare two different\npipelines based on anisotropic models to obtain the segmentation of the liver\nand tumors. The first pipeline corresponds to a baseline multi-class model that\nperforms the simultaneous segmentation of the liver and tumor classes. In the\nsecond approach, we train two distinct binary models, one segmenting the liver\nonly and the other the tumors. Our results show that both pipelines exhibit\ndifferent strengths and weaknesses. Moreover we propose an uncertainty\nquantification strategy allowing the identification of potential false positive\ntumor lesions. Both solutions were submitted to the MICCAI 2023 Atlas challenge\nregarding liver and tumor segmentation.\n","authors":["Benjamin Lambert","Pauline Roca","Florence Forbes","Senan Doyle","Michel Dojat"],"pdf_url":"https://arxiv.org/pdf/2308.11969v1.pdf","comment":"Accepted for presentation at MICCAI Workshop on 2nd\n Resource-Efficient Medical Image Analysis (REMIA)"},{"id":"http://arxiv.org/abs/2301.09091v2","updated":"2023-08-23T07:23:17Z","published":"2023-01-22T10:17:02Z","title":"BallGAN: 3D-aware Image Synthesis with a Spherical Background","summary":" 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be\nrendered in arbitrary perspectives to produce images. Although previous methods\nproduce realistic images, they suffer from unstable training or degenerate\nsolutions where the 3D geometry is unnatural. We hypothesize that the 3D\ngeometry is underdetermined due to the insufficient constraint, i.e., being\nclassified as real image to the discriminator is not enough. To solve this\nproblem, we propose to approximate the background as a spherical surface and\nrepresent a scene as a union of the foreground placed in the sphere and the\nthin spherical background. It reduces the degree of freedom in the background\nfield. Accordingly, we modify the volume rendering equation and incorporate\ndedicated constraints to design a novel 3D-aware GAN framework named BallGAN.\nBallGAN has multiple advantages as follows. 1) It produces more reasonable 3D\ngeometry; the images of a scene across different viewpoints have better\nphotometric consistency and fidelity than the state-of-the-art methods. 2) The\ntraining becomes much more stable. 3) The foreground can be separately rendered\non top of different arbitrary backgrounds.\n","authors":["Minjung Shin","Yunji Seo","Jeongmin Bae","Young Sun Choi","Hyunsu Kim","Hyeran Byun","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2301.09091v2.pdf","comment":"ICCV 2023, Project Page: https://minjung-s.github.io/ballgan"},{"id":"http://arxiv.org/abs/2308.11951v1","updated":"2023-08-23T06:49:07Z","published":"2023-08-23T06:49:07Z","title":"Pose Modulated Avatars from Video","summary":" It is now possible to reconstruct dynamic human motion and shape from a\nsparse set of cameras using Neural Radiance Fields (NeRF) driven by an\nunderlying skeleton. However, a challenge remains to model the deformation of\ncloth and skin in relation to skeleton pose. Unlike existing avatar models that\nare learned implicitly or rely on a proxy surface, our approach is motivated by\nthe observation that different poses necessitate unique frequency assignments.\nNeglecting this distinction yields noisy artifacts in smooth areas or blurs\nfine-grained texture and shape details in sharp regions. We develop a\ntwo-branch neural network that is adaptive and explicit in the frequency\ndomain. The first branch is a graph neural network that models correlations\namong body parts locally, taking skeleton pose as input. The second branch\ncombines these correlation features to a set of global frequencies and then\nmodulates the feature encoding. Our experiments demonstrate that our network\noutperforms state-of-the-art methods in terms of preserving details and\ngeneralization capabilities.\n","authors":["Chunjin Song","Bastian Wandt","Helge Rhodin"],"pdf_url":"https://arxiv.org/pdf/2308.11951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11949v1","updated":"2023-08-23T06:45:11Z","published":"2023-08-23T06:45:11Z","title":"High-quality Image Dehazing with Diffusion Model","summary":" Image dehazing is quite challenging in dense-haze scenarios, where quite less\noriginal information remains in the hazy image. Though previous methods have\nmade marvelous progress, they still suffer from information loss in content and\ncolor in dense-haze scenarios. The recently emerged Denoising Diffusion\nProbabilistic Model (DDPM) exhibits strong generation ability, showing\npotential for solving this problem. However, DDPM fails to consider the physics\nproperty of dehazing task, limiting its information completion capacity. In\nthis work, we propose DehazeDDPM: A DDPM-based and physics-aware image dehazing\nframework that applies to complex hazy scenarios. Specifically, DehazeDDPM\nworks in two stages. The former stage physically models the dehazing task with\nthe Atmospheric Scattering Model (ASM), pulling the distribution closer to the\nclear data and endowing DehazeDDPM with fog-aware ability. The latter stage\nexploits the strong generation ability of DDPM to compensate for the\nhaze-induced huge information loss, by working in conjunction with the physical\nmodelling. Extensive experiments demonstrate that our method attains\nstate-of-the-art performance on both synthetic and real-world hazy datasets.\n","authors":["Hu Yu","Jie Huang","Kaiwen Zheng","Man Zhou","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11948v1","updated":"2023-08-23T06:44:44Z","published":"2023-08-23T06:44:44Z","title":"Efficient Transfer Learning in Diffusion Models via Adversarial Noise","summary":" Diffusion Probabilistic Models (DPMs) have demonstrated substantial promise\nin image generation tasks but heavily rely on the availability of large amounts\nof training data. Previous works, like GANs, have tackled the limited data\nproblem by transferring pre-trained models learned with sufficient data.\nHowever, those methods are hard to be utilized in DPMs since the distinct\ndifferences between DPM-based and GAN-based methods, showing in the unique\niterative denoising process integral and the need for many timesteps with\nno-targeted noise in DPMs. In this paper, we propose a novel DPMs-based\ntransfer learning method, TAN, to address the limited data problem. It includes\ntwo strategies: similarity-guided training, which boosts transfer with a\nclassifier, and adversarial noise selection which adaptive chooses targeted\nnoise based on the input image. Extensive experiments in the context of\nfew-shot image generation tasks demonstrate that our method is not only\nefficient but also excels in terms of image quality and diversity when compared\nto existing GAN-based and DDPM-based methods.\n","authors":["Xiyu Wang","Baijiong Lin","Daochang Liu","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.11948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10632v2","updated":"2023-08-23T06:41:42Z","published":"2023-08-21T11:07:27Z","title":"Foundation Model-oriented Robustness: Robust Image Model Evaluation with\n Pretrained Models","summary":" Machine learning has demonstrated remarkable performance over finite\ndatasets, yet whether the scores over the fixed benchmarks can sufficiently\nindicate the model's performance in the real world is still in discussion. In\nreality, an ideal robust model will probably behave similarly to the oracle\n(e.g., the human users), thus a good evaluation protocol is probably to\nevaluate the models' behaviors in comparison to the oracle. In this paper, we\nintroduce a new robustness measurement that directly measures the image\nclassification model's performance compared with a surrogate oracle (i.e., a\nfoundation model). Besides, we design a simple method that can accomplish the\nevaluation beyond the scope of the benchmarks. Our method extends the image\ndatasets with new samples that are sufficiently perturbed to be distinct from\nthe ones in the original sets, but are still bounded within the same\nimage-label structure the original test image represents, constrained by a\nfoundation model pretrained with a large amount of samples. As a result, our\nnew method will offer us a new way to evaluate the models' robustness\nperformance, free of limitations of fixed benchmarks or constrained\nperturbations, although scoped by the power of the oracle. In addition to the\nevaluation results, we also leverage our generated data to understand the\nbehaviors of the model and our new evaluation strategies.\n","authors":["Peiyan Zhang","Haoyang Liu","Chaozhuo Li","Xing Xie","Sunghun Kim","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11945v1","updated":"2023-08-23T06:37:41Z","published":"2023-08-23T06:37:41Z","title":"LongDanceDiff: Long-term Dance Generation with Conditional Diffusion\n Model","summary":" Dancing with music is always an essential human art form to express emotion.\nDue to the high temporal-spacial complexity, long-term 3D realist dance\ngeneration synchronized with music is challenging. Existing methods suffer from\nthe freezing problem when generating long-term dances due to error accumulation\nand training-inference discrepancy. To address this, we design a conditional\ndiffusion model, LongDanceDiff, for this sequence-to-sequence long-term dance\ngeneration, addressing the challenges of temporal coherency and spatial\nconstraint. LongDanceDiff contains a transformer-based diffusion model, where\nthe input is a concatenation of music, past motions, and noised future motions.\nThis partial noising strategy leverages the full-attention mechanism and learns\nthe dependencies among music and past motions. To enhance the diversity of\ngenerated dance motions and mitigate the freezing problem, we introduce a\nmutual information minimization objective that regularizes the dependency\nbetween past and future motions. We also address common visual quality issues\nin dance generation, such as foot sliding and unsmooth motion, by incorporating\nspatial constraints through a Global-Trajectory Modulation (GTM) layer and\nmotion perceptual losses, thereby improving the smoothness and naturalness of\nmotion generation. Extensive experiments demonstrate a significant improvement\nin our approach over the existing state-of-the-art methods. We plan to release\nour codes and models soon.\n","authors":["Siqi Yang","Zejun Yang","Zhisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01095v2","updated":"2023-08-23T06:26:56Z","published":"2023-08-02T11:58:43Z","title":"AutoPoster: A Highly Automatic and Content-aware Design System for\n Advertising Poster Generation","summary":" Advertising posters, a form of information presentation, combine visual and\nlinguistic modalities. Creating a poster involves multiple steps and\nnecessitates design experience and creativity. This paper introduces\nAutoPoster, a highly automatic and content-aware system for generating\nadvertising posters. With only product images and titles as inputs, AutoPoster\ncan automatically produce posters of varying sizes through four key stages:\nimage cleaning and retargeting, layout generation, tagline generation, and\nstyle attribute prediction. To ensure visual harmony of posters, two\ncontent-aware models are incorporated for layout and tagline generation.\nMoreover, we propose a novel multi-task Style Attribute Predictor (SAP) to\njointly predict visual style attributes. Meanwhile, to our knowledge, we\npropose the first poster generation dataset that includes visual attribute\nannotations for over 76k posters. Qualitative and quantitative outcomes from\nuser studies and experiments substantiate the efficacy of our system and the\naesthetic superiority of the generated posters compared to other poster\ngeneration methods.\n","authors":["Jinpeng Lin","Min Zhou","Ye Ma","Yifan Gao","Chenxi Fei","Yangjian Chen","Zhang Yu","Tiezheng Ge"],"pdf_url":"https://arxiv.org/pdf/2308.01095v2.pdf","comment":"Accepted for ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.11941v1","updated":"2023-08-23T06:22:02Z","published":"2023-08-23T06:22:02Z","title":"Boosting Diffusion Models with an Adaptive Momentum Sampler","summary":" Diffusion probabilistic models (DPMs) have been shown to generate\nhigh-quality images without the need for delicate adversarial training.\nHowever, the current sampling process in DPMs is prone to violent shaking. In\nthis paper, we present a novel reverse sampler for DPMs inspired by the\nwidely-used Adam optimizer. Our proposed sampler can be readily applied to a\npre-trained diffusion model, utilizing momentum mechanisms and adaptive\nupdating to smooth the reverse sampling process and ensure stable generation,\nresulting in outputs of enhanced quality. By implicitly reusing update\ndirections from early steps, our proposed sampler achieves a better balance\nbetween high-level semantics and low-level details. Additionally, this sampler\nis flexible and can be easily integrated into pre-trained DPMs regardless of\nthe sampler used during training. Our experimental results on multiple\nbenchmarks demonstrate that our proposed reverse sampler yields remarkable\nimprovements over different baselines. We will make the source code available.\n","authors":["Xiyu Wang","Anh-Dung Dinh","Daochang Liu","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.11941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09917v2","updated":"2023-08-23T06:19:28Z","published":"2023-08-19T05:49:13Z","title":"Learning Multiscale Consistency for Self-supervised Electron Microscopy\n Instance Segmentation","summary":" Instance segmentation in electron microscopy (EM) volumes poses a significant\nchallenge due to the complex morphology of instances and insufficient\nannotations. Self-supervised learning has recently emerged as a promising\nsolution, enabling the acquisition of prior knowledge of cellular tissue\nstructures that are essential for EM instance segmentation. However, existing\npretraining methods often lack the ability to capture complex visual patterns\nand relationships between voxels, which results in the acquired prior knowledge\nbeing insufficient for downstream EM analysis tasks. In this paper, we propose\na novel pretraining framework that leverages multiscale visual representations\nto capture both voxel-level and feature-level consistency in EM volumes.\nSpecifically, our framework enforces voxel-level consistency between the\noutputs of a Siamese network by a reconstruction function, and incorporates a\ncross-attention mechanism for soft feature matching to achieve fine-grained\nfeature-level consistency. Moreover, we propose a contrastive learning scheme\non the feature pyramid to extract discriminative features across multiple\nscales. We extensively pretrain our method on four large-scale EM datasets,\nachieving promising performance improvements in representative tasks of neuron\nand mitochondria instance segmentation.\n","authors":["Yinda Chen","Wei Huang","Xiaoyu Liu","Qi Chen","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.09917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11937v1","updated":"2023-08-23T06:07:56Z","published":"2023-08-23T06:07:56Z","title":"Learning Bottleneck Transformer for Event Image-Voxel Feature Fusion\n based Classification","summary":" Recognizing target objects using an event-based camera draws more and more\nattention in recent years. Existing works usually represent the event streams\ninto point-cloud, voxel, image, etc, and learn the feature representations\nusing various deep neural networks. Their final results may be limited by the\nfollowing factors: monotonous modal expressions and the design of the network\nstructure. To address the aforementioned challenges, this paper proposes a\nnovel dual-stream framework for event representation, extraction, and fusion.\nThis framework simultaneously models two common representations: event images\nand event voxels. By utilizing Transformer and Structured Graph Neural Network\n(GNN) architectures, spatial information and three-dimensional stereo\ninformation can be learned separately. Additionally, a bottleneck Transformer\nis introduced to facilitate the fusion of the dual-stream information.\nExtensive experiments demonstrate that our proposed framework achieves\nstate-of-the-art performance on two widely used event-based classification\ndatasets. The source code of this work is available at:\n\\url{https://github.com/Event-AHU/EFV_event_classification}\n","authors":["Chengguo Yuan","Yu Jin","Zongzhen Wu","Fanting Wei","Yangzirui Wang","Lan Chen","Xiao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11937v1.pdf","comment":"Accepted by PRCV-2023"},{"id":"http://arxiv.org/abs/2303.15749v2","updated":"2023-08-23T06:04:56Z","published":"2023-03-28T06:12:53Z","title":"Iteratively Coupled Multiple Instance Learning from Instance to Bag\n Classifier for Whole Slide Image Classification","summary":" Whole Slide Image (WSI) classification remains a challenge due to their\nextremely high resolution and the absence of fine-grained labels. Presently,\nWSI classification is usually regarded as a Multiple Instance Learning (MIL)\nproblem when only slide-level labels are available. MIL methods involve a patch\nembedding module and a bag-level classification module, but they are\nprohibitively expensive to be trained in an end-to-end manner. Therefore,\nexisting methods usually train them separately, or directly skip the training\nof the embedder. Such schemes hinder the patch embedder's access to slide-level\nsemantic labels, resulting in inconsistency within the entire MIL pipeline. To\novercome this issue, we propose a novel framework called Iteratively Coupled\nMIL (ICMIL), which bridges the loss back-propagation process from the bag-level\nclassifier to the patch embedder. In ICMIL, we use category information in the\nbag-level classifier to guide the patch-level fine-tuning of the patch feature\nextractor. The refined embedder then generates better instance representations\nfor achieving a more accurate bag-level classifier. By coupling the patch\nembedder and bag classifier at a low cost, our proposed framework enables\ninformation exchange between the two modules, benefiting the entire MIL\nclassification model. We tested our framework on two datasets using three\ndifferent backbones, and our experimental results demonstrate consistent\nperformance improvements over state-of-the-art MIL methods. The code is\navailable at: https://github.com/Dootmaan/ICMIL.\n","authors":["Hongyi Wang","Luyang Luo","Fang Wang","Ruofeng Tong","Yen-Wei Chen","Hongjie Hu","Lanfen Lin","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2303.15749v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11561v2","updated":"2023-08-23T05:53:43Z","published":"2023-08-22T16:45:35Z","title":"Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog\n Navigation","summary":" This report details the method of the winning entry of the AVDN Challenge in\nICCV 2023. The competition addresses the Aerial Navigation from Dialog History\n(ANDH) task, which requires a drone agent to associate dialog history with\naerial observations to reach the destination. For better cross-modal grounding\nabilities of the drone agent, we propose a Target-Grounded Graph-Aware\nTransformer (TG-GAT) framework. Concretely, TG-GAT first leverages a\ngraph-aware transformer to capture spatiotemporal dependency, which benefits\nnavigation state tracking and robust action planning. In addition, an auxiliary\nvisual grounding task is devised to boost the agent's awareness of referred\nlandmarks. Moreover, a hybrid augmentation strategy based on large language\nmodels is utilized to mitigate data scarcity limitations. Our TG-GAT framework\nwon the AVDN Challenge 2023, with 2.2% and 3.0% absolute improvements over the\nbaseline on SPL and SR metrics, respectively. The code is available at\nhttps://github.com/yifeisu/avdn-challenge.\n","authors":["Yifei Su","Dong An","Yuan Xu","Kehan Chen","Yan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11298v2","updated":"2023-08-23T05:44:57Z","published":"2023-08-22T09:20:55Z","title":"BHSD: A 3D Multi-Class Brain Hemorrhage Segmentation Dataset","summary":" Intracranial hemorrhage (ICH) is a pathological condition characterized by\nbleeding inside the skull or brain, which can be attributed to various factors.\nIdentifying, localizing and quantifying ICH has important clinical\nimplications, in a bleed-dependent manner. While deep learning techniques are\nwidely used in medical image segmentation and have been applied to the ICH\nsegmentation task, existing public ICH datasets do not support the multi-class\nsegmentation problem. To address this, we develop the Brain Hemorrhage\nSegmentation Dataset (BHSD), which provides a 3D multi-class ICH dataset\ncontaining 192 volumes with pixel-level annotations and 2200 volumes with\nslice-level annotations across five categories of ICH. To demonstrate the\nutility of the dataset, we formulate a series of supervised and semi-supervised\nICH segmentation tasks. We provide experimental results with state-of-the-art\nmodels as reference benchmarks for further model developments and evaluations\non this dataset.\n","authors":["Biao Wu","Yutong Xie","Zeyu Zhang","Jinchao Ge","Kaspar Yaxley","Suzan Bahadir","Qi Wu","Yifan Liu","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2308.11298v2.pdf","comment":"Accepted by MLMI 2023"},{"id":"http://arxiv.org/abs/2308.11932v1","updated":"2023-08-23T05:40:55Z","published":"2023-08-23T05:40:55Z","title":"Synergistic Multiscale Detail Refinement via Intrinsic Supervision for\n Underwater Image Enhancement","summary":" Visual restoration of underwater scenes is crucial for visual tasks, and\navoiding interference from underwater media has become a prominent concern. In\nthis work, we present a synergistic multiscale detail refinement via intrinsic\nsupervision (SMDR-IS) to recover underwater scene details. The low-degradation\nstage provides multiscale detail for original stage, which achieves synergistic\nmultiscale detail refinement through feature propagation via the adaptive\nselective intrinsic supervised feature module (ASISF), which achieves\nsynergistic multiscale detail refinement. ASISF is developed using intrinsic\nsupervision to precisely control and guide feature transmission in the\nmulti-degradation stages. ASISF improves the multiscale detail refinement while\nreducing interference from irrelevant scene information from the\nlow-degradation stage. Additionally, within the multi-degradation\nencoder-decoder of SMDR-IS, we introduce a bifocal intrinsic-context attention\nmodule (BICA). This module is designed to effectively leverage multi-scale\nscene information found in images, using intrinsic supervision principles as\nits foundation. BICA facilitates the guidance of higher-resolution spaces by\nleveraging lower-resolution spaces, considering the significant dependency of\nunderwater image restoration on spatial contextual relationships. During the\ntraining process, the network gains advantages from the integration of a\nmulti-degradation loss function. This function serves as a constraint, enabling\nthe network to effectively exploit information across various scales. When\ncompared with state-of-the-art methods, SMDR-IS demonstrates its outstanding\nperformance. Code will be made publicly available.\n","authors":["Dehuan Zhang","Jingchun Zhou","Weishi Zhang","ChunLe Guo","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.11932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16016v2","updated":"2023-08-23T05:39:05Z","published":"2023-06-28T08:44:00Z","title":"Positive Label Is All You Need for Multi-Label Classification","summary":" Multi-label classification (MLC) suffers from the inevitable label noise in\ntraining data due to the difficulty in annotating various semantic labels in\neach image. To mitigate the influence of noisy labels, existing methods mainly\ndevote to identifying and correcting the label mistakes via a trained MLC\nmodel. However, these methods still involve annoying noisy labels in training,\nwhich can result in imprecise recognition of noisy labels and weaken the\nperformance. In this paper, considering that the negative labels are\nsubstantially more than positive labels, and most noisy labels are from the\nnegative labels, we directly discard all the negative labels in the dataset,\nand propose a new method dubbed positive and unlabeled multi-label\nclassification (PU-MLC). By extending positive-unlabeled learning into MLC\ntask, our method trains model with only positive labels and unlabeled data, and\nintroduces adaptive re-balance factor and adaptive temperature coefficient in\nthe loss function to alleviate the catastrophic imbalance in label distribution\nand over-smoothing of probabilities in training. Furthermore, to capture both\nlocal and global dependencies in the image, we also introduce a local-global\nconvolution module, which supplements global information into existing\nconvolution layers with no retraining of backbone required. Our PU-MLC is\nsimple and effective, and it is applicable to both MLC and MLC with partial\nlabels (MLC-PL) tasks. Extensive experiments on MS-COCO and PASCAL VOC datasets\ndemonstrate that our PU-MLC achieves significantly improvements on both MLC and\nMLC-PL settings with even fewer annotations. Code will be released.\n","authors":["Zhixiang Yuan","Kaixin Zhang","Tao Huang"],"pdf_url":"https://arxiv.org/pdf/2306.16016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11928v1","updated":"2023-08-23T05:32:24Z","published":"2023-08-23T05:32:24Z","title":"OFVL-MS: Once for Visual Localization across Multiple Indoor Scenes","summary":" In this work, we seek to predict camera poses across scenes with a multi-task\nlearning manner, where we view the localization of each scene as a new task. We\npropose OFVL-MS, a unified framework that dispenses with the traditional\npractice of training a model for each individual scene and relieves gradient\nconflict induced by optimizing multiple scenes collectively, enabling efficient\nstorage yet precise visual localization for all scenes. Technically, in the\nforward pass of OFVL-MS, we design a layer-adaptive sharing policy with a\nlearnable score for each layer to automatically determine whether the layer is\nshared or not. Such sharing policy empowers us to acquire task-shared\nparameters for a reduction of storage cost and task-specific parameters for\nlearning scene-related features to alleviate gradient conflict. In the backward\npass of OFVL-MS, we introduce a gradient normalization algorithm that\nhomogenizes the gradient magnitude of the task-shared parameters so that all\ntasks converge at the same pace. Furthermore, a sparse penalty loss is applied\non the learnable scores to facilitate parameter sharing for all tasks without\nperformance degradation. We conduct comprehensive experiments on multiple\nbenchmarks and our new released indoor dataset LIVL, showing that OFVL-MS\nfamilies significantly outperform the state-of-the-arts with fewer parameters.\nWe also verify that OFVL-MS can generalize to a new scene with much few\nparameters while gaining superior localization performance.\n","authors":["Tao Xie","Kun Dai","Siyi Lu","Ke Wang","Zhiqiang Jiang","Jinghan Gao","Dedong Liu","Jie Xu","Lijun Zhao","Ruifeng Li"],"pdf_url":"https://arxiv.org/pdf/2308.11928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11927v1","updated":"2023-08-23T05:26:27Z","published":"2023-08-23T05:26:27Z","title":"Recovering a Molecule's 3D Dynamics from Liquid-phase Electron\n Microscopy Movies","summary":" The dynamics of biomolecules are crucial for our understanding of their\nfunctioning in living systems. However, current 3D imaging techniques, such as\ncryogenic electron microscopy (cryo-EM), require freezing the sample, which\nlimits the observation of their conformational changes in real time. The\ninnovative liquid-phase electron microscopy (liquid-phase EM) technique allows\nmolecules to be placed in the native liquid environment, providing a unique\nopportunity to observe their dynamics. In this paper, we propose TEMPOR, a\nTemporal Electron MicroscoPy Object Reconstruction algorithm for liquid-phase\nEM that leverages an implicit neural representation (INR) and a dynamical\nvariational auto-encoder (DVAE) to recover time series of molecular structures.\nWe demonstrate its advantages in recovering different motion dynamics from two\nsimulated datasets, 7bcq and Cas9. To our knowledge, our work is the first\nattempt to directly recover 3D structures of a temporally-varying particle from\nliquid-phase EM movies. It provides a promising new approach for studying\nmolecules' 3D dynamics in structural biology.\n","authors":["Enze Ye","Yuhang Wang","Hong Zhang","Yiqin Gao","Huan Wang","He Sun"],"pdf_url":"https://arxiv.org/pdf/2308.11927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09991v2","updated":"2023-08-23T05:19:03Z","published":"2023-08-19T11:52:12Z","title":"AltDiffusion: A Multilingual Text-to-Image Diffusion Model","summary":" Large Text-to-Image(T2I) diffusion models have shown a remarkable capability\nto produce photorealistic and diverse images based on text inputs. However,\nexisting works only support limited language input, e.g., English, Chinese, and\nJapanese, leaving users beyond these languages underserved and blocking the\nglobal expansion of T2I models. Therefore, this paper presents AltDiffusion, a\nnovel multilingual T2I diffusion model that supports eighteen different\nlanguages. Specifically, we first train a multilingual text encoder based on\nthe knowledge distillation. Then we plug it into a pretrained English-only\ndiffusion model and train the model with a two-stage schema to enhance the\nmultilingual capability, including concept alignment and quality improvement\nstage on a large-scale multilingual dataset. Furthermore, we introduce a new\nbenchmark, which includes Multilingual-General-18(MG-18) and\nMultilingual-Cultural-18(MC-18) datasets, to evaluate the capabilities of T2I\ndiffusion models for generating high-quality images and capturing\nculture-specific concepts in different languages. Experimental results on both\nMG-18 and MC-18 demonstrate that AltDiffusion outperforms current\nstate-of-the-art T2I models, e.g., Stable Diffusion in multilingual\nunderstanding, especially with respect to culture-specific concepts, while\nstill having comparable capability for generating high-quality images. All\nsource code and checkpoints could be found in\nhttps://github.com/superhero-7/AltDiffuson.\n","authors":["Fulong Ye","Guang Liu","Xinya Wu","Ledell Wu"],"pdf_url":"https://arxiv.org/pdf/2308.09991v2.pdf","comment":"15 pages; 17 figures"},{"id":"http://arxiv.org/abs/2306.14122v3","updated":"2023-08-23T05:04:58Z","published":"2023-06-25T04:33:56Z","title":"Chain-of-Thought Prompt Distillation for Multimodal Named Entity\n Recognition and Multimodal Relation Extraction","summary":" Multimodal Named Entity Recognition (MNER) and Multimodal Relation Extraction\n(MRE) necessitate the fundamental reasoning capacity for intricate linguistic\nand multimodal comprehension. In this study, we explore distilling the\nreasoning ability of large language models (LLMs) into a more compact student\nmodel by generating a \\textit{chain of thought} (CoT) -- a sequence of\nintermediate reasoning steps. Specifically, we commence by exemplifying the\nelicitation of such reasoning ability from LLMs through CoT prompts covering\nmulti-grain (noun, sentence, multimodality) and data-augmentation (style,\nentity, image) dimensions. Subsequently, we present a novel conditional prompt\ndistillation method to assimilate the commonsense reasoning ability from LLMs,\nthereby enhancing the utility of the student model in addressing text-only\ninputs without the requisite addition of image and CoT knowledge. Extensive\nexperiments reveal that our approach attains state-of-the-art accuracy and\nmanifests a plethora of advantages concerning interpretability, data\nefficiency, and cross-domain generalization on MNER and MRE datasets.\n","authors":["Feng Chen","Yujian Feng"],"pdf_url":"https://arxiv.org/pdf/2306.14122v3.pdf","comment":"modification"},{"id":"http://arxiv.org/abs/2308.11920v1","updated":"2023-08-23T05:04:01Z","published":"2023-08-23T05:04:01Z","title":"Concept Bottleneck with Visual Concept Filtering for Explainable Medical\n Image Classification","summary":" Interpretability is a crucial factor in building reliable models for various\nmedical applications. Concept Bottleneck Models (CBMs) enable interpretable\nimage classification by utilizing human-understandable concepts as intermediate\ntargets. Unlike conventional methods that require extensive human labor to\nconstruct the concept set, recent works leveraging Large Language Models (LLMs)\nfor generating concepts made automatic concept generation possible. However,\nthose methods do not consider whether a concept is visually relevant or not,\nwhich is an important factor in computing meaningful concept scores. Therefore,\nwe propose a visual activation score that measures whether the concept contains\nvisual cues or not, which can be easily computed with unlabeled image data.\nComputed visual activation scores are then used to filter out the less visible\nconcepts, thus resulting in a final concept set with visually meaningful\nconcepts. Our experimental results show that adopting the proposed visual\nactivation score for concept filtering consistently boosts performance compared\nto the baseline. Moreover, qualitative analyses also validate that visually\nrelevant concepts are successfully selected with the visual activation score.\n","authors":["Injae Kim","Jongha Kim","Joonmyung Choi","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11920v1.pdf","comment":"Accepted to MedAGI Workshop at MICCAI 2023 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2308.11918v1","updated":"2023-08-23T05:03:45Z","published":"2023-08-23T05:03:45Z","title":"AMSP-UOD: When Vortex Convolution and Stochastic Perturbation Meet\n Underwater Object Detection","summary":" In this paper, we present a novel Amplitude-Modulated Stochastic Perturbation\nand Vortex Convolutional Network, AMSP-UOD, designed for underwater object\ndetection. AMSP-UOD specifically addresses the impact of non-ideal imaging\nfactors on detection accuracy in complex underwater environments. To mitigate\nthe influence of noise on object detection performance, we propose AMSP Vortex\nConvolution (AMSP-VConv) to disrupt the noise distribution, enhance feature\nextraction capabilities, effectively reduce parameters, and improve network\nrobustness. We design the Feature Association Decoupling Cross Stage Partial\n(FAD-CSP) module, which strengthens the association of long and short-range\nfeatures, improving the network performance in complex underwater environments.\nAdditionally, our sophisticated post-processing method, based on non-maximum\nsuppression with aspect-ratio similarity thresholds, optimizes detection in\ndense scenes, such as waterweed and schools of fish, improving object detection\naccuracy. Extensive experiments on the URPC and RUOD datasets demonstrate that\nour method outperforms existing state-of-the-art methods in terms of accuracy\nand noise immunity. AMSP-UOD proposes an innovative solution with the potential\nfor real-world applications. Code will be made publicly available.\n","authors":["Jingchun Zhou","Zongxin He","Kin-Man Lam","Yudong Wang","Weishi Zhang","ChunLe Guo","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.11918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11917v1","updated":"2023-08-23T05:03:06Z","published":"2023-08-23T05:03:06Z","title":"LFS-GAN: Lifelong Few-Shot Image Generation","summary":" We address a challenging lifelong few-shot image generation task for the\nfirst time. In this situation, a generative model learns a sequence of tasks\nusing only a few samples per task. Consequently, the learned model encounters\nboth catastrophic forgetting and overfitting problems at a time. Existing\nstudies on lifelong GANs have proposed modulation-based methods to prevent\ncatastrophic forgetting. However, they require considerable additional\nparameters and cannot generate high-fidelity and diverse images from limited\ndata. On the other hand, the existing few-shot GANs suffer from severe\ncatastrophic forgetting when learning multiple tasks. To alleviate these\nissues, we propose a framework called Lifelong Few-Shot GAN (LFS-GAN) that can\ngenerate high-quality and diverse images in lifelong few-shot image generation\ntask. Our proposed framework learns each task using an efficient task-specific\nmodulator - Learnable Factorized Tensor (LeFT). LeFT is rank-constrained and\nhas a rich representation ability due to its unique reconstruction technique.\nFurthermore, we propose a novel mode seeking loss to improve the diversity of\nour model in low-data circumstances. Extensive experiments demonstrate that the\nproposed LFS-GAN can generate high-fidelity and diverse images without any\nforgetting and mode collapse in various domains, achieving state-of-the-art in\nlifelong few-shot image generation task. Surprisingly, we find that our LFS-GAN\neven outperforms the existing few-shot GANs in the few-shot image generation\ntask. The code is available at Github.\n","authors":["Juwon Seo","Ji-Su Kang","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2308.11917v1.pdf","comment":"20 pages, 19 figures, 14 tables, ICCV 2023 Poster"},{"id":"http://arxiv.org/abs/2308.11916v1","updated":"2023-08-23T05:02:17Z","published":"2023-08-23T05:02:17Z","title":"Semantic-Aware Implicit Template Learning via Part Deformation\n Consistency","summary":" Learning implicit templates as neural fields has recently shown impressive\nperformance in unsupervised shape correspondence. Despite the success, we\nobserve current approaches, which solely rely on geometric information, often\nlearn suboptimal deformation across generic object shapes, which have high\nstructural variability. In this paper, we highlight the importance of part\ndeformation consistency and propose a semantic-aware implicit template learning\nframework to enable semantically plausible deformation. By leveraging semantic\nprior from a self-supervised feature extractor, we suggest local conditioning\nwith novel semantic-aware deformation code and deformation consistency\nregularizations regarding part deformation, global deformation, and global\nscaling. Our extensive experiments demonstrate the superiority of the proposed\nmethod over baselines in various tasks: keypoint transfer, part label transfer,\nand texture transfer. More interestingly, our framework shows a larger\nperformance gain under more challenging settings. We also provide qualitative\nanalyses to validate the effectiveness of semantic-aware deformation. The code\nis available at https://github.com/mlvlab/PDC.\n","authors":["Sihyeon Kim","Minseok Joo","Jaewon Lee","Juyeon Ko","Juhan Cha","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11916v1.pdf","comment":"ICCV camera-ready version"},{"id":"http://arxiv.org/abs/2308.11911v1","updated":"2023-08-23T04:52:48Z","published":"2023-08-23T04:52:48Z","title":"ACLS: Adaptive and Conditional Label Smoothing for Network Calibration","summary":" We address the problem of network calibration adjusting miscalibrated\nconfidences of deep neural networks. Many approaches to network calibration\nadopt a regularization-based method that exploits a regularization term to\nsmooth the miscalibrated confidences. Although these approaches have shown the\neffectiveness on calibrating the networks, there is still a lack of\nunderstanding on the underlying principles of regularization in terms of\nnetwork calibration. We present in this paper an in-depth analysis of existing\nregularization-based methods, providing a better understanding on how they\naffect to network calibration. Specifically, we have observed that 1) the\nregularization-based methods can be interpreted as variants of label smoothing,\nand 2) they do not always behave desirably. Based on the analysis, we introduce\na novel loss function, dubbed ACLS, that unifies the merits of existing\nregularization methods, while avoiding the limitations. We show extensive\nexperimental results for image classification and semantic segmentation on\nstandard benchmarks, including CIFAR10, Tiny-ImageNet, ImageNet, and PASCAL\nVOC, demonstrating the effectiveness of our loss function.\n","authors":["Hyekang Park","Jongyoun Noh","Youngmin Oh","Donghyeon Baek","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2308.11911v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02237v2","updated":"2023-08-23T04:40:45Z","published":"2023-08-04T10:26:59Z","title":"MSECNet: Accurate and Robust Normal Estimation for 3D Point Clouds by\n Multi-Scale Edge Conditioning","summary":" Estimating surface normals from 3D point clouds is critical for various\napplications, including surface reconstruction and rendering. While existing\nmethods for normal estimation perform well in regions where normals change\nslowly, they tend to fail where normals vary rapidly. To address this issue, we\npropose a novel approach called MSECNet, which improves estimation in normal\nvarying regions by treating normal variation modeling as an edge detection\nproblem. MSECNet consists of a backbone network and a multi-scale edge\nconditioning (MSEC) stream. The MSEC stream achieves robust edge detection\nthrough multi-scale feature fusion and adaptive edge detection. The detected\nedges are then combined with the output of the backbone network using the edge\nconditioning module to produce edge-aware representations. Extensive\nexperiments show that MSECNet outperforms existing methods on both synthetic\n(PCPNet) and real-world (SceneNN) datasets while running significantly faster.\nWe also conduct various analyses to investigate the contribution of each\ncomponent in the MSEC stream. Finally, we demonstrate the effectiveness of our\napproach in surface reconstruction.\n","authors":["Haoyi Xiu","Xin Liu","Weimin Wang","Kyoung-Sook Kim","Masashi Matsuoka"],"pdf_url":"https://arxiv.org/pdf/2308.02237v2.pdf","comment":"Accepted for ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.11909v1","updated":"2023-08-23T04:29:40Z","published":"2023-08-23T04:29:40Z","title":"Edge-aware Hard Clustering Graph Pooling for Brain Imaging Data","summary":" Graph Convolutional Networks (GCNs) can capture non-Euclidean spatial\ndependence between different brain regions, and the graph pooling operator in\nGCNs is key to enhancing the representation learning capability and acquiring\nabnormal brain maps. However, the majority of existing research designs graph\npooling operators only from the perspective of nodes while disregarding the\noriginal edge features, in a way that not only confines graph pooling\napplication scenarios, but also diminishes its ability to capture critical\nsubstructures. In this study, a clustering graph pooling method that first\nsupports multidimensional edge features, called Edge-aware hard clustering\ngraph pooling (EHCPool), is developed. EHCPool proposes the first\n'Edge-to-node' score evaluation criterion based on edge features to assess node\nfeature significance. To more effectively capture the critical subgraphs, a\nnovel Iteration n-top strategy is further designed to adaptively learn sparse\nhard clustering assignments for graphs. Subsequently, an innovative N-E\nAggregation strategy is presented to aggregate node and edge feature\ninformation in each independent subgraph. The proposed model was evaluated on\nmulti-site brain imaging public datasets and yielded state-of-the-art\nperformance. We believe this method is the first deep learning tool with the\npotential to probe different types of abnormal functional brain networks from\ndata-driven perspective.\n","authors":["Cheng Zhu","Jiayi Zhu","Lijuan Zhang","Xi Wu","Shuqi Yang","Ping Liang","Honghan Chen","Ying Tan"],"pdf_url":"https://arxiv.org/pdf/2308.11909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08942v2","updated":"2023-08-23T04:12:26Z","published":"2023-03-15T21:22:21Z","title":"Spherical Space Feature Decomposition for Guided Depth Map\n Super-Resolution","summary":" Guided depth map super-resolution (GDSR), as a hot topic in multi-modal image\nprocessing, aims to upsample low-resolution (LR) depth maps with additional\ninformation involved in high-resolution (HR) RGB images from the same scene.\nThe critical step of this task is to effectively extract domain-shared and\ndomain-private RGB/depth features. In addition, three detailed issues, namely\nblurry edges, noisy surfaces, and over-transferred RGB texture, need to be\naddressed. In this paper, we propose the Spherical Space feature Decomposition\nNetwork (SSDNet) to solve the above issues. To better model cross-modality\nfeatures, Restormer block-based RGB/depth encoders are employed for extracting\nlocal-global features. Then, the extracted features are mapped to the spherical\nspace to complete the separation of private features and the alignment of\nshared features. Shared features of RGB are fused with the depth features to\ncomplete the GDSR task. Subsequently, a spherical contrast refinement (SCR)\nmodule is proposed to further address the detail issues. Patches that are\nclassified according to imperfect categories are input into the SCR module,\nwhere the patch features are pulled closer to the ground truth and pushed away\nfrom the corresponding imperfect samples in the spherical feature space via\ncontrastive learning. Extensive experiments demonstrate that our method can\nachieve state-of-the-art results on four test datasets, as well as successfully\ngeneralize to real-world scenes. The code is available at\n\\url{https://github.com/Zhaozixiang1228/GDSR-SSDNet}.\n","authors":["Zixiang Zhao","Jiangshe Zhang","Xiang Gu","Chengli Tan","Shuang Xu","Yulun Zhang","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.08942v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11903v1","updated":"2023-08-23T04:08:53Z","published":"2023-08-23T04:08:53Z","title":"Rethinking Data Perturbation and Model Stabilization for Semi-supervised\n Medical Image Segmentation","summary":" Studies on semi-supervised medical image segmentation (SSMIS) have seen fast\nprogress recently. Due to the limited labelled data, SSMIS methods mainly focus\non effectively leveraging unlabeled data to enhance the segmentation\nperformance. However, despite their promising performance, current\nstate-of-the-art methods often prioritize integrating complex techniques and\nloss terms rather than addressing the core challenges of semi-supervised\nscenarios directly. We argue that the key to SSMIS lies in generating\nsubstantial and appropriate prediction disagreement on unlabeled data. To this\nend, we emphasize the crutiality of data perturbation and model stabilization\nin semi-supervised segmentation, and propose a simple yet effective approach to\nboost SSMIS performance significantly, dubbed DPMS. Specifically, we first\nrevisit SSMIS from three distinct perspectives: the data, the model, and the\nloss, and conduct a comprehensive study of corresponding strategies to examine\ntheir effectiveness. Based on these examinations, we then propose DPMS, which\nadopts a plain teacher-student framework with a standard supervised loss and\nunsupervised consistency loss. To produce appropriate prediction disagreements,\nDPMS perturbs the unlabeled data via strong augmentations to enlarge prediction\ndisagreements considerably. On the other hand, using EMA teacher when strong\naugmentation is applied does not necessarily improve performance. DPMS further\nutilizes a forwarding-twice and momentum updating strategies for normalization\nstatistics to stabilize the training on unlabeled data effectively. Despite its\nsimplicity, DPMS can obtain new state-of-the-art performance on the public 2D\nACDC and 3D LA datasets across various semi-supervised settings, e.g. obtaining\na remarkable 22.62% improvement against previous SOTA on ACDC with 5% labels.\n","authors":["Zhen Zhao","Ye Liu","Meng Zhao","Di Yin","Yixuan Yuan","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11903v1.pdf","comment":"Code and logs are available at https://github.com/ZhenZHAO/DPMS"},{"id":"http://arxiv.org/abs/2308.11901v1","updated":"2023-08-23T04:01:56Z","published":"2023-08-23T04:01:56Z","title":"Camera-Driven Representation Learning for Unsupervised Domain Adaptive\n Person Re-identification","summary":" We present a novel unsupervised domain adaption method for person\nre-identification (reID) that generalizes a model trained on a labeled source\ndomain to an unlabeled target domain. We introduce a camera-driven curriculum\nlearning (CaCL) framework that leverages camera labels of person images to\ntransfer knowledge from source to target domains progressively. To this end, we\ndivide target domain dataset into multiple subsets based on the camera labels,\nand initially train our model with a single subset (i.e., images captured by a\nsingle camera). We then gradually exploit more subsets for training, according\nto a curriculum sequence obtained with a camera-driven scheduling rule. The\nscheduler considers maximum mean discrepancies (MMD) between each subset and\nthe source domain dataset, such that the subset closer to the source domain is\nexploited earlier within the curriculum. For each curriculum sequence, we\ngenerate pseudo labels of person images in a target domain to train a reID\nmodel in a supervised way. We have observed that the pseudo labels are highly\nbiased toward cameras, suggesting that person images obtained from the same\ncamera are likely to have the same pseudo labels, even for different IDs. To\naddress the camera bias problem, we also introduce a camera-diversity (CD) loss\nencouraging person images of the same pseudo label, but captured across various\ncameras, to involve more for discriminative feature learning, providing person\nrepresentations robust to inter-camera variations. Experimental results on\nstandard benchmarks, including real-to-real and synthetic-to-real scenarios,\ndemonstrate the effectiveness of our framework.\n","authors":["Geon Lee","Sanghoon Lee","Dohyung Kim","Younghoon Shin","Yongsang Yoon","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2308.11901v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11900v1","updated":"2023-08-23T04:01:54Z","published":"2023-08-23T04:01:54Z","title":"HashReID: Dynamic Network with Binary Codes for Efficient Person\n Re-identification","summary":" Biometric applications, such as person re-identification (ReID), are often\ndeployed on energy constrained devices. While recent ReID methods prioritize\nhigh retrieval performance, they often come with large computational costs and\nhigh search time, rendering them less practical in real-world settings. In this\nwork, we propose an input-adaptive network with multiple exit blocks, that can\nterminate computation early if the retrieval is straightforward or noisy,\nsaving a lot of computation. To assess the complexity of the input, we\nintroduce a temporal-based classifier driven by a new training strategy.\nFurthermore, we adopt a binary hash code generation approach instead of relying\non continuous-valued features, which significantly improves the search process\nby a factor of 20. To ensure similarity preservation, we utilize a new ranking\nregularizer that bridges the gap between continuous and binary features.\nExtensive analysis of our proposed method is conducted on three datasets:\nMarket1501, MSMT17 (Multi-Scene Multi-Time), and the BGC1 (BRIAR Government\nCollection). Using our approach, more than 70% of the samples with compact hash\ncodes exit early on the Market1501 dataset, saving 80% of the networks\ncomputational cost and improving over other hash-based methods by 60%. These\nresults demonstrate a significant improvement over dynamic networks and\nshowcase comparable accuracy performance to conventional ReID methods. Code\nwill be made available.\n","authors":["Kshitij Nikhal","Yujunrong Ma","Shuvra S. Bhattacharyya","Benjamin S. Riggan"],"pdf_url":"https://arxiv.org/pdf/2308.11900v1.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2302.06845v2","updated":"2023-08-23T03:56:24Z","published":"2023-02-14T05:47:45Z","title":"SEAM: Searching Transferable Mixed-Precision Quantization Policy through\n Large Margin Regularization","summary":" Mixed-precision quantization (MPQ) suffers from the time-consuming process of\nsearching the optimal bit-width allocation i.e., the policy) for each layer,\nespecially when using large-scale datasets such as ISLVRC-2012. This limits the\npracticality of MPQ in real-world deployment scenarios. To address this issue,\nthis paper proposes a novel method for efficiently searching for effective MPQ\npolicies using a small proxy dataset instead of the large-scale dataset used\nfor training the model. Deviating from the established norm of employing a\nconsistent dataset for both model training and MPQ policy search stages, our\napproach, therefore, yields a substantial enhancement in the efficiency of MPQ\nexploration. Nonetheless, using discrepant datasets poses challenges in\nsearching for a transferable MPQ policy. Driven by the observation that\nquantization noise of sub-optimal policy exerts a detrimental influence on the\ndiscriminability of feature representations -- manifesting as diminished class\nmargins and ambiguous decision boundaries -- our method aims to identify\npolicies that uphold the discriminative nature of feature representations,\ni.e., intra-class compactness and inter-class separation. This general and\ndataset-independent property makes us search for the MPQ policy over a rather\nsmall-scale proxy dataset and then the policy can be directly used to quantize\nthe model trained on a large-scale dataset. Our method offers several\nadvantages, including high proxy data utilization, no excessive hyper-parameter\ntuning, and high searching efficiency. We search high-quality MPQ policies with\nthe proxy dataset that has only 4% of the data scale compared to the\nlarge-scale target dataset, achieving the same accuracy as searching directly\non the latter, improving MPQ searching efficiency by up to 300 times.\n","authors":["Chen Tang","Kai Ouyang","Zenghao Chai","Yunpeng Bai","Yuan Meng","Zhi Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.06845v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11898v1","updated":"2023-08-23T03:46:04Z","published":"2023-08-23T03:46:04Z","title":"Exploring the Optimization Objective of One-Class Classification for\n Anomaly Detection","summary":" One-class classification (OCC) is a longstanding method for anomaly\ndetection. With the powerful representation capability of the pre-trained\nbackbone, OCC methods have witnessed significant performance improvements.\nTypically, most of these OCC methods employ transfer learning to enhance the\ndiscriminative nature of the pre-trained backbone's features, thus achieving\nremarkable efficacy. While most current approaches emphasize feature transfer\nstrategies, we argue that the optimization objective space within OCC methods\ncould also be an underlying critical factor influencing performance. In this\nwork, we conducted a thorough investigation into the optimization objective of\nOCC. Through rigorous theoretical analysis and derivation, we unveil a key\ninsights: any space with the suitable norm can serve as an equivalent\nsubstitute for the hypersphere center, without relying on the distribution\nassumption of training samples. Further, we provide guidelines for determining\nthe feasible domain of norms for the OCC optimization objective. This novel\ninsight sparks a simple and data-agnostic deep one-class classification method.\nOur method is straightforward, with a single 1x1 convolutional layer as a\ntrainable projector and any space with suitable norm as the optimization\nobjective. Extensive experiments validate the reliability and efficacy of our\nfindings and the corresponding methodology, resulting in state-of-the-art\nperformance in both one-class classification and industrial vision anomaly\ndetection and segmentation tasks.\n","authors":["Han Gao","Huiyuan Luo","Fei Shen","Zhengtao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11898v1.pdf","comment":"15 paegs, 10 figures"},{"id":"http://arxiv.org/abs/2308.11896v1","updated":"2023-08-23T03:43:34Z","published":"2023-08-23T03:43:34Z","title":"Age Prediction From Face Images Via Contrastive Learning","summary":" This paper presents a novel approach for accurately estimating age from face\nimages, which overcomes the challenge of collecting a large dataset of\nindividuals with the same identity at different ages. Instead, we leverage\nreadily available face datasets of different people at different ages and aim\nto extract age-related features using contrastive learning. Our method\nemphasizes these relevant features while suppressing identity-related features\nusing a combination of cosine similarity and triplet margin losses. We\ndemonstrate the effectiveness of our proposed approach by achieving\nstate-of-the-art performance on two public datasets, FG-NET and MORPH-II.\n","authors":["Yeongnam Chae","Poulami Raha","Mijung Kim","Bjorn Stenger"],"pdf_url":"https://arxiv.org/pdf/2308.11896v1.pdf","comment":"MVA2023"},{"id":"http://arxiv.org/abs/2308.11894v1","updated":"2023-08-23T03:40:47Z","published":"2023-08-23T03:40:47Z","title":"Does Physical Adversarial Example Really Matter to Autonomous Driving?\n Towards System-Level Effect of Adversarial Object Evasion Attack","summary":" In autonomous driving (AD), accurate perception is indispensable to achieving\nsafe and secure driving. Due to its safety-criticality, the security of AD\nperception has been widely studied. Among different attacks on AD perception,\nthe physical adversarial object evasion attacks are especially severe. However,\nwe find that all existing literature only evaluates their attack effect at the\ntargeted AI component level but not at the system level, i.e., with the entire\nsystem semantics and context such as the full AD pipeline. Thereby, this raises\na critical research question: can these existing researches effectively achieve\nsystem-level attack effects (e.g., traffic rule violations) in the real-world\nAD context? In this work, we conduct the first measurement study on whether and\nhow effectively the existing designs can lead to system-level effects,\nespecially for the STOP sign-evasion attacks due to their popularity and\nseverity. Our evaluation results show that all the representative prior works\ncannot achieve any system-level effects. We observe two design limitations in\nthe prior works: 1) physical model-inconsistent object size distribution in\npixel sampling and 2) lack of vehicle plant model and AD system model\nconsideration. Then, we propose SysAdv, a novel system-driven attack design in\nthe AD context and our evaluation results show that the system-level effects\ncan be significantly improved, i.e., the violation rate increases by around\n70%.\n","authors":["Ningfei Wang","Yunpeng Luo","Takami Sato","Kaidi Xu","Qi Alfred Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11894v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.16680v4","updated":"2023-08-23T03:28:30Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v4.pdf","comment":"Draft Version"},{"id":"http://arxiv.org/abs/2210.01055v3","updated":"2023-08-23T03:24:13Z","published":"2022-10-03T16:13:14Z","title":"CLIP2Point: Transfer CLIP to Point Cloud Classification with Image-Depth\n Pre-training","summary":" Pre-training across 3D vision and language remains under development because\nof limited training data. Recent works attempt to transfer vision-language\npre-training models to 3D vision. PointCLIP converts point cloud data to\nmulti-view depth maps, adopting CLIP for shape classification. However, its\nperformance is restricted by the domain gap between rendered depth maps and\nimages, as well as the diversity of depth distributions. To address this issue,\nwe propose CLIP2Point, an image-depth pre-training method by contrastive\nlearning to transfer CLIP to the 3D domain, and adapt it to point cloud\nclassification. We introduce a new depth rendering setting that forms a better\nvisual effect, and then render 52,460 pairs of images and depth maps from\nShapeNet for pre-training. The pre-training scheme of CLIP2Point combines\ncross-modality learning to enforce the depth features for capturing expressive\nvisual and textual features and intra-modality learning to enhance the\ninvariance of depth aggregation. Additionally, we propose a novel Dual-Path\nAdapter (DPA) module, i.e., a dual-path structure with simplified adapters for\nfew-shot learning. The dual-path structure allows the joint use of CLIP and\nCLIP2Point, and the simplified adapter can well fit few-shot tasks without\npost-search. Experimental results show that CLIP2Point is effective in\ntransferring CLIP knowledge to 3D vision. Our CLIP2Point outperforms PointCLIP\nand other self-supervised 3D networks, achieving state-of-the-art results on\nzero-shot and few-shot classification.\n","authors":["Tianyu Huang","Bowen Dong","Yunhan Yang","Xiaoshui Huang","Rynson W. H. Lau","Wanli Ouyang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2210.01055v3.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.11887v1","updated":"2023-08-23T03:20:31Z","published":"2023-08-23T03:20:31Z","title":"A Unified Framework for 3D Point Cloud Visual Grounding","summary":" 3D point cloud visual grounding plays a critical role in 3D scene\ncomprehension, encompassing 3D referring expression comprehension (3DREC) and\nsegmentation (3DRES). We argue that 3DREC and 3DRES should be unified in one\nframework, which is also a natural progression in the community. To explain,\n3DREC can help 3DRES locate the referent, while 3DRES can also facilitate 3DREC\nvia more finegrained language-visual alignment. To achieve this, this paper\ntakes the initiative step to integrate 3DREC and 3DRES into a unified\nframework, termed 3D Referring Transformer (3DRefTR). Its key idea is to build\nupon a mature 3DREC model and leverage ready query embeddings and visual tokens\nfrom the 3DREC model to construct a dedicated mask branch. Specially, we\npropose Superpoint Mask Branch, which serves a dual purpose: i) By leveraging\nthe heterogeneous CPU-GPU parallelism, while the GPU is occupied generating\nvisual tokens, the CPU concurrently produces superpoints, equivalently\naccomplishing the upsampling computation; ii) By harnessing on the inherent\nassociation between the superpoints and point cloud, it eliminates the heavy\ncomputational overhead on the high-resolution visual features for upsampling.\nThis elegant design enables 3DRefTR to achieve both well-performing 3DRES and\n3DREC capacities with only a 6% additional latency compared to the original\n3DREC model. Empirical evaluations affirm the superiority of 3DRefTR.\nSpecifically, on the ScanRefer dataset, 3DRefTR surpasses the state-of-the-art\n3DRES method by 12.43% in mIoU and improves upon the SOTA 3DREC method by 0.6%\nAcc@0.25IoU.\n","authors":["Haojia Lin","Yongdong Luo","Xiawu Zheng","Lijiang Li","Fei Chao","Taisong Jin","Donghao Luo","Chengjie Wang","Yan Wang","Liujuan Cao"],"pdf_url":"https://arxiv.org/pdf/2308.11887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11579v2","updated":"2023-08-23T03:07:49Z","published":"2023-03-21T04:00:47Z","title":"Diffusion-Based 3D Human Pose Estimation with Multi-Hypothesis\n Aggregation","summary":" In this paper, a novel Diffusion-based 3D Pose estimation (D3DP) method with\nJoint-wise reProjection-based Multi-hypothesis Aggregation (JPMA) is proposed\nfor probabilistic 3D human pose estimation. On the one hand, D3DP generates\nmultiple possible 3D pose hypotheses for a single 2D observation. It gradually\ndiffuses the ground truth 3D poses to a random distribution, and learns a\ndenoiser conditioned on 2D keypoints to recover the uncontaminated 3D poses.\nThe proposed D3DP is compatible with existing 3D pose estimators and supports\nusers to balance efficiency and accuracy during inference through two\ncustomizable parameters. On the other hand, JPMA is proposed to assemble\nmultiple hypotheses generated by D3DP into a single 3D pose for practical use.\nIt reprojects 3D pose hypotheses to the 2D camera plane, selects the best\nhypothesis joint-by-joint based on the reprojection errors, and combines the\nselected joints into the final pose. The proposed JPMA conducts aggregation at\nthe joint level and makes use of the 2D prior information, both of which have\nbeen overlooked by previous approaches. Extensive experiments on Human3.6M and\nMPI-INF-3DHP datasets show that our method outperforms the state-of-the-art\ndeterministic and probabilistic approaches by 1.5% and 8.9%, respectively. Code\nis available at https://github.com/paTRICK-swk/D3DP.\n","authors":["Wenkang Shan","Zhenhua Liu","Xinfeng Zhang","Zhao Wang","Kai Han","Shanshe Wang","Siwei Ma","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2303.11579v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02989v2","updated":"2023-08-23T02:58:53Z","published":"2023-08-06T02:15:19Z","title":"Novel Class Discovery for Long-tailed Recognition","summary":" While the novel class discovery has recently made great progress, existing\nmethods typically focus on improving algorithms on class-balanced benchmarks.\nHowever, in real-world recognition tasks, the class distributions of their\ncorresponding datasets are often imbalanced, which leads to serious performance\ndegeneration of those methods. In this paper, we consider a more realistic\nsetting for novel class discovery where the distributions of novel and known\nclasses are long-tailed. One main challenge of this new problem is to discover\nimbalanced novel classes with the help of long-tailed known classes. To tackle\nthis problem, we propose an adaptive self-labeling strategy based on an\nequiangular prototype representation of classes. Our method infers high-quality\npseudo-labels for the novel classes by solving a relaxed optimal transport\nproblem and effectively mitigates the class biases in learning the known and\nnovel classes. We perform extensive experiments on CIFAR100, ImageNet100,\nHerbarium19 and large-scale iNaturalist18 datasets, and the results demonstrate\nthe superiority of our method. Our code is available at\nhttps://github.com/kleinzcy/NCDLR.\n","authors":["Zhang Chuyu","Xu Ruijie","He Xuming"],"pdf_url":"https://arxiv.org/pdf/2308.02989v2.pdf","comment":"TMLR2023, Final version"},{"id":"http://arxiv.org/abs/2308.11880v1","updated":"2023-08-23T02:57:58Z","published":"2023-08-23T02:57:58Z","title":"SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal\n Targets","summary":" Scene understanding using multi-modal data is necessary in many applications,\ne.g., autonomous navigation. To achieve this in a variety of situations,\nexisting models must be able to adapt to shifting data distributions without\narduous data annotation. Current approaches assume that the source data is\navailable during adaptation and that the source consists of paired multi-modal\ndata. Both these assumptions may be problematic for many applications. Source\ndata may not be available due to privacy, security, or economic concerns.\nAssuming the existence of paired multi-modal data for training also entails\nsignificant data collection costs and fails to take advantage of widely\navailable freely distributed pre-trained uni-modal models. In this work, we\nrelax both of these assumptions by addressing the problem of adapting a set of\nmodels trained independently on uni-modal data to a target domain consisting of\nunlabeled multi-modal data, without having access to the original source\ndataset. Our proposed approach solves this problem through a switching\nframework which automatically chooses between two complementary methods of\ncross-modal pseudo-label fusion -- agreement filtering and entropy weighting --\nbased on the estimated domain gap. We demonstrate our work on the semantic\nsegmentation problem. Experiments across seven challenging adaptation scenarios\nverify the efficacy of our approach, achieving results comparable to, and in\nsome cases outperforming, methods which assume access to source data. Our\nmethod achieves an improvement in mIoU of up to 12% over competing baselines.\nOur code is publicly available at https://github.com/csimo005/SUMMIT.\n","authors":["Cody Simons","Dripta S. Raychaudhuri","Sk Miraj Ahmed","Suya You","Konstantinos Karydis","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2308.11880v1.pdf","comment":"12 pages, 5 figures, 9 tables, ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11877v1","updated":"2023-08-23T02:49:22Z","published":"2023-08-23T02:49:22Z","title":"Integrated Image and Location Analysis for Wound Classification: A Deep\n Learning Approach","summary":" The global burden of acute and chronic wounds presents a compelling case for\nenhancing wound classification methods, a vital step in diagnosing and\ndetermining optimal treatments. Recognizing this need, we introduce an\ninnovative multi-modal network based on a deep convolutional neural network for\ncategorizing wounds into four categories: diabetic, pressure, surgical, and\nvenous ulcers. Our multi-modal network uses wound images and their\ncorresponding body locations for more precise classification. A unique aspect\nof our methodology is incorporating a body map system that facilitates accurate\nwound location tagging, improving upon traditional wound image classification\ntechniques. A distinctive feature of our approach is the integration of models\nsuch as VGG16, ResNet152, and EfficientNet within a novel architecture. This\narchitecture includes elements like spatial and channel-wise\nSqueeze-and-Excitation modules, Axial Attention, and an Adaptive Gated\nMulti-Layer Perceptron, providing a robust foundation for classification. Our\nmulti-modal network was trained and evaluated on two distinct datasets\ncomprising relevant images and corresponding location information. Notably, our\nproposed network outperformed traditional methods, reaching an accuracy range\nof 74.79% to 100% for Region of Interest (ROI) without location\nclassifications, 73.98% to 100% for ROI with location classifications, and\n78.10% to 100% for whole image classifications. This marks a significant\nenhancement over previously reported performance metrics in the literature. Our\nresults indicate the potential of our multi-modal network as an effective\ndecision-support tool for wound image classification, paving the way for its\napplication in various clinical contexts.\n","authors":["Yash Patel","Tirth Shah","Mrinal Kanti Dhar","Taiyu Zhang","Jeffrey Niezgoda","Sandeep Gopalkrishnan","Zeyun Yu"],"pdf_url":"https://arxiv.org/pdf/2308.11877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11875v1","updated":"2023-08-23T02:40:51Z","published":"2023-08-23T02:40:51Z","title":"Motion-to-Matching: A Mixed Paradigm for 3D Single Object Tracking","summary":" 3D single object tracking with LiDAR points is an important task in the\ncomputer vision field. Previous methods usually adopt the matching-based or\nmotion-centric paradigms to estimate the current target status. However, the\nformer is sensitive to the similar distractors and the sparseness of point\ncloud due to relying on appearance matching, while the latter usually focuses\non short-term motion clues (eg. two frames) and ignores the long-term motion\npattern of target. To address these issues, we propose a mixed paradigm with\ntwo stages, named MTM-Tracker, which combines motion modeling with feature\nmatching into a single network. Specifically, in the first stage, we exploit\nthe continuous historical boxes as motion prior and propose an encoder-decoder\nstructure to locate target coarsely. Then, in the second stage, we introduce a\nfeature interaction module to extract motion-aware features from consecutive\npoint clouds and match them to refine target movement as well as regress other\ntarget states. Extensive experiments validate that our paradigm achieves\ncompetitive performance on large-scale datasets (70.9% in KITTI and 51.70% in\nNuScenes). The code will be open soon at\nhttps://github.com/LeoZhiheng/MTM-Tracker.git.\n","authors":["Zhiheng Li","Yu Lin","Yubo Cui","Shuo Li","Zheng Fang"],"pdf_url":"https://arxiv.org/pdf/2308.11875v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.11874v1","updated":"2023-08-23T02:37:34Z","published":"2023-08-23T02:37:34Z","title":"Semi-Supervised Learning via Weight-aware Distillation under Class\n Distribution Mismatch","summary":" Semi-Supervised Learning (SSL) under class distribution mismatch aims to\ntackle a challenging problem wherein unlabeled data contain lots of unknown\ncategories unseen in the labeled ones. In such mismatch scenarios, traditional\nSSL suffers severe performance damage due to the harmful invasion of the\ninstances with unknown categories into the target classifier. In this study, by\nstrict mathematical reasoning, we reveal that the SSL error under class\ndistribution mismatch is composed of pseudo-labeling error and invasion error,\nboth of which jointly bound the SSL population risk. To alleviate the SSL\nerror, we propose a robust SSL framework called Weight-Aware Distillation (WAD)\nthat, by weights, selectively transfers knowledge beneficial to the target task\nfrom unsupervised contrastive representation to the target classifier.\nSpecifically, WAD captures adaptive weights and high-quality pseudo labels to\ntarget instances by exploring point mutual information (PMI) in representation\nspace to maximize the role of unlabeled data and filter unknown categories.\nTheoretically, we prove that WAD has a tight upper bound of population risk\nunder class distribution mismatch. Experimentally, extensive results\ndemonstrate that WAD outperforms five state-of-the-art SSL approaches and one\nstandard baseline on two benchmark datasets, CIFAR10 and CIFAR100, and an\nartificial cross-dataset. The code is available at\nhttps://github.com/RUC-DWBI-ML/research/tree/main/WAD-master.\n","authors":["Pan Du","Suyun Zhao","Zisen Sheng","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11874v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2305.19301v2","updated":"2023-08-23T02:18:51Z","published":"2023-05-30T14:24:40Z","title":"On the Choice of Perception Loss Function for Learned Video Compression","summary":" We study causal, low-latency, sequential video compression when the output is\nsubjected to both a mean squared-error (MSE) distortion loss as well as a\nperception loss to target realism. Motivated by prior approaches, we consider\ntwo different perception loss functions (PLFs). The first, PLF-JD, considers\nthe joint distribution (JD) of all the video frames up to the current one,\nwhile the second metric, PLF-FMD, considers the framewise marginal\ndistributions (FMD) between the source and reconstruction. Using information\ntheoretic analysis and deep-learning based experiments, we demonstrate that the\nchoice of PLF can have a significant effect on the reconstruction, especially\nat low-bit rates. In particular, while the reconstruction based on PLF-JD can\nbetter preserve the temporal correlation across frames, it also imposes a\nsignificant penalty in distortion compared to PLF-FMD and further makes it more\ndifficult to recover from errors made in the earlier output frames. Although\nthe choice of PLF decisively affects reconstruction quality, we also\ndemonstrate that it may not be essential to commit to a particular PLF during\nencoding and the choice of PLF can be delegated to the decoder. In particular,\nencoded representations generated by training a system to minimize the MSE\n(without requiring either PLF) can be {\\em near universal} and can generate\nclose to optimal reconstructions for either choice of PLF at the decoder. We\nvalidate our results using (one-shot) information-theoretic analysis, detailed\nstudy of the rate-distortion-perception tradeoff of the Gauss-Markov source\nmodel as well as deep-learning based experiments on moving MNIST and KTH\ndatasets.\n","authors":["Sadaf Salehkalaibar","Buu Phan","Jun Chen","Wei Yu","Ashish Khisti"],"pdf_url":"https://arxiv.org/pdf/2305.19301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11857v1","updated":"2023-08-23T01:19:58Z","published":"2023-08-23T01:19:58Z","title":"CoC-GAN: Employing Context Cluster for Unveiling a New Pathway in Image\n Generation","summary":" Image generation tasks are traditionally undertaken using Convolutional\nNeural Networks (CNN) or Transformer architectures for feature aggregating and\ndispatching. Despite the frequent application of convolution and attention\nstructures, these structures are not fundamentally required to solve the\nproblem of instability and the lack of interpretability in image generation. In\nthis paper, we propose a unique image generation process premised on the\nperspective of converting images into a set of point clouds. In other words, we\ninterpret an image as a set of points. As such, our methodology leverages\nsimple clustering methods named Context Clustering (CoC) to generate images\nfrom unordered point sets, which defies the convention of using convolution or\nattention mechanisms. Hence, we exclusively depend on this clustering\ntechnique, combined with the multi-layer perceptron (MLP) in a generative\nmodel. Furthermore, we implement the integration of a module termed the 'Point\nIncreaser' for the model. This module is just an MLP tasked with generating\nadditional points for clustering, which are subsequently integrated within the\nparadigm of the Generative Adversarial Network (GAN). We introduce this model\nwith the novel structure as the Context Clustering Generative Adversarial\nNetwork (CoC-GAN), which offers a distinctive viewpoint in the domain of\nfeature aggregating and dispatching. Empirical evaluations affirm that our\nCoC-GAN, devoid of convolution and attention mechanisms, exhibits outstanding\nperformance. Its interpretability, endowed by the CoC module, also allows for\nvisualization in our experiments. The promising results underscore the\nfeasibility of our method and thus warrant future investigations of applying\nContext Clustering to more novel and interpretable image generation.\n","authors":["Zihao Wang","Yiming Huang","Ziyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.05184v2","updated":"2023-08-23T01:14:30Z","published":"2022-06-10T15:25:00Z","title":"SERE: Exploring Feature Self-relation for Self-supervised Transformer","summary":" Learning representations with self-supervision for convolutional networks\n(CNN) has been validated to be effective for vision tasks. As an alternative to\nCNN, vision transformers (ViT) have strong representation ability with spatial\nself-attention and channel-level feedforward networks. Recent works reveal that\nself-supervised learning helps unleash the great potential of ViT. Still, most\nworks follow self-supervised strategies designed for CNN, e.g., instance-level\ndiscrimination of samples, but they ignore the properties of ViT. We observe\nthat relational modeling on spatial and channel dimensions distinguishes ViT\nfrom other networks. To enforce this property, we explore the feature\nSElf-RElation (SERE) for training self-supervised ViT. Specifically, instead of\nconducting self-supervised learning solely on feature embeddings from multiple\nviews, we utilize the feature self-relations, i.e., spatial/channel\nself-relations, for self-supervised learning. Self-relation based learning\nfurther enhances the relation modeling ability of ViT, resulting in stronger\nrepresentations that stably improve performance on multiple downstream tasks.\nOur source code will be made publicly available.\n","authors":["Zhong-Yu Li","Shanghua Gao","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2206.05184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08345v3","updated":"2023-08-23T01:10:43Z","published":"2023-08-16T13:10:32Z","title":"GAEI-UNet: Global Attention and Elastic Interaction U-Net for Vessel\n Image Segmentation","summary":" Vessel image segmentation plays a pivotal role in medical diagnostics, aiding\nin the early detection and treatment of vascular diseases. While segmentation\nbased on deep learning has shown promising results, effectively segmenting\nsmall structures and maintaining connectivity between them remains challenging.\nTo address these limitations, we propose GAEI-UNet, a novel model that combines\nglobal attention and elastic interaction-based techniques. GAEI-UNet leverages\nglobal spatial and channel context information to enhance high-level semantic\nunderstanding within the U-Net architecture, enabling precise segmentation of\nsmall vessels. Additionally, we adopt an elastic interaction-based loss\nfunction to improve connectivity among these fine structures. By capturing the\nforces generated by misalignment between target and predicted shapes, our model\neffectively learns to preserve the correct topology of vessel networks.\nEvaluation on retinal vessel dataset -- DRIVE demonstrates the superior\nperformance of GAEI-UNet in terms of SE and connectivity of small structures,\nwithout significantly increasing computational complexity. This research aims\nto advance the field of vessel image segmentation, providing more accurate and\nreliable diagnostic tools for the medical community. The implementation code is\navailable on Code.\n","authors":["Ruiqiang Xiao","Zhuoyue Wan"],"pdf_url":"https://arxiv.org/pdf/2308.08345v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2004.03696 by other authors"},{"id":"http://arxiv.org/abs/2308.11840v1","updated":"2023-08-23T00:17:50Z","published":"2023-08-23T00:17:50Z","title":"Compressed Models Decompress Race Biases: What Quantized Models Forget\n for Fair Face Recognition","summary":" With the ever-growing complexity of deep learning models for face\nrecognition, it becomes hard to deploy these systems in real life. Researchers\nhave two options: 1) use smaller models; 2) compress their current models.\nSince the usage of smaller models might lead to concerning biases, compression\ngains relevance. However, compressing might be also responsible for an increase\nin the bias of the final model. We investigate the overall performance, the\nperformance on each ethnicity subgroup and the racial bias of a\nState-of-the-Art quantization approach when used with synthetic and real data.\nThis analysis provides a few more details on potential benefits of performing\nquantization with synthetic data, for instance, the reduction of biases on the\nmajority of test scenarios. We tested five distinct architectures and three\ndifferent training datasets. The models were evaluated on a fourth dataset\nwhich was collected to infer and compare the performance of face recognition\nmodels on different ethnicity.\n","authors":["Pedro C. Neto","Eduarda Caldeira","Jaime S. Cardoso","Ana F. Sequeira"],"pdf_url":"https://arxiv.org/pdf/2308.11840v1.pdf","comment":"Accepted for Oral at BIOSIG 2023"},{"id":"http://arxiv.org/abs/2308.12469v1","updated":"2023-08-23T23:44:44Z","published":"2023-08-23T23:44:44Z","title":"Diffuse, Attend, and Segment: Unsupervised Zero-Shot Segmentation using\n Stable Diffusion","summary":" Producing quality segmentation masks for images is a fundamental problem in\ncomputer vision. Recent research has explored large-scale supervised training\nto enable zero-shot segmentation on virtually any image style and unsupervised\ntraining to enable segmentation without dense annotations. However,\nconstructing a model capable of segmenting anything in a zero-shot manner\nwithout any annotations is still challenging. In this paper, we propose to\nutilize the self-attention layers in stable diffusion models to achieve this\ngoal because the pre-trained stable diffusion model has learned inherent\nconcepts of objects within its attention layers. Specifically, we introduce a\nsimple yet effective iterative merging process based on measuring KL divergence\namong attention maps to merge them into valid segmentation masks. The proposed\nmethod does not require any training or language dependency to extract quality\nsegmentation for any images. On COCO-Stuff-27, our method surpasses the prior\nunsupervised zero-shot SOTA method by an absolute 26% in pixel accuracy and 17%\nin mean IoU.\n","authors":["Junjiao Tian","Lavisha Aggarwal","Andrea Colaco","Zsolt Kira","Mar Gonzalez-Franco"],"pdf_url":"https://arxiv.org/pdf/2308.12469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.04634v3","updated":"2023-08-23T23:40:59Z","published":"2023-01-11T18:39:34Z","title":"Street-View Image Generation from a Bird's-Eye View Layout","summary":" Bird's-Eye View (BEV) Perception has received increasing attention in recent\nyears as it provides a concise and unified spatial representation across views\nand benefits a diverse set of downstream driving applications. While the focus\nhas been placed on discriminative tasks such as BEV segmentation, the dual\ngenerative task of creating street-view images from a BEV layout has rarely\nbeen explored. The ability to generate realistic street-view images that align\nwith a given HD map and traffic layout is critical for visualizing complex\ntraffic scenarios and developing robust perception models for autonomous\ndriving. In this paper, we propose BEVGen, a conditional generative model that\nsynthesizes a set of realistic and spatially consistent surrounding images that\nmatch the BEV layout of a traffic scenario. BEVGen incorporates a novel\ncross-view transformation and spatial attention design which learn the\nrelationship between cameras and map views to ensure their consistency. Our\nmodel can accurately render road and lane lines, as well as generate traffic\nscenes under different weather conditions and times of day. The code will be\nmade publicly available.\n","authors":["Alexander Swerdlow","Runsheng Xu","Bolei Zhou"],"pdf_url":"https://arxiv.org/pdf/2301.04634v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09818v2","updated":"2023-08-23T23:32:49Z","published":"2023-04-17T22:02:03Z","title":"What Should Be Balanced in a \"Balanced\" Face Recognition Dataset?","summary":" The issue of demographic disparities in face recognition accuracy has\nattracted increasing attention in recent years. Various face image datasets\nhave been proposed as 'fair' or 'balanced' to assess the accuracy of face\nrecognition algorithms across demographics. These datasets typically balance\nthe number of identities and images across demographics. It is important to\nnote that the number of identities and images in an evaluation dataset are {\\em\nnot} driving factors for 1-to-1 face matching accuracy. Moreover, balancing the\nnumber of identities and images does not ensure balance in other factors known\nto impact accuracy, such as head pose, brightness, and image quality. We\ndemonstrate these issues using several recently proposed datasets. To improve\nthe ability to perform less biased evaluations, we propose a bias-aware toolkit\nthat facilitates creation of cross-demographic evaluation datasets balanced on\nfactors mentioned in this paper.\n","authors":["Haiyu Wu","Kevin W. Bowyer"],"pdf_url":"https://arxiv.org/pdf/2304.09818v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12465v1","updated":"2023-08-23T23:04:42Z","published":"2023-08-23T23:04:42Z","title":"InverseSR: 3D Brain MRI Super-Resolution Using a Latent Diffusion Model","summary":" High-resolution (HR) MRI scans obtained from research-grade medical centers\nprovide precise information about imaged tissues. However, routine clinical MRI\nscans are typically in low-resolution (LR) and vary greatly in contrast and\nspatial resolution due to the adjustments of the scanning parameters to the\nlocal needs of the medical center. End-to-end deep learning methods for MRI\nsuper-resolution (SR) have been proposed, but they require re-training each\ntime there is a shift in the input distribution. To address this issue, we\npropose a novel approach that leverages a state-of-the-art 3D brain generative\nmodel, the latent diffusion model (LDM) trained on UK BioBank, to increase the\nresolution of clinical MRI scans. The LDM acts as a generative prior, which has\nthe ability to capture the prior distribution of 3D T1-weighted brain MRI.\nBased on the architecture of the brain LDM, we find that different methods are\nsuitable for different settings of MRI SR, and thus propose two novel\nstrategies: 1) for SR with more sparsity, we invert through both the decoder of\nthe LDM and also through a deterministic Denoising Diffusion Implicit Models\n(DDIM), an approach we will call InverseSR(LDM); 2) for SR with less sparsity,\nwe invert only through the LDM decoder, an approach we will call\nInverseSR(Decoder). These two approaches search different latent spaces in the\nLDM model to find the optimal latent code to map the given LR MRI into HR. The\ntraining process of the generative model is independent of the MRI\nunder-sampling process, ensuring the generalization of our method to many MRI\nSR problems with different input measurements. We validate our method on over\n100 brain T1w MRIs from the IXI dataset. Our method can demonstrate that\npowerful priors given by LDM can be used for MRI reconstruction.\n","authors":["Jueqi Wang","Jacob Levman","Walter Hugo Lopez Pinaya","Petru-Daniel Tudosiu","M. Jorge Cardoso","Razvan Marinescu"],"pdf_url":"https://arxiv.org/pdf/2308.12465v1.pdf","comment":"Early Accepted to MICCAI 2023 [top 14% of Submissions]"},{"id":"http://arxiv.org/abs/2308.12462v1","updated":"2023-08-23T22:55:45Z","published":"2023-08-23T22:55:45Z","title":"Overcoming General Knowledge Loss with Selective Parameter Finetuning","summary":" Foundation models encompass an extensive knowledge base and offer remarkable\ntransferability. However, this knowledge becomes outdated or insufficient over\ntime. The challenge lies in updating foundation models to accommodate novel\ninformation while retaining their original ability. In this paper, we present a\nnovel approach to achieving continual model updates by effecting localized\nmodifications to a small subset of parameters. Guided by insights gleaned from\nprior analyses of foundational models, we first localize a specific layer for\nmodel refinement and then introduce an importance scoring mechanism designed to\nupdate only the most crucial weights. Our method is exhaustively evaluated on\nfoundational vision-language models, measuring its efficacy in both learning\nnew information and preserving pre-established knowledge across a diverse\nspectrum of continual learning tasks, including Aircraft, Birdsnap CIFAR-100,\nCUB, Cars, and GTSRB. The results show that our method improves the existing\ncontinual learning methods by 0.5\\% - 10\\% on average, and reduces the loss of\npre-trained knowledge from around 5\\% to 0.97\\%. Comprehensive ablation studies\nsubstantiate our method design, shedding light on the contributions of each\ncomponent to controllably learning new knowledge and mitigating the forgetting\nof pre-trained knowledge.\n","authors":["Wenxuan Zhang","Paul Janson","Rahaf Aljundi","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2308.12462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.07240v2","updated":"2023-08-23T22:54:40Z","published":"2022-06-15T01:57:12Z","title":"Test-Time Adaptation for Visual Document Understanding","summary":" For visual document understanding (VDU), self-supervised pretraining has been\nshown to successfully generate transferable representations, yet, effective\nadaptation of such representations to distribution shifts at test-time remains\nto be an unexplored area. We propose DocTTA, a novel test-time adaptation\nmethod for documents, that does source-free domain adaptation using unlabeled\ntarget document data. DocTTA leverages cross-modality self-supervised learning\nvia masked visual language modeling, as well as pseudo labeling to adapt models\nlearned on a \\textit{source} domain to an unlabeled \\textit{target} domain at\ntest time. We introduce new benchmarks using existing public datasets for\nvarious VDU tasks, including entity recognition, key-value extraction, and\ndocument visual question answering. DocTTA shows significant improvements on\nthese compared to the source model performance, up to 1.89\\% in (F1 score),\n3.43\\% (F1 score), and 17.68\\% (ANLS score), respectively. Our benchmark\ndatasets are available at \\url{https://saynaebrahimi.github.io/DocTTA.html}.\n","authors":["Sayna Ebrahimi","Sercan O. Arik","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2206.07240v2.pdf","comment":"Accepted at TMLR 2023"},{"id":"http://arxiv.org/abs/2308.12453v1","updated":"2023-08-23T22:34:49Z","published":"2023-08-23T22:34:49Z","title":"Augmenting medical image classifiers with synthetic data from latent\n diffusion models","summary":" While hundreds of artificial intelligence (AI) algorithms are now approved or\ncleared by the US Food and Drugs Administration (FDA), many studies have shown\ninconsistent generalization or latent bias, particularly for underrepresented\npopulations. Some have proposed that generative AI could reduce the need for\nreal data, but its utility in model development remains unclear. Skin disease\nserves as a useful case study in synthetic image generation due to the\ndiversity of disease appearance, particularly across the protected attribute of\nskin tone. Here we show that latent diffusion models can scalably generate\nimages of skin disease and that augmenting model training with these data\nimproves performance in data-limited settings. These performance gains saturate\nat synthetic-to-real image ratios above 10:1 and are substantially smaller than\nthe gains obtained from adding real images. As part of our analysis, we\ngenerate and analyze a new dataset of 458,920 synthetic images produced using\nseveral generation strategies. Our results suggest that synthetic data could\nserve as a force-multiplier for model development, but the collection of\ndiverse real-world data remains the most important step to improve medical AI\nalgorithms.\n","authors":["Luke W. Sagers","James A. Diao","Luke Melas-Kyriazi","Matthew Groh","Pranav Rajpurkar","Adewole S. Adamson","Veronica Rotemberg","Roxana Daneshjou","Arjun K. Manrai"],"pdf_url":"https://arxiv.org/pdf/2308.12453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12452v1","updated":"2023-08-23T22:22:20Z","published":"2023-08-23T22:22:20Z","title":"ARF-Plus: Controlling Perceptual Factors in Artistic Radiance Fields for\n 3D Scene Stylization","summary":" The radiance fields style transfer is an emerging field that has recently\ngained popularity as a means of 3D scene stylization, thanks to the outstanding\nperformance of neural radiance fields in 3D reconstruction and view synthesis.\nWe highlight a research gap in radiance fields style transfer, the lack of\nsufficient perceptual controllability, motivated by the existing concept in the\n2D image style transfer. In this paper, we present ARF-Plus, a 3D neural style\ntransfer framework offering manageable control over perceptual factors, to\nsystematically explore the perceptual controllability in 3D scene stylization.\nFour distinct types of controls - color preservation control, (style pattern)\nscale control, spatial (selective stylization area) control, and depth\nenhancement control - are proposed and integrated into this framework. Results\nfrom real-world datasets, both quantitative and qualitative, show that the four\ntypes of controls in our ARF-Plus framework successfully accomplish their\ncorresponding perceptual controls when stylizing 3D scenes. These techniques\nwork well for individual style inputs as well as for the simultaneous\napplication of multiple styles within a scene. This unlocks a realm of\nlimitless possibilities, allowing customized modifications of stylization\neffects and flexible merging of the strengths of different styles, ultimately\nenabling the creation of novel and eye-catching stylistic effects on 3D scenes.\n","authors":["Wenzhao Li","Tianhao Wu","Fangcheng Zhong","Cengiz Oztireli"],"pdf_url":"https://arxiv.org/pdf/2308.12452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12447v1","updated":"2023-08-23T22:03:57Z","published":"2023-08-23T22:03:57Z","title":"MOFO: MOtion FOcused Self-Supervision for Video Understanding","summary":" Self-supervised learning (SSL) techniques have recently produced outstanding\nresults in learning visual representations from unlabeled videos. Despite the\nimportance of motion in supervised learning techniques for action recognition,\nSSL methods often do not explicitly consider motion information in videos. To\naddress this issue, we propose MOFO (MOtion FOcused), a novel SSL method for\nfocusing representation learning on the motion area of a video, for action\nrecognition. MOFO automatically detects motion areas in videos and uses these\nto guide the self-supervision task. We use a masked autoencoder which randomly\nmasks out a high proportion of the input sequence; we force a specified\npercentage of the inside of the motion area to be masked and the remainder from\noutside. We further incorporate motion information into the finetuning step to\nemphasise motion in the downstream task. We demonstrate that our motion-focused\ninnovations can significantly boost the performance of the currently leading\nSSL method (VideoMAE) for action recognition. Our method improves the recent\nself-supervised Vision Transformer (ViT), VideoMAE, by achieving +2.6%, +2.1%,\n+1.3% accuracy on Epic-Kitchens verb, noun and action classification,\nrespectively, and +4.7% accuracy on Something-Something V2 action\nclassification. Our proposed approach significantly improves the performance of\nthe current SSL method for action recognition, indicating the importance of\nexplicitly encoding motion in SSL.\n","authors":["Mona Ahmadian","Frank Guerin","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2308.12447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12443v1","updated":"2023-08-23T21:51:24Z","published":"2023-08-23T21:51:24Z","title":"TAI-GAN: Temporally and Anatomically Informed GAN for early-to-late\n frame conversion in dynamic cardiac PET motion correction","summary":" The rapid tracer kinetics of rubidium-82 ($^{82}$Rb) and high variation of\ncross-frame distribution in dynamic cardiac positron emission tomography (PET)\nraise significant challenges for inter-frame motion correction, particularly\nfor the early frames where conventional intensity-based image registration\ntechniques are not applicable. Alternatively, a promising approach utilizes\ngenerative methods to handle the tracer distribution changes to assist existing\nregistration methods. To improve frame-wise registration and parametric\nquantification, we propose a Temporally and Anatomically Informed Generative\nAdversarial Network (TAI-GAN) to transform the early frames into the late\nreference frame using an all-to-one mapping. Specifically, a feature-wise\nlinear modulation layer encodes channel-wise parameters generated from temporal\ntracer kinetics information, and rough cardiac segmentations with local shifts\nserve as the anatomical information. We validated our proposed method on a\nclinical $^{82}$Rb PET dataset and found that our TAI-GAN can produce converted\nearly frames with high image quality, comparable to the real reference frames.\nAfter TAI-GAN conversion, motion estimation accuracy and clinical myocardial\nblood flow (MBF) quantification were improved compared to using the original\nframes. Our code is published at https://github.com/gxq1998/TAI-GAN.\n","authors":["Xueqi Guo","Luyao Shi","Xiongchao Chen","Bo Zhou","Qiong Liu","Huidong Xie","Yi-Hwa Liu","Richard Palyo","Edward J. Miller","Albert J. Sinusas","Bruce Spottiswoode","Chi Liu","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2308.12443v1.pdf","comment":"Accepted by Simulation and Synthesis in Medical Imaging (SASHIMI\n 2023, MICCAI workshop), preprint version"},{"id":"http://arxiv.org/abs/2308.12440v1","updated":"2023-08-23T21:47:28Z","published":"2023-08-23T21:47:28Z","title":"HNAS-reg: hierarchical neural architecture search for deformable medical\n image registration","summary":" Convolutional neural networks (CNNs) have been widely used to build deep\nlearning models for medical image registration, but manually designed network\narchitectures are not necessarily optimal. This paper presents a hierarchical\nNAS framework (HNAS-Reg), consisting of both convolutional operation search and\nnetwork topology search, to identify the optimal network architecture for\ndeformable medical image registration. To mitigate the computational overhead\nand memory constraints, a partial channel strategy is utilized without losing\noptimization quality. Experiments on three datasets, consisting of 636\nT1-weighted magnetic resonance images (MRIs), have demonstrated that the\nproposal method can build a deep learning model with improved image\nregistration accuracy and reduced model size, compared with state-of-the-art\nimage registration approaches, including one representative traditional\napproach and two unsupervised learning-based approaches.\n","authors":["Jiong Wu","Yong Fan"],"pdf_url":"https://arxiv.org/pdf/2308.12440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12439v1","updated":"2023-08-23T21:47:06Z","published":"2023-08-23T21:47:06Z","title":"BaDExpert: Extracting Backdoor Functionality for Accurate Backdoor Input\n Detection","summary":" We present a novel defense, against backdoor attacks on Deep Neural Networks\n(DNNs), wherein adversaries covertly implant malicious behaviors (backdoors)\ninto DNNs. Our defense falls within the category of post-development defenses\nthat operate independently of how the model was generated. The proposed defense\nis built upon a novel reverse engineering approach that can directly extract\nbackdoor functionality of a given backdoored model to a backdoor expert model.\nThe approach is straightforward -- finetuning the backdoored model over a small\nset of intentionally mislabeled clean samples, such that it unlearns the normal\nfunctionality while still preserving the backdoor functionality, and thus\nresulting in a model (dubbed a backdoor expert model) that can only recognize\nbackdoor inputs. Based on the extracted backdoor expert model, we show the\nfeasibility of devising highly accurate backdoor input detectors that filter\nout the backdoor inputs during model inference. Further augmented by an\nensemble strategy with a finetuned auxiliary model, our defense, BaDExpert\n(Backdoor Input Detection with Backdoor Expert), effectively mitigates 16 SOTA\nbackdoor attacks while minimally impacting clean utility. The effectiveness of\nBaDExpert has been verified on multiple datasets (CIFAR10, GTSRB and ImageNet)\nacross various model architectures (ResNet, VGG, MobileNetV2 and Vision\nTransformer).\n","authors":["Tinghao Xie","Xiangyu Qi","Ping He","Yiming Li","Jiachen T. Wang","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2308.12439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12435v1","updated":"2023-08-23T21:36:35Z","published":"2023-08-23T21:36:35Z","title":"Characterising representation dynamics in recurrent neural networks for\n object recognition","summary":" Recurrent neural networks (RNNs) have yielded promising results for both\nrecognizing objects in challenging conditions and modeling aspects of primate\nvision. However, the representational dynamics of recurrent computations remain\npoorly understood, especially in large-scale visual models. Here, we studied\nsuch dynamics in RNNs trained for object classification on MiniEcoset, a novel\nsubset of ecoset. We report two main insights. First, upon inference,\nrepresentations continued to evolve after correct classification, suggesting a\nlack of the notion of being ``done with classification''. Second, focusing on\n``readout zones'' as a way to characterize the activation trajectories, we\nobserve that misclassified representations exhibit activation patterns with\nlower L2 norm, and are positioned more peripherally in the readout zones. Such\narrangements help the misclassified representations move into the correct zones\nas time progresses. Our findings generalize to networks with lateral and\ntop-down connections, and include both additive and multiplicative interactions\nwith the bottom-up sweep. The results therefore contribute to a general\nunderstanding of RNN dynamics in naturalistic tasks. We hope that the analysis\nframework will aid future investigations of other types of RNNs, including\nunderstanding of representational dynamics in primate vision.\n","authors":["Sushrut Thorat","Adrien Doerig","Tim C. Kietzmann"],"pdf_url":"https://arxiv.org/pdf/2308.12435v1.pdf","comment":"8 pages, 6 figures; revision of our Conference on Cognitive\n Computational Neuroscience (CCN) 2023 paper"},{"id":"http://arxiv.org/abs/2308.12433v1","updated":"2023-08-23T21:32:46Z","published":"2023-08-23T21:32:46Z","title":"A Spatiotemporal Correspondence Approach to Unsupervised LiDAR\n Segmentation with Traffic Applications","summary":" We address the problem of unsupervised semantic segmentation of outdoor LiDAR\npoint clouds in diverse traffic scenarios. The key idea is to leverage the\nspatiotemporal nature of a dynamic point cloud sequence and introduce\ndrastically stronger augmentation by establishing spatiotemporal\ncorrespondences across multiple frames. We dovetail clustering and pseudo-label\nlearning in this work. Essentially, we alternate between clustering points into\nsemantic groups and optimizing models using point-wise pseudo-spatiotemporal\nlabels with a simple learning objective. Therefore, our method can learn\ndiscriminative features in an unsupervised learning fashion. We show promising\nsegmentation performance on Semantic-KITTI, SemanticPOSS, and FLORIDA benchmark\ndatasets covering scenarios in autonomous vehicle and intersection\ninfrastructure, which is competitive when compared against many existing fully\nsupervised learning methods. This general framework can lead to a unified\nrepresentation learning approach for LiDAR point clouds incorporating domain\nknowledge.\n","authors":["Xiao Li","Pan He","Aotian Wu","Sanjay Ranka","Anand Rangarajan"],"pdf_url":"https://arxiv.org/pdf/2308.12433v1.pdf","comment":"Accepted for publication in IEEE International Conference on\n Intelligent Transportation Systems (ITSC 2023)"},{"id":"http://arxiv.org/abs/2306.01891v2","updated":"2023-08-23T21:29:03Z","published":"2023-06-02T19:52:13Z","title":"DH-PTAM: A Deep Hybrid Stereo Events-Frames Parallel Tracking And\n Mapping System","summary":" This paper presents a robust approach for a visual parallel tracking and\nmapping (PTAM) system that excels in challenging environments. Our proposed\nmethod combines the strengths of heterogeneous multi-modal visual sensors,\nincluding stereo event-based and frame-based sensors, in a unified reference\nframe through a novel spatio-temporal synchronization of stereo visual frames\nand stereo event streams. We employ deep learning-based feature extraction and\ndescription for estimation to enhance robustness further. We also introduce an\nend-to-end parallel tracking and mapping optimization layer complemented by a\nsimple loop-closure algorithm for efficient SLAM behavior. Through\ncomprehensive experiments on both small-scale and large-scale real-world\nsequences of VECtor and TUM-VIE benchmarks, our proposed method (DH-PTAM)\ndemonstrates superior performance in terms of robustness and accuracy in\nadverse conditions, especially in large-scale HDR scenarios. Our\nimplementation's research-based Python API is publicly available on GitHub for\nfurther research and development: https://github.com/AbanobSoliman/DH-PTAM.\n","authors":["Abanob Soliman","Fabien Bonardi","Désiré Sidibé","Samia Bouchafa"],"pdf_url":"https://arxiv.org/pdf/2306.01891v2.pdf","comment":"9 pages, 9 figures and 4 tables"},{"id":"http://arxiv.org/abs/2304.05669v2","updated":"2023-08-23T20:52:27Z","published":"2023-04-12T07:46:05Z","title":"Factorized Inverse Path Tracing for Efficient and Accurate\n Material-Lighting Estimation","summary":" Inverse path tracing has recently been applied to joint material and lighting\nestimation, given geometry and multi-view HDR observations of an indoor scene.\nHowever, it has two major limitations: path tracing is expensive to compute,\nand ambiguities exist between reflection and emission. Our Factorized Inverse\nPath Tracing (FIPT) addresses these challenges by using a factored light\ntransport formulation and finds emitters driven by rendering errors. Our\nalgorithm enables accurate material and lighting optimization faster than\nprevious work, and is more effective at resolving ambiguities. The exhaustive\nexperiments on synthetic scenes show that our method (1) outperforms\nstate-of-the-art indoor inverse rendering and relighting methods particularly\nin the presence of complex illumination effects; (2) speeds up inverse path\ntracing optimization to less than an hour. We further demonstrate robustness to\nnoisy inputs through material and lighting estimates that allow plausible\nrelighting in a real scene. The source code is available at:\nhttps://github.com/lwwu2/fipt\n","authors":["Liwen Wu","Rui Zhu","Mustafa B. Yaldiz","Yinhao Zhu","Hong Cai","Janarbek Matai","Fatih Porikli","Tzu-Mao Li","Manmohan Chandraker","Ravi Ramamoorthi"],"pdf_url":"https://arxiv.org/pdf/2304.05669v2.pdf","comment":"Updated experiment results; modified real-world sections"},{"id":"http://arxiv.org/abs/2308.12419v1","updated":"2023-08-23T20:38:19Z","published":"2023-08-23T20:38:19Z","title":"Toward American Sign Language Processing in the Real World: Data, Tasks,\n and Methods","summary":" Sign language, which conveys meaning through gestures, is the chief means of\ncommunication among deaf people. Recognizing sign language in natural settings\npresents significant challenges due to factors such as lighting, background\nclutter, and variations in signer characteristics. In this thesis, I study\nautomatic sign language processing in the wild, using signing videos collected\nfrom the Internet. This thesis contributes new datasets, tasks, and methods.\nMost chapters of this thesis address tasks related to fingerspelling, an\nimportant component of sign language and yet has not been studied widely by\nprior work. I present three new large-scale ASL datasets in the wild:\nChicagoFSWild, ChicagoFSWild+, and OpenASL. Using ChicagoFSWild and\nChicagoFSWild+, I address fingerspelling recognition, which consists of\ntranscribing fingerspelling sequences into text. I propose an end-to-end\napproach based on iterative attention that allows recognition from a raw video\nwithout explicit hand detection. I further show that using a Conformer-based\nnetwork jointly modeling handshape and mouthing can bring performance close to\nthat of humans. Next, I propose two tasks for building real-world\nfingerspelling-based applications: fingerspelling detection and search. For\nfingerspelling detection, I introduce a suite of evaluation metrics and a new\ndetection model via multi-task training. To address the problem of searching\nfor fingerspelled keywords in raw sign language videos, we propose a novel\nmethod that jointly localizes and matches fingerspelling segments to text.\nFinally, I will describe a benchmark for large-vocabulary open-domain sign\nlanguage translation based on OpenASL. To address the challenges of sign\nlanguage translation in realistic settings, we propose a set of techniques\nincluding sign search as a pretext task for pre-training and fusion of mouthing\nand handshape features.\n","authors":["Bowen Shi"],"pdf_url":"https://arxiv.org/pdf/2308.12419v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2308.12416v1","updated":"2023-08-23T20:33:22Z","published":"2023-08-23T20:33:22Z","title":"Reframing the Brain Age Prediction Problem to a More Interpretable and\n Quantitative Approach","summary":" Deep learning models have achieved state-of-the-art results in estimating\nbrain age, which is an important brain health biomarker, from magnetic\nresonance (MR) images. However, most of these models only provide a global age\nprediction, and rely on techniques, such as saliency maps to interpret their\nresults. These saliency maps highlight regions in the input image that were\nsignificant for the model's predictions, but they are hard to be interpreted,\nand saliency map values are not directly comparable across different samples.\nIn this work, we reframe the age prediction problem from MR images to an\nimage-to-image regression problem where we estimate the brain age for each\nbrain voxel in MR images. We compare voxel-wise age prediction models against\nglobal age prediction models and their corresponding saliency maps. The results\nindicate that voxel-wise age prediction models are more interpretable, since\nthey provide spatial information about the brain aging process, and they\nbenefit from being quantitative.\n","authors":["Neha Gianchandani","Mahsa Dibaji","Mariana Bento","Ethan MacDonald","Roberto Souza"],"pdf_url":"https://arxiv.org/pdf/2308.12416v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.12256v1","updated":"2023-08-23T17:16:07Z","published":"2023-08-23T17:16:07Z","title":"Learning from Negative User Feedback and Measuring Responsiveness for\n Sequential Recommenders","summary":" Sequential recommenders have been widely used in industry due to their\nstrength in modeling user preferences. While these models excel at learning a\nuser's positive interests, less attention has been paid to learning from\nnegative user feedback. Negative user feedback is an important lever of user\ncontrol, and comes with an expectation that recommenders should respond quickly\nand reduce similar recommendations to the user. However, negative feedback\nsignals are often ignored in the training objective of sequential retrieval\nmodels, which primarily aim at predicting positive user interactions. In this\nwork, we incorporate explicit and implicit negative user feedback into the\ntraining objective of sequential recommenders in the retrieval stage using a\n\"not-to-recommend\" loss function that optimizes for the log-likelihood of not\nrecommending items with negative feedback. We demonstrate the effectiveness of\nthis approach using live experiments on a large-scale industrial recommender\nsystem. Furthermore, we address a challenge in measuring recommender\nresponsiveness to negative feedback by developing a counterfactual simulation\nframework to compare recommender responses between different user actions,\nshowing improved responsiveness from the modeling change.\n","authors":["Yueqi Wang","Yoni Halpern","Shuo Chang","Jingchen Feng","Elaine Ya Le","Longfei Li","Xujian Liang","Min-Cheng Huang","Shane Li","Alex Beutel","Yaping Zhang","Shuchao Bi"],"pdf_url":"https://arxiv.org/pdf/2308.12256v1.pdf","comment":"RecSys 2023 Industry Track"},{"id":"http://arxiv.org/abs/2308.12241v1","updated":"2023-08-23T16:32:54Z","published":"2023-08-23T16:32:54Z","title":"LLMRec: Benchmarking Large Language Models on Recommendation Task","summary":" Recently, the fast development of Large Language Models (LLMs) such as\nChatGPT has significantly advanced NLP tasks by enhancing the capabilities of\nconversational models. However, the application of LLMs in the recommendation\ndomain has not been thoroughly investigated. To bridge this gap, we propose\nLLMRec, a LLM-based recommender system designed for benchmarking LLMs on\nvarious recommendation tasks. Specifically, we benchmark several popular\noff-the-shelf LLMs, such as ChatGPT, LLaMA, ChatGLM, on five recommendation\ntasks, including rating prediction, sequential recommendation, direct\nrecommendation, explanation generation, and review summarization. Furthermore,\nwe investigate the effectiveness of supervised finetuning to improve LLMs'\ninstruction compliance ability. The benchmark results indicate that LLMs\ndisplayed only moderate proficiency in accuracy-based tasks such as sequential\nand direct recommendation. However, they demonstrated comparable performance to\nstate-of-the-art methods in explainability-based tasks. We also conduct\nqualitative evaluations to further evaluate the quality of contents generated\nby different models, and the results show that LLMs can truly understand the\nprovided information and generate clearer and more reasonable results. We\naspire that this benchmark will serve as an inspiration for researchers to\ndelve deeper into the potential of LLMs in enhancing recommendation\nperformance. Our codes, processed data and benchmark results are available at\nhttps://github.com/williamliujl/LLMRec.\n","authors":["Junling Liu","Chao Liu","Peilin Zhou","Qichen Ye","Dading Chong","Kang Zhou","Yueqi Xie","Yuwei Cao","Shoujin Wang","Chenyu You","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12083v1","updated":"2023-08-23T12:05:48Z","published":"2023-08-23T12:05:48Z","title":"Counterfactual Graph Augmentation for Consumer Unfairness Mitigation in\n Recommender Systems","summary":" In recommendation literature, explainability and fairness are becoming two\nprominent perspectives to consider. However, prior works have mostly addressed\nthem separately, for instance by explaining to consumers why a certain item was\nrecommended or mitigating disparate impacts in recommendation utility. None of\nthem has leveraged explainability techniques to inform unfairness mitigation.\nIn this paper, we propose an approach that relies on counterfactual\nexplanations to augment the set of user-item interactions, such that using them\nwhile inferring recommendations leads to fairer outcomes. Modeling user-item\ninteractions as a bipartite graph, our approach augments the latter by\nidentifying new user-item edges that not only can explain the original\nunfairness by design, but can also mitigate it. Experiments on two public data\nsets show that our approach effectively leads to a better trade-off between\nfairness and recommendation utility compared with state-of-the-art mitigation\nprocedures. We further analyze the characteristics of added edges to highlight\nkey unfairness patterns. Source code available at\nhttps://github.com/jackmedda/RS-BGExplainer/tree/cikm2023.\n","authors":["Ludovico Boratto","Francesco Fabbri","Gianni Fenu","Mirko Marras","Giacomo Medda"],"pdf_url":"https://arxiv.org/pdf/2308.12083v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.12039v1","updated":"2023-08-23T09:56:59Z","published":"2023-08-23T09:56:59Z","title":"Hybrid Retrieval and Multi-stage Text Ranking Solution at TREC 2022 Deep\n Learning Track","summary":" Large-scale text retrieval technology has been widely used in various\npractical business scenarios. This paper presents our systems for the TREC 2022\nDeep Learning Track. We explain the hybrid text retrieval and multi-stage text\nranking method adopted in our solution. The retrieval stage combined the two\nstructures of traditional sparse retrieval and neural dense retrieval. In the\nranking stage, in addition to the full interaction-based ranking model built on\nlarge pre-trained language model, we also proposes a lightweight sub-ranking\nmodule to further enhance the final text ranking performance. Evaluation\nresults demonstrate the effectiveness of our proposed approach. Our models\nachieve the 1st and 4th rank on the test set of passage ranking and document\nranking respectively.\n","authors":["Guangwei Xu","Yangzhao Zhang","Longhui Zhang","Dingkun Long","Pengjun Xie","Ruijie Guo"],"pdf_url":"https://arxiv.org/pdf/2308.12039v1.pdf","comment":"TREC 2022 Deep Learning Track"},{"id":"http://arxiv.org/abs/2308.12028v1","updated":"2023-08-23T09:39:18Z","published":"2023-08-23T09:39:18Z","title":"LKPNR: LLM and KG for Personalized News Recommendation Framework","summary":" Accurately recommending candidate news articles to users is a basic challenge\nfaced by personalized news recommendation systems. Traditional methods are\nusually difficult to grasp the complex semantic information in news texts,\nresulting in unsatisfactory recommendation results. Besides, these traditional\nmethods are more friendly to active users with rich historical behaviors.\nHowever, they can not effectively solve the \"long tail problem\" of inactive\nusers. To address these issues, this research presents a novel general\nframework that combines Large Language Models (LLM) and Knowledge Graphs (KG)\ninto semantic representations of traditional methods. In order to improve\nsemantic understanding in complex news texts, we use LLMs' powerful text\nunderstanding ability to generate news representations containing rich semantic\ninformation. In addition, our method combines the information about news\nentities and mines high-order structural information through multiple hops in\nKG, thus alleviating the challenge of long tail distribution. Experimental\nresults demonstrate that compared with various traditional models, the\nframework significantly improves the recommendation effect. The successful\nintegration of LLM and KG in our framework has established a feasible path for\nachieving more accurate personalized recommendations in the news field. Our\ncode is available at https://github.com/Xuan-ZW/LKPNR.\n","authors":["Chen hao","Xie Runfeng","Cui Xiangyang","Yan Zhou","Wang Xin","Xuan Zhanwei","Zhang Kai"],"pdf_url":"https://arxiv.org/pdf/2308.12028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11998v1","updated":"2023-08-23T08:35:59Z","published":"2023-08-23T08:35:59Z","title":"Economic Recommender Systems -- A Systematic Review","summary":" Many of today's online services provide personalized recommendations to their\nusers. Such recommendations are typically designed to serve certain user needs,\ne.g., to quickly find relevant content in situations of information overload.\nCorrespondingly, the academic literature in the field largely focuses on the\nvalue of recommender systems for the end user. In this context, one underlying\nassumption is that the improved service that is achieved through the\nrecommendations will in turn positively impact the organization's goals, e.g.,\nin the form of higher customer retention or loyalty. However, in reality,\nrecommender systems can be used to target organizational economic goals more\ndirectly by incorporating monetary considerations such as price awareness and\nprofitability aspects into the underlying recommendation models. In this work,\nwe survey the existing literature on what we call Economic Recommender Systems\nbased on a systematic review approach that helped us identify 133 relevant\npapers. We first categorize existing works along different dimensions and then\nreview the most important technical approaches from the literature.\nFurthermore, we discuss common methodologies to evaluate such systems and\nfinally outline the limitations of today's research and future directions.\n","authors":["Alvise De Biasio","Nicolò Navarin","Dietmar Jannach"],"pdf_url":"https://arxiv.org/pdf/2308.11998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01792v3","updated":"2023-08-23T07:43:03Z","published":"2023-06-01T08:10:03Z","title":"Task Relation-aware Continual User Representation Learning","summary":" User modeling, which learns to represent users into a low-dimensional\nrepresentation space based on their past behaviors, got a surge of interest\nfrom the industry for providing personalized services to users. Previous\nefforts in user modeling mainly focus on learning a task-specific user\nrepresentation that is designed for a single task. However, since learning\ntask-specific user representations for every task is infeasible, recent studies\nintroduce the concept of universal user representation, which is a more\ngeneralized representation of a user that is relevant to a variety of tasks.\nDespite their effectiveness, existing approaches for learning universal user\nrepresentations are impractical in real-world applications due to the data\nrequirement, catastrophic forgetting and the limited learning capability for\ncontinually added tasks. In this paper, we propose a novel continual user\nrepresentation learning method, called TERACON, whose learning capability is\nnot limited as the number of learned tasks increases while capturing the\nrelationship between the tasks. The main idea is to introduce an embedding for\neach task, i.e., task embedding, which is utilized to generate task-specific\nsoft masks that not only allow the entire model parameters to be updated until\nthe end of training sequence, but also facilitate the relationship between the\ntasks to be captured. Moreover, we introduce a novel knowledge retention module\nwith pseudo-labeling strategy that successfully alleviates the long-standing\nproblem of continual learning, i.e., catastrophic forgetting. Extensive\nexperiments on public and proprietary real-world datasets demonstrate the\nsuperiority and practicality of TERACON. Our code is available at\nhttps://github.com/Sein-Kim/TERACON.\n","authors":["Sein Kim","Namkyeong Lee","Donghyun Kim","Minchul Yang","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2306.01792v3.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2211.06924v3","updated":"2023-08-23T04:02:28Z","published":"2022-11-13T15:11:03Z","title":"A Tale of Two Graphs: Freezing and Denoising Graph Structures for\n Multimodal Recommendation","summary":" Multimodal recommender systems utilizing multimodal features (e.g., images\nand textual descriptions) typically show better recommendation accuracy than\ngeneral recommendation models based solely on user-item interactions.\nGenerally, prior work fuses multimodal features into item ID embeddings to\nenrich item representations, thus failing to capture the latent semantic\nitem-item structures. In this context, LATTICE proposes to learn the latent\nstructure between items explicitly and achieves state-of-the-art performance\nfor multimodal recommendations. However, we argue the latent graph structure\nlearning of LATTICE is both inefficient and unnecessary. Experimentally, we\ndemonstrate that freezing its item-item structure before training can also\nachieve competitive performance. Based on this finding, we propose a simple yet\neffective model, dubbed as FREEDOM, that FREEzes the item-item graph and\nDenOises the user-item interaction graph simultaneously for Multimodal\nrecommendation. Theoretically, we examine the design of FREEDOM through a graph\nspectral perspective and demonstrate that it possesses a tighter upper bound on\nthe graph spectrum. In denoising the user-item interaction graph, we devise a\ndegree-sensitive edge pruning method, which rejects possibly noisy edges with a\nhigh probability when sampling the graph. We evaluate the proposed model on\nthree real-world datasets and show that FREEDOM can significantly outperform\ncurrent strongest baselines. Compared with LATTICE, FREEDOM achieves an average\nimprovement of 19.07% in recommendation accuracy while reducing its memory cost\nup to 6$\\times$ on large graphs. The source code is available at:\nhttps://github.com/enoche/FREEDOM.\n","authors":["Xin Zhou","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2211.06924v3.pdf","comment":"Accepted to ACM Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2308.11884v1","updated":"2023-08-23T03:03:14Z","published":"2023-08-23T03:03:14Z","title":"Integrating the Wikidata Taxonomy into YAGO","summary":" Wikidata is one of the largest public general-purpose Knowledge Bases (KBs).\nYet, due to its collaborative nature, its schema and taxonomy have become\nconvoluted. For the YAGO 4 KB, we combined Wikidata with the ontology from\nSchema.org, which reduced and cleaned up the taxonomy and constraints and made\nit possible to run automated reasoners on the data. However, it also cut away\nlarge parts of the Wikidata taxonomy. In this paper, we present our effort to\nmerge the entire Wikidata taxonomy into the YAGO KB as much as possible. We pay\nparticular attention to logical constraints and a careful distinction of\nclasses and instances. Our work creates YAGO 4.5, which adds a rich layer of\ninformative classes to YAGO, while at the same time keeping the KB logically\nconsistent.\n","authors":["Fabian Suchanek","Mehwish Alam","Thomas Bonald","Pierre-Henri Paris","Jules Soria"],"pdf_url":"https://arxiv.org/pdf/2308.11884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12420v1","updated":"2023-08-23T20:42:32Z","published":"2023-08-23T20:42:32Z","title":"Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature","summary":" Distributed Ledger Technologies (DLTs) have rapidly evolved, necessitating\ncomprehensive insights into their diverse components. However, a systematic\nliterature review that emphasizes the Environmental, Sustainability, and\nGovernance (ESG) components of DLT remains lacking. To bridge this gap, we\nselected 107 seed papers to build a citation network of 63,083 references and\nrefined it to a corpus of 24,539 publications for analysis. Then, we labeled\nthe named entities in 46 papers according to twelve top-level categories\nderived from an established technology taxonomy and enhanced the taxonomy by\npinpointing DLT's ESG elements. Leveraging transformer-based language models,\nwe fine-tuned a pre-trained language model for a Named Entity Recognition (NER)\ntask using our labeled dataset. We used our fine-tuned language model to\ndistill the corpus to 505 key papers, facilitating a literature review via\nnamed entities and temporal graph analysis on DLT evolution in the context of\nESG. Our contributions are a methodology to conduct a machine learning-driven\nsystematic literature review in the DLT field, placing a special emphasis on\nESG aspects. Furthermore, we present a first-of-its-kind NER dataset, composed\nof 54,808 named entities, designed for DLT and ESG-related explorations.\n","authors":["Walter Hernandez","Kamil Tylinski","Alastair Moore","Niall Roche","Nikhil Vadgama","Horst Treiblmaier","Jiangbo Shangguan","Paolo Tasca","Jiahua Xu"],"pdf_url":"https://arxiv.org/pdf/2308.12420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04706v2","updated":"2023-08-23T18:47:13Z","published":"2023-08-09T04:57:56Z","title":"Pareto Invariant Representation Learning for Multimedia Recommendation","summary":" Multimedia recommendation involves personalized ranking tasks, where\nmultimedia content is usually represented using a generic encoder. However,\nthese generic representations introduce spurious correlations that fail to\nreveal users' true preferences. Existing works attempt to alleviate this\nproblem by learning invariant representations, but overlook the balance between\nindependent and identically distributed (IID) and out-of-distribution (OOD)\ngeneralization. In this paper, we propose a framework called Pareto Invariant\nRepresentation Learning (PaInvRL) to mitigate the impact of spurious\ncorrelations from an IID-OOD multi-objective optimization perspective, by\nlearning invariant representations (intrinsic factors that attract user\nattention) and variant representations (other factors) simultaneously.\nSpecifically, PaInvRL includes three iteratively executed modules: (i)\nheterogeneous identification module, which identifies the heterogeneous\nenvironments to reflect distributional shifts for user-item interactions; (ii)\ninvariant mask generation module, which learns invariant masks based on the\nPareto-optimal solutions that minimize the adaptive weighted Invariant Risk\nMinimization (IRM) and Empirical Risk (ERM) losses; (iii) convert module, which\ngenerates both variant representations and item-invariant representations for\ntraining a multi-modal recommendation model that mitigates spurious\ncorrelations and balances the generalization performance within and cross the\nenvironmental distributions. We compare the proposed PaInvRL with\nstate-of-the-art recommendation models on three public multimedia\nrecommendation datasets (Movielens, Tiktok, and Kwai), and the experimental\nresults validate the effectiveness of PaInvRL for both within- and\ncross-environmental learning.\n","authors":["Shanshan Huang","Haoxuan Li","Qingsong Li","Chunyuan Zheng","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04706v2.pdf","comment":"ACM MM 2023 full paper"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.12284v1","updated":"2023-08-23T17:58:14Z","published":"2023-08-23T17:58:14Z","title":"D4: Improving LLM Pretraining via Document De-Duplication and\n Diversification","summary":" Over recent years, an increasing amount of compute and data has been poured\ninto training large language models (LLMs), usually by doing one-pass learning\non as many tokens as possible randomly selected from large-scale web corpora.\nWhile training on ever-larger portions of the internet leads to consistent\nperformance improvements, the size of these improvements diminishes with scale,\nand there has been little work exploring the effect of data selection on\npre-training and downstream performance beyond simple de-duplication methods\nsuch as MinHash. Here, we show that careful data selection (on top of\nde-duplicated data) via pre-trained model embeddings can speed up training (20%\nefficiency gains) and improves average downstream accuracy on 16 NLP tasks (up\nto 2%) at the 6.7B model scale. Furthermore, we show that repeating data\nintelligently consistently outperforms baseline training (while repeating\nrandom data performs worse than baseline training). Our results indicate that\nclever data selection can significantly improve LLM pre-training, calls into\nquestion the common practice of training for a single epoch on as much data as\npossible, and demonstrates a path to keep improving our models past the limits\nof randomly sampling web data.\n","authors":["Kushal Tirumala","Daniel Simig","Armen Aghajanyan","Ari S. Morcos"],"pdf_url":"https://arxiv.org/pdf/2308.12284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12280v1","updated":"2023-08-23T17:50:57Z","published":"2023-08-23T17:50:57Z","title":"Extended Linear Regression: A Kalman Filter Approach for Minimizing Loss\n via Area Under the Curve","summary":" This research enhances linear regression models by integrating a Kalman\nfilter and analysing curve areas to minimize loss. The goal is to develop an\noptimal linear regression equation using stochastic gradient descent (SGD) for\nweight updating. Our approach involves a stepwise process, starting with\nuser-defined parameters. The linear regression model is trained using SGD,\ntracking weights and loss separately and zipping them finally. A Kalman filter\nis then trained based on weight and loss arrays to predict the next\nconsolidated weights. Predictions result from multiplying input averages with\nweights, evaluated for loss to form a weight-versus-loss curve. The curve's\nequation is derived using the two-point formula, and area under the curve is\ncalculated via integration. The linear regression equation with minimum area\nbecomes the optimal curve for prediction. Benefits include avoiding constant\nweight updates via gradient descent and working with partial datasets, unlike\nmethods needing the entire set. However, computational complexity should be\nconsidered. The Kalman filter's accuracy might diminish beyond a certain\nprediction range.\n","authors":["Gokulprasath R"],"pdf_url":"https://arxiv.org/pdf/2308.12280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12279v1","updated":"2023-08-23T17:50:50Z","published":"2023-08-23T17:50:50Z","title":"On-Manifold Projected Gradient Descent","summary":" This work provides a computable, direct, and mathematically rigorous\napproximation to the differential geometry of class manifolds for\nhigh-dimensional data, along with nonlinear projections from input space onto\nthese class manifolds. The tools are applied to the setting of neural network\nimage classifiers, where we generate novel, on-manifold data samples, and\nimplement a projected gradient descent algorithm for on-manifold adversarial\ntraining. The susceptibility of neural networks (NNs) to adversarial attack\nhighlights the brittle nature of NN decision boundaries in input space.\nIntroducing adversarial examples during training has been shown to reduce the\nsusceptibility of NNs to adversarial attack; however, it has also been shown to\nreduce the accuracy of the classifier if the examples are not valid examples\nfor that class. Realistic \"on-manifold\" examples have been previously generated\nfrom class manifolds in the latent of an autoencoder. Our work explores these\nphenomena in a geometric and computational setting that is much closer to the\nraw, high-dimensional input space than can be provided by VAE or other black\nbox dimensionality reductions. We employ conformally invariant diffusion maps\n(CIDM) to approximate class manifolds in diffusion coordinates, and develop the\nNystr\\\"{o}m projection to project novel points onto class manifolds in this\nsetting. On top of the manifold approximation, we leverage the spectral\nexterior calculus (SEC) to determine geometric quantities such as tangent\nvectors of the manifold. We use these tools to obtain adversarial examples that\nreside on a class manifold, yet fool a classifier. These misclassifications\nthen become explainable in terms of human-understandable manipulations within\nthe data, by expressing the on-manifold adversary in the semantic basis on the\nmanifold.\n","authors":["Aaron Mahler","Tyrus Berry","Tom Stephens","Harbir Antil","Michael Merritt","Jeanie Schreiber","Ioannis Kevrekidis"],"pdf_url":"https://arxiv.org/pdf/2308.12279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12270v1","updated":"2023-08-23T17:37:51Z","published":"2023-08-23T17:37:51Z","title":"Language Reward Modulation for Pretraining Reinforcement Learning","summary":" Using learned reward functions (LRFs) as a means to solve sparse-reward\nreinforcement learning (RL) tasks has yielded some steady progress in\ntask-complexity through the years. In this work, we question whether today's\nLRFs are best-suited as a direct replacement for task rewards. Instead, we\npropose leveraging the capabilities of LRFs as a pretraining signal for RL.\nConcretely, we propose $\\textbf{LA}$nguage Reward $\\textbf{M}$odulated\n$\\textbf{P}$retraining (LAMP) which leverages the zero-shot capabilities of\nVision-Language Models (VLMs) as a $\\textit{pretraining}$ utility for RL as\nopposed to a downstream task reward. LAMP uses a frozen, pretrained VLM to\nscalably generate noisy, albeit shaped exploration rewards by computing the\ncontrastive alignment between a highly diverse collection of language\ninstructions and the image observations of an agent in its pretraining\nenvironment. LAMP optimizes these rewards in conjunction with standard\nnovelty-seeking exploration rewards with reinforcement learning to acquire a\nlanguage-conditioned, pretrained policy. Our VLM pretraining approach, which is\na departure from previous attempts to use LRFs, can warmstart sample-efficient\nlearning on robot manipulation tasks in RLBench.\n","authors":["Ademi Adeniji","Amber Xie","Carmelo Sferrazza","Younggyo Seo","Stephen James","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2308.12270v1.pdf","comment":"Code available at https://github.com/ademiadeniji/lamp"},{"id":"http://arxiv.org/abs/2308.11601v2","updated":"2023-08-23T17:34:17Z","published":"2023-08-22T17:48:24Z","title":"Tryage: Real-time, intelligent Routing of User Prompts to Large Language\n Models","summary":" The introduction of the transformer architecture and the self-attention\nmechanism has led to an explosive production of language models trained on\nspecific downstream tasks and data domains. With over 200, 000 models in the\nHugging Face ecosystem, users grapple with selecting and optimizing models to\nsuit multifaceted workflows and data domains while addressing computational,\nsecurity, and recency concerns. There is an urgent need for machine learning\nframeworks that can eliminate the burden of model selection and customization\nand unleash the incredible power of the vast emerging model library for end\nusers. Here, we propose a context-aware routing system, Tryage, that leverages\na language model router for optimal selection of expert models from a model\nlibrary based on analysis of individual input prompts. Inspired by the thalamic\nrouter in the brain, Tryage employs a perceptive router to predict down-stream\nmodel performance on prompts and, then, makes a routing decision using an\nobjective function that integrates performance predictions with user goals and\nconstraints that are incorporated through flags (e.g., model size, model\nrecency). Tryage allows users to explore a Pareto front and automatically\ntrade-off between task accuracy and secondary goals including minimization of\nmodel size, recency, security, verbosity, and readability. Across heterogeneous\ndata sets that include code, text, clinical data, and patents, the Tryage\nframework surpasses Gorilla and GPT3.5 turbo in dynamic model selection\nidentifying the optimal model with an accuracy of 50.9% , compared to 23.6% by\nGPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how\nrouting models can be applied to program and control the behavior of\nmulti-model LLM systems to maximize efficient use of the expanding and evolving\nlanguage model ecosystem.\n","authors":["Surya Narayanan Hari","Matt Thomson"],"pdf_url":"https://arxiv.org/pdf/2308.11601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12264v1","updated":"2023-08-23T17:32:06Z","published":"2023-08-23T17:32:06Z","title":"FECoM: A Step towards Fine-Grained Energy Measurement for Deep Learning","summary":" With the increasing usage, scale, and complexity of Deep Learning (DL)\nmodels, their rapidly growing energy consumption has become a critical concern.\nPromoting green development and energy awareness at different granularities is\nthe need of the hour to limit carbon emissions of DL systems. However, the lack\nof standard and repeatable tools to accurately measure and optimize energy\nconsumption at a fine granularity (e.g., at method level) hinders progress in\nthis area. In this paper, we introduce FECoM (Fine-grained Energy Consumption\nMeter), a framework for fine-grained DL energy consumption measurement.\nSpecifically, FECoM provides researchers and developers a mechanism to profile\nDL APIs. FECoM addresses the challenges of measuring energy consumption at\nfine-grained level by using static instrumentation and considering various\nfactors, including computational load and temperature stability. We assess\nFECoM's capability to measure fine-grained energy consumption for one of the\nmost popular open-source DL frameworks, namely TensorFlow. Using FECoM, we also\ninvestigate the impact of parameter size and execution time on energy\nconsumption, enriching our understanding of TensorFlow APIs' energy profiles.\nFurthermore, we elaborate on the considerations, issues, and challenges that\none needs to consider while designing and implementing a fine-grained energy\nconsumption measurement tool. We hope this work will facilitate further\nadvances in DL energy measurement and the development of energy-aware practices\nfor DL systems.\n","authors":["Saurabhsingh Rajput","Tim Widmayer","Ziyuan Shang","Maria Kechagia","Federica Sarro","Tushar Sharma"],"pdf_url":"https://arxiv.org/pdf/2308.12264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.01575v3","updated":"2023-08-23T17:23:59Z","published":"2021-12-02T19:29:26Z","title":"Towards Interactive Reinforcement Learning with Intrinsic Feedback","summary":" Reinforcement learning (RL) and brain-computer interfaces (BCI) have\nexperienced significant growth over the past decade. With rising interest in\nhuman-in-the-loop (HITL), incorporating human input with RL algorithms has\ngiven rise to the sub-field of interactive RL. Adjacently, the field of BCI has\nlong been interested in extracting informative brain signals from neural\nactivity for use in human-computer interactions. A key link between these\nfields lies in the interpretation of neural activity as feedback such that\ninteractive RL approaches can be employed. We denote this new and emerging\nmedium of feedback as intrinsic feedback. Despite intrinsic feedback's ability\nto be conveyed automatically and even unconsciously, proper exploration\nsurrounding this key link has largely gone unaddressed by both communities.\nThus, to help facilitate a deeper understanding and a more effective\nutilization, we provide a tutorial-style review covering the motivations,\napproaches, and open problems of intrinsic feedback and its foundational\nconcepts.\n","authors":["Benjamin Poole","Minwoo Lee"],"pdf_url":"https://arxiv.org/pdf/2112.01575v3.pdf","comment":"Name change and vast rewrites of the paper"},{"id":"http://arxiv.org/abs/2308.12256v1","updated":"2023-08-23T17:16:07Z","published":"2023-08-23T17:16:07Z","title":"Learning from Negative User Feedback and Measuring Responsiveness for\n Sequential Recommenders","summary":" Sequential recommenders have been widely used in industry due to their\nstrength in modeling user preferences. While these models excel at learning a\nuser's positive interests, less attention has been paid to learning from\nnegative user feedback. Negative user feedback is an important lever of user\ncontrol, and comes with an expectation that recommenders should respond quickly\nand reduce similar recommendations to the user. However, negative feedback\nsignals are often ignored in the training objective of sequential retrieval\nmodels, which primarily aim at predicting positive user interactions. In this\nwork, we incorporate explicit and implicit negative user feedback into the\ntraining objective of sequential recommenders in the retrieval stage using a\n\"not-to-recommend\" loss function that optimizes for the log-likelihood of not\nrecommending items with negative feedback. We demonstrate the effectiveness of\nthis approach using live experiments on a large-scale industrial recommender\nsystem. Furthermore, we address a challenge in measuring recommender\nresponsiveness to negative feedback by developing a counterfactual simulation\nframework to compare recommender responses between different user actions,\nshowing improved responsiveness from the modeling change.\n","authors":["Yueqi Wang","Yoni Halpern","Shuo Chang","Jingchen Feng","Elaine Ya Le","Longfei Li","Xujian Liang","Min-Cheng Huang","Shane Li","Alex Beutel","Yaping Zhang","Shuchao Bi"],"pdf_url":"https://arxiv.org/pdf/2308.12256v1.pdf","comment":"RecSys 2023 Industry Track"},{"id":"http://arxiv.org/abs/2308.12252v1","updated":"2023-08-23T17:01:53Z","published":"2023-08-23T17:01:53Z","title":"How Safe Am I Given What I See? Calibrated Prediction of Safety Chances\n for Image-Controlled Autonomy","summary":" End-to-end learning has emerged as a major paradigm for developing autonomous\nsystems. Unfortunately, with its performance and convenience comes an even\ngreater challenge of safety assurance. A key factor of this challenge is the\nabsence of the notion of a low-dimensional and interpretable dynamical state,\naround which traditional assurance methods revolve. Focusing on the online\nsafety prediction problem, this paper proposes a configurable family of\nlearning pipelines based on generative world models, which do not require\nlow-dimensional states. To implement these pipelines, we overcome the\nchallenges of learning safety-informed latent representations and missing\nsafety labels under prediction-induced distribution shift. These pipelines come\nwith statistical calibration guarantees on their safety chance predictions\nbased on conformal prediction. We perform an extensive evaluation of the\nproposed learning pipelines on two case studies of image-controlled systems: a\nracing car and a cartpole.\n","authors":["Zhenjiang Mao","Carson Sobolewski","Ivan Ruchkin"],"pdf_url":"https://arxiv.org/pdf/2308.12252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.05621v3","updated":"2023-08-23T16:54:03Z","published":"2021-03-09T18:46:01Z","title":"The Common Intuition to Transfer Learning Can Win or Lose: Case Studies\n for Linear Regression","summary":" We study a fundamental transfer learning process from source to target linear\nregression tasks, including overparameterized settings where there are more\nlearned parameters than data samples. The target task learning is addressed by\nusing its training data together with the parameters previously computed for\nthe source task. We define a transfer learning approach to the target task as a\nlinear regression optimization with a regularization on the distance between\nthe to-be-learned target parameters and the already-learned source parameters.\nWe analytically characterize the generalization performance of our transfer\nlearning approach and demonstrate its ability to resolve the peak in\ngeneralization errors in double descent phenomena of the minimum L2-norm\nsolution to linear regression. Moreover, we show that for sufficiently related\ntasks, the optimally tuned transfer learning approach can outperform the\noptimally tuned ridge regression method, even when the true parameter vector\nconforms to an isotropic Gaussian prior distribution. Namely, we demonstrate\nthat transfer learning can beat the minimum mean square error (MMSE) solution\nof the independent target task. Our results emphasize the ability of transfer\nlearning to extend the solution space to the target task and, by that, to have\nan improved MMSE solution. We formulate the linear MMSE solution to our\ntransfer learning setting and point out its key differences from the common\ndesign philosophy to transfer learning.\n","authors":["Yehuda Dar","Daniel LeJeune","Richard G. Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2103.05621v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12247v1","updated":"2023-08-23T16:48:04Z","published":"2023-08-23T16:48:04Z","title":"How to Protect Copyright Data in Optimization of Large Language Models?","summary":" Large language models (LLMs) and generative AI have played a transformative\nrole in computer research and applications. Controversy has arisen as to\nwhether these models output copyrighted data, which can occur if the data the\nmodels are trained on is copyrighted. LLMs are built on the transformer neural\nnetwork architecture, which in turn relies on a mathematical computation called\nAttention that uses the softmax function.\n In this paper, we show that large language model training and optimization\ncan be seen as a softmax regression problem. We then establish a method of\nefficiently performing softmax regression, in a way that prevents the\nregression function from generating copyright data. This establishes a\ntheoretical method of training large language models in a way that avoids\ngenerating copyright data.\n","authors":["Timothy Chu","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.12247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03006v4","updated":"2023-08-23T16:47:25Z","published":"2021-10-06T18:25:50Z","title":"Unsupervised Selective Labeling for More Effective Semi-Supervised\n Learning","summary":" Given an unlabeled dataset and an annotation budget, we study how to\nselectively label a fixed number of instances so that semi-supervised learning\n(SSL) on such a partially labeled dataset is most effective. We focus on\nselecting the right data to label, in addition to usual SSL's propagating\nlabels from labeled data to the rest unlabeled data. This instance selection\ntask is challenging, as without any labeled data we do not know what the\nobjective of learning should be. Intuitively, no matter what the downstream\ntask is, instances to be labeled must be representative and diverse: The former\nwould facilitate label propagation to unlabeled data, whereas the latter would\nensure coverage of the entire dataset. We capture this idea by selecting\ncluster prototypes, either in a pretrained feature space, or along with feature\noptimization, both without labels. Our unsupervised selective labeling\nconsistently improves SSL methods over state-of-the-art active learning given\nlabeled data, by 8 to 25 times in label efficiency. For example, it boosts\nFixMatch by 10% (14%) in accuracy on CIFAR-10 (ImageNet-1K) with 0.08% (0.2%)\nlabeled data, demonstrating that small computation spent on selecting what data\nto label brings significant gain especially under a low annotation budget. Our\nwork sets a new standard for practical and efficient SSL.\n","authors":["Xudong Wang","Long Lian","Stella X. Yu"],"pdf_url":"https://arxiv.org/pdf/2110.03006v4.pdf","comment":"Accepted by ECCV 2022; Fixed a few typos"},{"id":"http://arxiv.org/abs/2209.01566v3","updated":"2023-08-23T16:44:27Z","published":"2022-09-04T08:35:16Z","title":"Towards Top-Down Automated Development in Limited Scopes: A\n Neuro-Symbolic Framework from Expressibles to Executables","summary":" Deep code generation is a topic of deep learning for software engineering\n(DL4SE), which adopts neural models to generate code for the intended\nfunctions. Since end-to-end neural methods lack domain knowledge and software\nhierarchy awareness, they tend to perform poorly w.r.t project-level tasks. To\nsystematically explore the potential improvements of code generation, we let it\nparticipate in the whole top-down development from \\emph{expressibles} to\n\\emph{executables}, which is possible in limited scopes. In the process, it\nbenefits from massive samples, features, and knowledge. As the foundation, we\nsuggest building a taxonomy on code data, namely code taxonomy, leveraging the\ncategorization of code information. Moreover, we introduce a three-layer\nsemantic pyramid (SP) to associate text data and code data. It identifies the\ninformation of different abstraction levels, and thus introduces the domain\nknowledge on development and reveals the hierarchy of software. Furthermore, we\npropose a semantic pyramid framework (SPF) as the approach, focusing on\nsoftware of high modularity and low complexity. SPF divides the code generation\nprocess into stages and reserves spots for potential interactions. In addition,\nwe conceived preliminary applications in software development to confirm the\nneuro-symbolic framework.\n","authors":["Jian Gu","Harald C. Gall"],"pdf_url":"https://arxiv.org/pdf/2209.01566v3.pdf","comment":"5 pages, 3 figures, 2 tables, accepted by ESEC/FSE 2023, the\n camera-ready version"},{"id":"http://arxiv.org/abs/2308.12243v1","updated":"2023-08-23T16:42:27Z","published":"2023-08-23T16:42:27Z","title":"Multi-Objective Optimization for Sparse Deep Neural Network Training","summary":" Different conflicting optimization criteria arise naturally in various Deep\nLearning scenarios. These can address different main tasks (i.e., in the\nsetting of Multi-Task Learning), but also main and secondary tasks such as loss\nminimization versus sparsity. The usual approach is a simple weighting of the\ncriteria, which formally only works in the convex setting. In this paper, we\npresent a Multi-Objective Optimization algorithm using a modified Weighted\nChebyshev scalarization for training Deep Neural Networks (DNNs) with respect\nto several tasks. By employing this scalarization technique, the algorithm can\nidentify all optimal solutions of the original problem while reducing its\ncomplexity to a sequence of single-objective problems. The simplified problems\nare then solved using an Augmented Lagrangian method, enabling the use of\npopular optimization techniques such as Adam and Stochastic Gradient Descent,\nwhile efficaciously handling constraints. Our work aims to address the\n(economical and also ecological) sustainability issue of DNN models, with a\nparticular focus on Deep Multi-Task models, which are typically designed with a\nvery large number of weights to perform equally well on multiple tasks. Through\nexperiments conducted on two Machine Learning datasets, we demonstrate the\npossibility of adaptively sparsifying the model during training without\nsignificantly impacting its performance, if we are willing to apply\ntask-specific adaptations to the network weights. Code is available at\nhttps://github.com/salomonhotegni/MDMTN.\n","authors":["S. S. Hotegni","S. Peitz","M. Berkemeier"],"pdf_url":"https://arxiv.org/pdf/2308.12243v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2206.02667v2","updated":"2023-08-23T16:18:03Z","published":"2022-06-06T15:12:56Z","title":"Emergent segmentation from participation dynamics and multi-learner\n retraining","summary":" The choice to participate in a data-driven service, often made on the basis\nof quality of that service, influences the ability of the service to learn and\nimprove. We study the participation and retraining dynamics that arise when\nboth the learners and sub-populations of users are \\emph{risk-reducing}, which\ncover a broad class of updates including gradient descent, multiplicative\nweights, etc. Suppose, for example, that individuals choose to spend their time\namongst social media platforms proportionally to how well each platform works\nfor them. Each platform also gathers data about its active users, which it uses\nto update parameters with a gradient step. For this example and for our general\nclass of dynamics, we show that the only asymptotically stable equilibria are\nsegmented, with sub-populations allocated to a single learner. Under mild\nassumptions, the utilitarian social optimum is a stable equilibrium. In\ncontrast to previous work, which shows that repeated risk minimization can\nresult in representation disparity and high overall loss for a single learner\n\\citep{hashimoto2018fairness,miller2021outside}, we find that repeated myopic\nupdates with multiple learners lead to better outcomes. We illustrate the\nphenomena via a simulated example initialized from real data.\n","authors":["Sarah Dean","Mihaela Curmei","Lillian J. Ratliff","Jamie Morgenstern","Maryam Fazel"],"pdf_url":"https://arxiv.org/pdf/2206.02667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12221v1","updated":"2023-08-23T16:01:50Z","published":"2023-08-23T16:01:50Z","title":"Critical Learning Periods Emerge Even in Deep Linear Networks","summary":" Critical learning periods are periods early in development where temporary\nsensory deficits can have a permanent effect on behavior and learned\nrepresentations. Despite the radical differences between biological and\nartificial networks, critical learning periods have been empirically observed\nin both systems. This suggests that critical periods may be fundamental to\nlearning and not an accident of biology. Yet, why exactly critical periods\nemerge in deep networks is still an open question, and in particular it is\nunclear whether the critical periods observed in both systems depend on\nparticular architectural or optimization details. To isolate the key underlying\nfactors, we focus on deep linear network models, and show that, surprisingly,\nsuch networks also display much of the behavior seen in biology and artificial\nnetworks, while being amenable to analytical treatment. We show that critical\nperiods depend on the depth of the model and structure of the data\ndistribution. We also show analytically and in simulations that the learning of\nfeatures is tied to competition between sources. Finally, we extend our\nanalysis to multi-task learning to show that pre-training on certain tasks can\ndamage the transfer performance on new tasks, and show how this depends on the\nrelationship between tasks and the duration of the pre-training stage. To the\nbest of our knowledge, our work provides the first analytically tractable model\nthat sheds light into why critical learning periods emerge in biological and\nartificial networks.\n","authors":["Michael Kleinman","Alessandro Achille","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2308.12221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12219v1","updated":"2023-08-23T16:01:12Z","published":"2023-08-23T16:01:12Z","title":"Diffusion Language Models Can Perform Many Tasks with Scaling and\n Instruction-Finetuning","summary":" The recent surge of generative AI has been fueled by the generative power of\ndiffusion probabilistic models and the scalable capabilities of large language\nmodels. Despite their potential, it remains elusive whether diffusion language\nmodels can solve general language tasks comparable to their autoregressive\ncounterparts. This paper demonstrates that scaling diffusion models w.r.t.\ndata, sizes, and tasks can effectively make them strong language learners. We\nbuild competent diffusion language models at scale by first acquiring knowledge\nfrom massive data via masked language modeling pretraining thanks to their\nintrinsic connections. We then reprogram pretrained masked language models into\ndiffusion language models via diffusive adaptation, wherein task-specific\nfinetuning and instruction finetuning are explored to unlock their versatility\nin solving general language tasks. Experiments show that scaling diffusion\nlanguage models consistently improves performance across downstream language\ntasks. We further discover that instruction finetuning can elicit zero-shot and\nfew-shot in-context learning abilities that help tackle many unseen tasks by\nfollowing natural language instructions, and show promise in advanced and\nchallenging abilities such as reasoning\n","authors":["Jiasheng Ye","Zaixiang Zheng","Yu Bao","Lihua Qian","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.12219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12215v1","updated":"2023-08-23T15:52:20Z","published":"2023-08-23T15:52:20Z","title":"The Challenges of Machine Learning for Trust and Safety: A Case Study on\n Misinformation Detection","summary":" We examine the disconnect between scholarship and practice in applying\nmachine learning to trust and safety problems, using misinformation detection\nas a case study. We systematize literature on automated detection of\nmisinformation across a corpus of 270 well-cited papers in the field. We then\nexamine subsets of papers for data and code availability, design missteps,\nreproducibility, and generalizability. We find significant shortcomings in the\nliterature that call into question claimed performance and practicality.\nDetection tasks are often meaningfully distinct from the challenges that online\nservices actually face. Datasets and model evaluation are often\nnon-representative of real-world contexts, and evaluation frequently is not\nindependent of model training. Data and code availability is poor. Models do\nnot generalize well to out-of-domain data. Based on these results, we offer\nrecommendations for evaluating machine learning applications to trust and\nsafety problems. Our aim is for future work to avoid the pitfalls that we\nidentify.\n","authors":["Madelyne Xiao","Jonathan Mayer"],"pdf_url":"https://arxiv.org/pdf/2308.12215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12212v1","updated":"2023-08-23T15:51:29Z","published":"2023-08-23T15:51:29Z","title":"Learning to Learn Financial Networks for Optimising Momentum Strategies","summary":" Network momentum provides a novel type of risk premium, which exploits the\ninterconnections among assets in a financial network to predict future returns.\nHowever, the current process of constructing financial networks relies heavily\non expensive databases and financial expertise, limiting accessibility for\nsmall-sized and academic institutions. Furthermore, the traditional approach\ntreats network construction and portfolio optimisation as separate tasks,\npotentially hindering optimal portfolio performance. To address these\nchallenges, we propose L2GMOM, an end-to-end machine learning framework that\nsimultaneously learns financial networks and optimises trading signals for\nnetwork momentum strategies. The model of L2GMOM is a neural network with a\nhighly interpretable forward propagation architecture, which is derived from\nalgorithm unrolling. The L2GMOM is flexible and can be trained with diverse\nloss functions for portfolio performance, e.g. the negative Sharpe ratio.\nBacktesting on 64 continuous future contracts demonstrates a significant\nimprovement in portfolio profitability and risk control, with a Sharpe ratio of\n1.74 across a 20-year period.\n","authors":[" Xingyue"," Pu","Stefan Zohren","Stephen Roberts","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2308.12212v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.12210v1","updated":"2023-08-23T15:50:51Z","published":"2023-08-23T15:50:51Z","title":"ULDP-FL: Federated Learning with Across Silo User-Level Differential\n Privacy","summary":" Differentially Private Federated Learning (DP-FL) has garnered attention as a\ncollaborative machine learning approach that ensures formal privacy. Most DP-FL\napproaches ensure DP at the record-level within each silo for cross-silo FL.\nHowever, a single user's data may extend across multiple silos, and the desired\nuser-level DP guarantee for such a setting remains unknown. In this study, we\npresent ULDP-FL, a novel FL framework designed to guarantee user-level DP in\ncross-silo FL where a single user's data may belong to multiple silos. Our\nproposed algorithm directly ensures user-level DP through per-user weighted\nclipping, departing from group-privacy approaches. We provide a theoretical\nanalysis of the algorithm's privacy and utility. Additionally, we enhance the\nalgorithm's utility and showcase its private implementation using cryptographic\nbuilding blocks. Empirical experiments on real-world datasets show substantial\nimprovements in our methods in privacy-utility trade-offs under user-level DP\ncompared to baseline methods. To the best of our knowledge, our work is the\nfirst FL framework that effectively provides user-level DP in the general\ncross-silo FL setting.\n","authors":["Fumiyuki Kato","Li Xiong","Shun Takagi","Yang Cao","Masatoshi Yoshikawa"],"pdf_url":"https://arxiv.org/pdf/2308.12210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12202v1","updated":"2023-08-23T15:39:42Z","published":"2023-08-23T15:39:42Z","title":"Curriculum Learning with Adam: The Devil Is in the Wrong Details","summary":" Curriculum learning (CL) posits that machine learning models -- similar to\nhumans -- may learn more efficiently from data that match their current\nlearning progress. However, CL methods are still poorly understood and, in\nparticular for natural language processing (NLP), have achieved only limited\nsuccess. In this paper, we explore why. Starting from an attempt to replicate\nand extend a number of recent curriculum methods, we find that their results\nare surprisingly brittle when applied to NLP. A deep dive into the\n(in)effectiveness of the curricula in some scenarios shows us why: when\ncurricula are employed in combination with the popular Adam optimisation\nalgorithm, they oftentimes learn to adapt to suboptimally chosen optimisation\nparameters for this algorithm. We present a number of different case studies\nwith different common hand-crafted and automated CL approaches to illustrate\nthis phenomenon, and we find that none of them outperforms optimisation with\nonly Adam with well-chosen hyperparameters. As such, our results contribute to\nunderstanding why CL methods work, but at the same time urge caution when\nclaiming positive results.\n","authors":["Lucas Weber","Jaap Jumelet","Paul Michel","Elia Bruni","Dieuwke Hupkes"],"pdf_url":"https://arxiv.org/pdf/2308.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12193v1","updated":"2023-08-23T15:31:38Z","published":"2023-08-23T15:31:38Z","title":"Self-Supervised Knowledge-Driven Deep Learning for 3D Magnetic Inversion","summary":" The magnetic inversion method is one of the non-destructive geophysical\nmethods, which aims to estimate the subsurface susceptibility distribution from\nsurface magnetic anomaly data. Recently, supervised deep learning methods have\nbeen widely utilized in lots of geophysical fields including magnetic\ninversion. However, these methods rely heavily on synthetic training data,\nwhose performance is limited since the synthetic data is not independently and\nidentically distributed with the field data. Thus, we proposed to realize\nmagnetic inversion by self-supervised deep learning. The proposed\nself-supervised knowledge-driven 3D magnetic inversion method (SSKMI) learns on\nthe target field data by a closed loop of the inversion and forward models.\nGiven that the parameters of the forward model are preset, SSKMI can optimize\nthe inversion model by minimizing the mean absolute error between observed and\nre-estimated surface magnetic anomalies. Besides, there is a knowledge-driven\nmodule in the proposed inversion model, which makes the deep learning method\nmore explicable. Meanwhile, comparative experiments demonstrate that the\nknowledge-driven module can accelerate the training of the proposed method and\nachieve better results. Since magnetic inversion is an ill-pose task, SSKMI\nproposed to constrain the inversion model by a guideline in the auxiliary loop.\nThe experimental results demonstrate that the proposed method is a reliable\nmagnetic inversion method with outstanding performance.\n","authors":["Yinshuo Li","Zhuo Jia","Wenkai Lu","Cao Song"],"pdf_url":"https://arxiv.org/pdf/2308.12193v1.pdf","comment":"11 pages, 14 figures"},{"id":"http://arxiv.org/abs/2308.12192v1","updated":"2023-08-23T15:30:44Z","published":"2023-08-23T15:30:44Z","title":"Robustness Analysis of Continuous-Depth Models with Lagrangian\n Techniques","summary":" This paper presents, in a unified fashion, deterministic as well as\nstatistical Lagrangian-verification techniques. They formally quantify the\nbehavioral robustness of any time-continuous process, formulated as a\ncontinuous-depth model. To this end, we review LRT-NG, SLR, and GoTube,\nalgorithms for constructing a tight reachtube, that is, an over-approximation\nof the set of states reachable within a given time-horizon, and provide\nguarantees for the reachtube bounds. We compare the usage of the variational\nequations, associated to the system equations, the mean value theorem, and the\nLipschitz constants, in achieving deterministic and statistical guarantees. In\nLRT-NG, the Lipschitz constant is used as a bloating factor of the initial\nperturbation, to compute the radius of an ellipsoid in an optimal metric, which\nover-approximates the set of reachable states. In SLR and GoTube, we get\nstatistical guarantees, by using the Lipschitz constants to compute local balls\naround samples. These are needed to calculate the probability of having found\nan upper bound, of the true maximum perturbation at every timestep. Our\nexperiments demonstrate the superior performance of Lagrangian techniques, when\ncompared to LRT, Flow*, and CAPD, and illustrate their use in the robustness\nanalysis of various continuous-depth models.\n","authors":["Sophie A. Neubauer","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2308.12192v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2107.08467"},{"id":"http://arxiv.org/abs/2308.12188v1","updated":"2023-08-23T15:25:17Z","published":"2023-08-23T15:25:17Z","title":"Development and external validation of a lung cancer risk estimation\n tool using gradient-boosting","summary":" Lung cancer is a significant cause of mortality worldwide, emphasizing the\nimportance of early detection for improved survival rates. In this study, we\npropose a machine learning (ML) tool trained on data from the PLCO Cancer\nScreening Trial and validated on the NLST to estimate the likelihood of lung\ncancer occurrence within five years. The study utilized two datasets, the PLCO\n(n=55,161) and NLST (n=48,595), consisting of comprehensive information on risk\nfactors, clinical measurements, and outcomes related to lung cancer. Data\npreprocessing involved removing patients who were not current or former smokers\nand those who had died of causes unrelated to lung cancer. Additionally, a\nfocus was placed on mitigating bias caused by censored data. Feature selection,\nhyper-parameter optimization, and model calibration were performed using\nXGBoost, an ensemble learning algorithm that combines gradient boosting and\ndecision trees. The ML model was trained on the pre-processed PLCO dataset and\ntested on the NLST dataset. The model incorporated features such as age,\ngender, smoking history, medical diagnoses, and family history of lung cancer.\nThe model was well-calibrated (Brier score=0.044). ROC-AUC was 82% on the PLCO\ndataset and 70% on the NLST dataset. PR-AUC was 29% and 11% respectively. When\ncompared to the USPSTF guidelines for lung cancer screening, our model provided\nthe same recall with a precision of 13.1% vs. 9.3% on the PLCO dataset and 3.2%\nvs. 3.1% on the NLST dataset. The developed ML tool provides a freely available\nweb application for estimating the likelihood of developing lung cancer within\nfive years. By utilizing risk factors and clinical data, individuals can assess\ntheir risk and make informed decisions regarding lung cancer screening. This\nresearch contributes to the efforts in early detection and prevention\nstrategies, aiming to reduce lung cancer-related mortality rates.\n","authors":["Pierre-Louis Benveniste","Julie Alberge","Lei Xing","Jean-Emmanuel Bibault"],"pdf_url":"https://arxiv.org/pdf/2308.12188v1.pdf","comment":"14 pages, 4 figures, 4 tables, 1 Github repository, see\n http://github.com/plbenveniste/LungCancerRisk"},{"id":"http://arxiv.org/abs/2210.01860v4","updated":"2023-08-23T15:22:04Z","published":"2022-10-04T19:03:47Z","title":"ProtoBandit: Efficient Prototype Selection via Multi-Armed Bandits","summary":" In this work, we propose a multi-armed bandit-based framework for identifying\na compact set of informative data instances (i.e., the prototypes) from a\nsource dataset $S$ that best represents a given target set $T$. Prototypical\nexamples of a given dataset offer interpretable insights into the underlying\ndata distribution and assist in example-based reasoning, thereby influencing\nevery sphere of human decision-making. Current state-of-the-art prototype\nselection approaches require $O(|S||T|)$ similarity comparisons between source\nand target data points, which becomes prohibitively expensive for large-scale\nsettings. We propose to mitigate this limitation by employing stochastic greedy\nsearch in the space of prototypical examples and multi-armed bandits for\nreducing the number of similarity comparisons. Our randomized algorithm,\nProtoBandit, identifies a set of $k$ prototypes incurring $O(k^3|S|)$\nsimilarity comparisons, which is independent of the size of the target set. An\ninteresting outcome of our analysis is for the $k$-medoids clustering problem\n$T = S$ setting) in which we show that our algorithm ProtoBandit approximates\nthe BUILD step solution of the partitioning around medoids (PAM) method in\n$O(k^3|S|)$ complexity. Empirically, we observe that ProtoBandit reduces the\nnumber of similarity computation calls by several orders of magnitudes\n($100-1000$ times) while obtaining solutions similar in quality to those from\nstate-of-the-art approaches.\n","authors":["Arghya Roy Chaudhuri","Pratik Jawanpuria","Bamdev Mishra"],"pdf_url":"https://arxiv.org/pdf/2210.01860v4.pdf","comment":"Erratum corrected"},{"id":"http://arxiv.org/abs/2305.01975v2","updated":"2023-08-23T15:12:01Z","published":"2023-05-03T08:41:37Z","title":"A Survey on Dataset Distillation: Approaches, Applications and Future\n Directions","summary":" Dataset distillation is attracting more attention in machine learning as\ntraining sets continue to grow and the cost of training state-of-the-art models\nbecomes increasingly high. By synthesizing datasets with high information\ndensity, dataset distillation offers a range of potential applications,\nincluding support for continual learning, neural architecture search, and\nprivacy protection. Despite recent advances, we lack a holistic understanding\nof the approaches and applications. Our survey aims to bridge this gap by first\nproposing a taxonomy of dataset distillation, characterizing existing\napproaches, and then systematically reviewing the data modalities, and related\napplications. In addition, we summarize the challenges and discuss future\ndirections for this field of research.\n","authors":["Jiahui Geng","Zongxiong Chen","Yuandou Wang","Herbert Woisetschlaeger","Sonja Schimmler","Ruben Mayer","Zhiming Zhao","Chunming Rong"],"pdf_url":"https://arxiv.org/pdf/2305.01975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12175v1","updated":"2023-08-23T14:53:38Z","published":"2023-08-23T14:53:38Z","title":"Unsupervised anomalies detection in IIoT edge devices networks using\n federated learning","summary":" In a connection of many IoT devices that each collect data, normally training\na machine learning model would involve transmitting the data to a central\nserver which requires strict privacy rules. However, some owners are reluctant\nof availing their data out of the company due to data security concerns.\nFederated learning(FL) as a distributed machine learning approach performs\ntraining of a machine learning model on the device that gathered the data\nitself. In this scenario, data is not share over the network for training\npurpose. Fedavg as one of FL algorithms permits a model to be copied to\nparticipating devices during a training session. The devices could be chosen at\nrandom, and a device can be aborted. The resulting models are sent to the\ncoordinating server and then average models from the devices that finished\ntraining. The process is repeated until a desired model accuracy is achieved.\nBy doing this, FL approach solves the privacy problem for IoT/ IIoT devices\nthat held sensitive data for the owners. In this paper, we leverage the\nbenefits of FL and implemented Fedavg algorithm on a recent dataset that\nrepresent the modern IoT/ IIoT device networks. The results were almost the\nsame as the centralized machine learning approach. We also evaluated some\nshortcomings of Fedavg such as unfairness that happens during the training when\nstruggling devices do not participate for every stage of training. This\ninefficient training of local or global model could lead in a high number of\nfalse alarms in intrusion detection systems for IoT/IIoT gadgets developed\nusing Fedavg. Hence, after evaluating the FedAv deep auto encoder with\ncentralized deep auto encoder ML, we further proposed and designed a Fair\nFedavg algorithm that will be evaluated in the future work.\n","authors":["Niyomukiza Thamar","Hossam Samy Elsaid Sharara"],"pdf_url":"https://arxiv.org/pdf/2308.12175v1.pdf","comment":"Accepted for PuBlication in machine learning journals"},{"id":"http://arxiv.org/abs/2209.11355v3","updated":"2023-08-23T14:51:47Z","published":"2022-09-23T00:35:22Z","title":"Learning Interpretable Dynamics from Images of a Freely Rotating 3D\n Rigid Body","summary":" In many real-world settings, image observations of freely rotating 3D rigid\nbodies, such as satellites, may be available when low-dimensional measurements\nare not. However, the high-dimensionality of image data precludes the use of\nclassical estimation techniques to learn the dynamics and a lack of\ninterpretability reduces the usefulness of standard deep learning methods. In\nthis work, we present a physics-informed neural network model to estimate and\npredict 3D rotational dynamics from image sequences. We achieve this using a\nmulti-stage prediction pipeline that maps individual images to a latent\nrepresentation homeomorphic to $\\mathbf{SO}(3)$, computes angular velocities\nfrom latent pairs, and predicts future latent states using the Hamiltonian\nequations of motion with a learned representation of the Hamiltonian. We\ndemonstrate the efficacy of our approach on a new rotating rigid-body dataset\nwith sequences of rotating cubes and rectangular prisms with uniform and\nnon-uniform density.\n","authors":["Justice Mason","Christine Allen-Blanchette","Nicholas Zolman","Elizabeth Davison","Naomi Leonard"],"pdf_url":"https://arxiv.org/pdf/2209.11355v3.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.07221v5","updated":"2023-08-23T14:24:51Z","published":"2023-08-14T15:47:25Z","title":"AudioFormer: Audio Transformer learns audio feature representations from\n discrete acoustic codes","summary":" We propose a method named AudioFormer,which learns audio feature\nrepresentations through the acquisition of discrete acoustic codes and\nsubsequently fine-tunes them for audio classification tasks. Initially,we\nintroduce a novel perspective by considering the audio classification task as a\nform of natural language understanding (NLU). Leveraging an existing neural\naudio codec model,we generate discrete acoustic codes and utilize them to train\na masked language model (MLM),thereby obtaining audio feature representations.\nFurthermore,we pioneer the integration of a Multi-Positive sample Contrastive\n(MPC) learning approach. This method enables the learning of joint\nrepresentations among multiple discrete acoustic codes within the same audio\ninput. In our experiments,we treat discrete acoustic codes as textual data and\ntrain a masked language model using a cloze-like methodology,ultimately\nderiving high-quality audio representations. Notably,the MPC learning technique\neffectively captures collaborative representations among distinct positive\nsamples. Our research outcomes demonstrate that AudioFormer attains\nsignificantly improved performance compared to prevailing monomodal audio\nclassification models across multiple datasets,and even outperforms\naudio-visual multimodal classification models on select datasets.\nSpecifically,our approach achieves remarkable results on datasets including\nAudioSet (2M,20K),and FSD50K,with performance scores of 53.9,45.1,and\n65.6,respectively. We have openly shared both the code and models:\nhttps://github.com/LZH-0225/AudioFormer.git.\n","authors":["Zhaohui Li","Haitao Wang","Xinghua Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.07221v5.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2303.02206v2","updated":"2023-08-23T14:23:48Z","published":"2023-03-03T20:35:38Z","title":"Domain Specific Question Answering Over Knowledge Graphs Using Logical\n Programming and Large Language Models","summary":" Answering questions over domain-specific graphs requires a tailored approach\ndue to the limited number of relations and the specific nature of the domain.\nOur approach integrates classic logical programming languages into large\nlanguage models (LLMs), enabling the utilization of logical reasoning\ncapabilities to tackle the KGQA task. By representing the questions as Prolog\nqueries, which are readable and near close to natural language in\nrepresentation, we facilitate the generation of programmatically derived\nanswers. To validate the effectiveness of our approach, we evaluate it using a\nwell-known benchmark dataset, MetaQA. Our experimental results demonstrate that\nour method achieves accurate identification of correct answer entities for all\ntest questions, even when trained on a small fraction of annotated data.\nOverall, our work presents a promising approach to addressing question\nanswering over domain-specific graphs, offering an explainable and robust\nsolution by incorporating logical programming languages.\n","authors":["Navid Madani","Rohini K. Srihari","Kenneth Joseph"],"pdf_url":"https://arxiv.org/pdf/2303.02206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12161v1","updated":"2023-08-23T14:23:26Z","published":"2023-08-23T14:23:26Z","title":"Data-driven decision-focused surrogate modeling","summary":" We introduce the concept of decision-focused surrogate modeling for solving\ncomputationally challenging nonlinear optimization problems in real-time\nsettings. The proposed data-driven framework seeks to learn a simpler, e.g.\nconvex, surrogate optimization model that is trained to minimize the decision\nprediction error, which is defined as the difference between the optimal\nsolutions of the original and the surrogate optimization models. The learning\nproblem, formulated as a bilevel program, can be viewed as a data-driven\ninverse optimization problem to which we apply a decomposition-based solution\nalgorithm from previous work. We validate our framework through numerical\nexperiments involving the optimization of common nonlinear chemical processes\nsuch as chemical reactors, heat exchanger networks, and material blending\nsystems. We also present a detailed comparison of decision-focused surrogate\nmodeling with standard data-driven surrogate modeling methods and demonstrate\nthat our approach is significantly more data-efficient while producing simple\nsurrogate models with high decision prediction accuracy.\n","authors":["Rishabh Gupta","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.12161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11702v4","updated":"2023-08-23T14:18:30Z","published":"2023-03-21T09:42:27Z","title":"On the link between generative semi-supervised learning and generative\n open-set recognition","summary":" This study investigates the relationship between semi-supervised learning\n(SSL, which is training off partially labelled datasets) and open-set\nrecognition (OSR, which is classification with simultaneous novelty detection)\nunder the context of generative adversarial networks (GANs). Although no\nprevious study has formally linked SSL and OSR, their respective methods share\nstriking similarities. Specifically, SSL-GANs and OSR-GANs require their\ngenerators to produce 'bad-looking' samples which are used to regularise their\nclassifier networks. We hypothesise that the definitions of bad-looking samples\nin SSL and OSR represents the same concept and realises the same goal. More\nformally, bad-looking samples lie in the complementary space, which is the area\nbetween and around the boundaries of the labelled categories within the\nclassifier's embedding space. By regularising a classifier with samples in the\ncomplementary space, classifiers achieve improved generalisation for SSL and\nalso generalise the open space for OSR. To test this hypothesis, we compare a\nfoundational SSL-GAN with the state-of-the-art OSR-GAN under the same SSL-OSR\nexperimental conditions. Our results find that SSL-GANs achieve near identical\nresults to OSR-GANs, proving the SSL-OSR link. Subsequently, to further this\nnew research path, we compare several SSL-GANs various SSL-OSR setups which\nthis first benchmark results. A combined framework of SSL-OSR certainly\nimproves the practicality and cost-efficiency of classifier training, and so\nfurther theoretical and application studies are also discussed.\n","authors":["Emile Reyn Engelbrecht","Johan du Preez"],"pdf_url":"https://arxiv.org/pdf/2303.11702v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12143v1","updated":"2023-08-23T14:00:58Z","published":"2023-08-23T14:00:58Z","title":"A Probabilistic Fluctuation based Membership Inference Attack for\n Generative Models","summary":" Membership Inference Attack (MIA) identifies whether a record exists in a\nmachine learning model's training set by querying the model. MIAs on the\nclassic classification models have been well-studied, and recent works have\nstarted to explore how to transplant MIA onto generative models. Our\ninvestigation indicates that existing MIAs designed for generative models\nmainly depend on the overfitting in target models. However, overfitting can be\navoided by employing various regularization techniques, whereas existing MIAs\ndemonstrate poor performance in practice. Unlike overfitting, memorization is\nessential for deep learning models to attain optimal performance, making it a\nmore prevalent phenomenon. Memorization in generative models leads to an\nincreasing trend in the probability distribution of generating records around\nthe member record. Therefore, we propose a Probabilistic Fluctuation Assessing\nMembership Inference Attack (PFAMI), a black-box MIA that infers memberships by\ndetecting these trends via analyzing the overall probabilistic fluctuations\naround given records. We conduct extensive experiments across multiple\ngenerative models and datasets, which demonstrate PFAMI can improve the attack\nsuccess rate (ASR) by about 27.9% when compared with the best baseline.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.12143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15498v2","updated":"2023-08-23T13:44:06Z","published":"2022-11-28T16:17:47Z","title":"Physics-informed neural networks with unknown measurement noise","summary":" Physics-informed neural networks (PINNs) constitute a flexible approach to\nboth finding solutions and identifying parameters of partial differential\nequations. Most works on the topic assume noiseless data, or data contaminated\nby weak Gaussian noise. We show that the standard PINN framework breaks down in\ncase of non-Gaussian noise. We give a way of resolving this fundamental issue\nand we propose to jointly train an energy-based model (EBM) to learn the\ncorrect noise distribution. We illustrate the improved performance of our\napproach using multiple examples.\n","authors":["Philipp Pilar","Niklas Wahlström"],"pdf_url":"https://arxiv.org/pdf/2211.15498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12127v1","updated":"2023-08-23T13:33:39Z","published":"2023-08-23T13:33:39Z","title":"Masking Strategies for Background Bias Removal in Computer Vision Models","summary":" Models for fine-grained image classification tasks, where the difference\nbetween some classes can be extremely subtle and the number of samples per\nclass tends to be low, are particularly prone to picking up background-related\nbiases and demand robust methods to handle potential examples with\nout-of-distribution (OOD) backgrounds. To gain deeper insights into this\ncritical problem, our research investigates the impact of background-induced\nbias on fine-grained image classification, evaluating standard backbone models\nsuch as Convolutional Neural Network (CNN) and Vision Transformers (ViT). We\nexplore two masking strategies to mitigate background-induced bias: Early\nmasking, which removes background information at the (input) image level, and\nlate masking, which selectively masks high-level spatial features corresponding\nto the background. Extensive experiments assess the behavior of CNN and ViT\nmodels under different masking strategies, with a focus on their generalization\nto OOD backgrounds. The obtained findings demonstrate that both proposed\nstrategies enhance OOD performance compared to the baseline models, with early\nmasking consistently exhibiting the best OOD performance. Notably, a ViT\nvariant employing GAP-Pooled Patch token-based classification combined with\nearly masking achieves the highest OOD robustness.\n","authors":["Ananthu Aniraj","Cassio F. Dantas","Dino Ienco","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2308.12127v1.pdf","comment":"Accepted at the 2023 IEEE/CVF International Conference on Computer\n Vision Workshop (ICCVW) on Out Of Distribution Generalization in Computer\n Vision (OOD-CV)"},{"id":"http://arxiv.org/abs/2308.12126v1","updated":"2023-08-23T13:32:31Z","published":"2023-08-23T13:32:31Z","title":"An Accelerated Block Proximal Framework with Adaptive Momentum for\n Nonconvex and Nonsmooth Optimization","summary":" We propose an accelerated block proximal linear framework with adaptive\nmomentum (ABPL$^+$) for nonconvex and nonsmooth optimization. We analyze the\npotential causes of the extrapolation step failing in some algorithms, and\nresolve this issue by enhancing the comparison process that evaluates the\ntrade-off between the proximal gradient step and the linear extrapolation step\nin our algorithm. Furthermore, we extends our algorithm to any scenario\ninvolving updating block variables with positive integers, allowing each cycle\nto randomly shuffle the update order of the variable blocks. Additionally,\nunder mild assumptions, we prove that ABPL$^+$ can monotonically decrease the\nfunction value without strictly restricting the extrapolation parameters and\nstep size, demonstrates the viability and effectiveness of updating these\nblocks in a random order, and we also more obviously and intuitively\ndemonstrate that the derivative set of the sequence generated by our algorithm\nis a critical point set. Moreover, we demonstrate the global convergence as\nwell as the linear and sublinear convergence rates of our algorithm by\nutilizing the Kurdyka-Lojasiewicz (K{\\L}) condition. To enhance the\neffectiveness and flexibility of our algorithm, we also expand the study to the\nimprecise version of our algorithm and construct an adaptive extrapolation\nparameter strategy, which improving its overall performance. We apply our\nalgorithm to multiple non-negative matrix factorization with the $\\ell_0$ norm,\nnonnegative tensor decomposition with the $\\ell_0$ norm, and perform extensive\nnumerical experiments to validate its effectiveness and efficiency.\n","authors":["Weifeng Yang","Wenwen Min"],"pdf_url":"https://arxiv.org/pdf/2308.12126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.14981v3","updated":"2023-08-23T13:25:16Z","published":"2022-06-30T02:17:11Z","title":"Randomized Coordinate Subgradient Method for Nonsmooth Composite\n Optimization","summary":" Coordinate-type subgradient methods for addressing nonsmooth optimization\nproblems are relatively underexplored due to the set-valued nature of the\nsubdifferential. In this work, our study focuses on nonsmooth composite\noptimization problems, encompassing a wide class of convex and weakly convex\n(nonconvex nonsmooth) problems. By utilizing the chain rule of the composite\nstructure properly, we introduce the Randomized Coordinate Subgradient method\n(RCS) for tackling this problem class. To the best of our knowledge, this is\nthe first coordinate subgradient method for solving general nonsmooth composite\noptimization problems. In theory, we consider the linearly bounded subgradients\nassumption for the objective function, which is more general than the\ntraditional Lipschitz continuity assumption, to account for practical\nscenarios. We then conduct convergence analysis for RCS in both convex and\nweakly convex cases based on this generalized Lipschitz-type assumption.\nSpecifically, we establish the $\\widetilde{\\mathcal{O}}$$(1/\\sqrt{k})$\nconvergence rate in expectation and the $\\tilde o(1/\\sqrt{k})$ almost sure\nasymptotic convergence rate in terms of the suboptimality gap when $f$ is\nconvex. For the case when $f$ is weakly convex and its subdifferential\nsatisfies the global metric subregularity property, we derive the\n$\\mathcal{O}(\\varepsilon^{-4})$ iteration complexity in expectation. We also\nestablish an asymptotic convergence result. To justify the global metric\nsubregularity property utilized in the analysis, we establish this error bound\ncondition for the concrete (real-valued) robust phase retrieval problem. We\nalso provide a convergence lemma and the relationship between the global metric\nsubregularity properties of a weakly convex function and its Moreau envelope.\nFinally, we conduct several experiments to demonstrate the possible superiority\nof RCS over the subgradient method.\n","authors":["Lei Zhao","Ding Chen","Daoli Zhu","Xiao Li"],"pdf_url":"https://arxiv.org/pdf/2206.14981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13579v2","updated":"2023-08-23T13:20:44Z","published":"2022-11-24T13:08:43Z","title":"Knowledge-Aware Federated Active Learning with Non-IID Data","summary":" Federated learning enables multiple decentralized clients to learn\ncollaboratively without sharing the local training data. However, the expensive\nannotation cost to acquire data labels on local clients remains an obstacle in\nutilizing local data. In this paper, we propose a federated active learning\nparadigm to efficiently learn a global model with limited annotation budget\nwhile protecting data privacy in a decentralized learning way. The main\nchallenge faced by federated active learning is the mismatch between the active\nsampling goal of the global model on the server and that of the asynchronous\nlocal clients. This becomes even more significant when data is distributed\nnon-IID across local clients. To address the aforementioned challenge, we\npropose Knowledge-Aware Federated Active Learning (KAFAL), which consists of\nKnowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory\nFederated Update (KCFU). KSAS is a novel active sampling method tailored for\nthe federated active learning problem. It deals with the mismatch challenge by\nsampling actively based on the discrepancies between local and global models.\nKSAS intensifies specialized knowledge in local clients, ensuring the sampled\ndata to be informative for both the local clients and the global model. KCFU,\nin the meantime, deals with the client heterogeneity caused by limited data and\nnon-IID data distributions. It compensates for each client's ability in weak\nclasses by the assistance of the global model. Extensive experiments and\nanalyses are conducted to show the superiority of KSAS over the\nstate-of-the-art active learning methods and the efficiency of KCFU under the\nfederated active learning framework.\n","authors":["Yu-Tong Cao","Ye Shi","Baosheng Yu","Jingya Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2211.13579v2.pdf","comment":"14 pages, 12 figures, ICCV23"},{"id":"http://arxiv.org/abs/2308.12120v1","updated":"2023-08-23T13:16:31Z","published":"2023-08-23T13:16:31Z","title":"An Open-Source ML-Based Full-Stack Optimization Framework for Machine\n Learning Accelerators","summary":" Parameterizable machine learning (ML) accelerators are the product of recent\nbreakthroughs in ML. To fully enable their design space exploration (DSE), we\npropose a physical-design-driven, learning-based prediction framework for\nhardware-accelerated deep neural network (DNN) and non-DNN ML algorithms. It\nadopts a unified approach that combines backend power, performance, and area\n(PPA) analysis with frontend performance simulation, thereby achieving a\nrealistic estimation of both backend PPA and system metrics such as runtime and\nenergy. In addition, our framework includes a fully automated DSE technique,\nwhich optimizes backend and system metrics through an automated search of\narchitectural and backend parameters. Experimental studies show that our\napproach consistently predicts backend PPA and system metrics with an average\n7% or less prediction error for the ASIC implementation of two deep learning\naccelerator platforms, VTA and VeriGOOD-ML, in both a commercial 12 nm process\nand a research-oriented 45 nm process.\n","authors":["Hadi Esmaeilzadeh","Soroush Ghodrati","Andrew B. Kahng","Joon Kyung Kim","Sean Kinzer","Sayak Kundu","Rohan Mahapatra","Susmita Dey Manasi","Sachin Sapatnekar","Zhiang Wang","Ziqing Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.12120v1.pdf","comment":"This is an extended version of our work titled \"Physically Accurate\n Learning-based Performance Prediction of Hardware-accelerated ML Algorithms\"\n published in MLCAD 2022"},{"id":"http://arxiv.org/abs/2308.12114v1","updated":"2023-08-23T13:09:03Z","published":"2023-08-23T13:09:03Z","title":"Less is More -- Towards parsimonious multi-task models using structured\n sparsity","summary":" Group sparsity in Machine Learning (ML) encourages simpler, more\ninterpretable models with fewer active parameter groups. This work aims to\nincorporate structured group sparsity into the shared parameters of a\nMulti-Task Learning (MTL) framework, to develop parsimonious models that can\neffectively address multiple tasks with fewer parameters while maintaining\ncomparable or superior performance to a dense model. Sparsifying the model\nduring training helps decrease the model's memory footprint, computation\nrequirements, and prediction time during inference. We use channel-wise l1/l2\ngroup sparsity in the shared layers of the Convolutional Neural Network (CNN).\nThis approach not only facilitates the elimination of extraneous groups\n(channels) but also imposes a penalty on the weights, thereby enhancing the\nlearning of all tasks. We compare the outcomes of single-task and multi-task\nexperiments under group sparsity on two publicly available MTL datasets, NYU-v2\nand CelebAMask-HQ. We also investigate how changing the sparsification degree\nimpacts both the performance of the model and the sparsity of groups.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.12114v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2210.14598v2","updated":"2023-08-23T13:04:05Z","published":"2022-10-26T10:12:31Z","title":"Exact Manifold Gaussian Variational Bayes","summary":" We propose an optimization algorithm for Variational Inference (VI) in\ncomplex models. Our approach relies on natural gradient updates where the\nvariational space is a Riemann manifold. We develop an efficient algorithm for\nGaussian Variational Inference that implicitly satisfies the positive definite\nconstraint on the variational covariance matrix. Our Exact manifold Gaussian\nVariational Bayes (EMGVB) provides exact but simple update rules and is\nstraightforward to implement. Due to its black-box nature, EMGVB stands as a\nready-to-use solution for VI in complex models. Over five datasets, we\nempirically validate our feasible approach on different statistical,\neconometric, and deep learning models, discussing its performance with respect\nto baseline methods.\n","authors":["Martin Magris","Mostafa Shabani","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2210.14598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12112v1","updated":"2023-08-23T13:02:52Z","published":"2023-08-23T13:02:52Z","title":"Generalized Continual Category Discovery","summary":" Most of Continual Learning (CL) methods push the limit of supervised learning\nsettings, where an agent is expected to learn new labeled tasks and not forget\nprevious knowledge. However, these settings are not well aligned with real-life\nscenarios, where a learning agent has access to a vast amount of unlabeled data\nencompassing both novel (entirely unlabeled) classes and examples from known\nclasses. Drawing inspiration from Generalized Category Discovery (GCD), we\nintroduce a novel framework that relaxes this assumption. Precisely, in any\ntask, we allow for the existence of novel and known classes, and one must use\ncontinual version of unsupervised learning methods to discover them. We call\nthis setting Generalized Continual Category Discovery (GCCD). It unifies CL and\nGCD, bridging the gap between synthetic benchmarks and real-life scenarios.\nWith a series of experiments, we present that existing methods fail to\naccumulate knowledge from subsequent tasks in which unlabeled samples of novel\nclasses are present. In light of these limitations, we propose a method that\nincorporates both supervised and unsupervised signals and mitigates the\nforgetting through the use of centroid adaptation. Our method surpasses strong\nCL methods adopted for GCD techniques and presents a superior representation\nlearning performance.\n","authors":["Daniel Marczak","Grzegorz Rypeść","Sebastian Cygert","Tomasz Trzciński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2308.12112v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12108v1","updated":"2023-08-23T12:55:41Z","published":"2023-08-23T12:55:41Z","title":"Quantifying degeneracy in singular models via the learning coefficient","summary":" Deep neural networks (DNN) are singular statistical models which exhibit\ncomplex degeneracies. In this work, we illustrate how a quantity known as the\n\\emph{learning coefficient} introduced in singular learning theory quantifies\nprecisely the degree of degeneracy in deep neural networks. Importantly, we\nwill demonstrate that degeneracy in DNN cannot be accounted for by simply\ncounting the number of \"flat\" directions. We propose a computationally scalable\napproximation of a localized version of the learning coefficient using\nstochastic gradient Langevin dynamics. To validate our approach, we demonstrate\nits accuracy in low-dimensional models with known theoretical values.\nImportantly, the local learning coefficient can correctly recover the ordering\nof degeneracy between various parameter regions of interest. An experiment on\nMNIST shows the local learning coefficient can reveal the inductive bias of\nstochastic opitmizers for more or less degenerate critical points.\n","authors":["Edmund Lau","Daniel Murfet","Susan Wei"],"pdf_url":"https://arxiv.org/pdf/2308.12108v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.12093v1","updated":"2023-08-23T12:27:55Z","published":"2023-08-23T12:27:55Z","title":"Cached Operator Reordering: A Unified View for Fast GNN Training","summary":" Graph Neural Networks (GNNs) are a powerful tool for handling structured\ngraph data and addressing tasks such as node classification, graph\nclassification, and clustering. However, the sparse nature of GNN computation\nposes new challenges for performance optimization compared to traditional deep\nneural networks. We address these challenges by providing a unified view of GNN\ncomputation, I/O, and memory. By analyzing the computational graphs of the\nGraph Convolutional Network (GCN) and Graph Attention (GAT) layers -- two\nwidely used GNN layers -- we propose alternative computation strategies. We\npresent adaptive operator reordering with caching, which achieves a speedup of\nup to 2.43x for GCN compared to the current state-of-the-art. Furthermore, an\nexploration of different caching schemes for GAT yields a speedup of up to\n1.94x. The proposed optimizations save memory, are easily implemented across\nvarious hardware platforms, and have the potential to alleviate performance\nbottlenecks in training large-scale GNN models.\n","authors":["Julia Bazinska","Andrei Ivanov","Tal Ben-Nun","Nikoli Dryden","Maciej Besta","Siyuan Shen","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.12093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10898v2","updated":"2023-08-23T12:27:50Z","published":"2023-02-09T20:46:46Z","title":"Estimating Driver Personality Traits from On-Road Driving Data","summary":" This paper focuses on the estimation of a driver's psychological\ncharacteristics using driving data for driving assistance systems. Driving\nassistance systems that support drivers by adapting individual psychological\ncharacteristics can provide appropriate feedback and prevent traffic accidents.\nAs a first step toward implementing such adaptive assistance systems, this\nresearch aims to develop a model to estimate drivers' psychological\ncharacteristics, such as cognitive function, psychological driving style, and\nworkload sensitivity, from on-road driving behavioral data using machine\nlearning and deep learning techniques. We also investigated the relationship\nbetween driving behavior and various cognitive functions, including the Trail\nMaking Test (TMT) and Useful Field of View (UFOV) test, through regression\nmodeling. The proposed method focuses on road type information and captures\nvarious durations of time-series data observed from driving behaviors. First,\nwe segment the driving time-series data into two road types, namely, arterial\nroads and intersections, to consider driving situations. Second, we further\nsegment data into many sequences of various durations. Third, statistics are\ncalculated from each sequence. Finally, these statistics are used as input\nfeatures of machine learning models to estimate psychological characteristics.\nThe experimental results show that our model can estimate a driver's cognitive\nfunction, namely, the TMT~(B) and UFOV test scores, with Pearson correlation\ncoefficients $r$ of 0.579 and 0.708, respectively. Some characteristics, such\nas psychological driving style and workload sensitivity, are estimated with\nhigh accuracy, but whether various duration segmentation improves accuracy\ndepends on the characteristics, and it is not effective for all\ncharacteristics.\n","authors":["Ryusei Kimura","Takahiro Tanaka","Yuki Yoshihara","Kazuhiro Fujikake","Hitoshi Kanamori","Shogo Okada"],"pdf_url":"https://arxiv.org/pdf/2302.10898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00009v2","updated":"2023-08-23T12:24:02Z","published":"2023-06-18T20:06:58Z","title":"Comparison of Machine Learning Methods for Assigning Software Issues to\n Team Members","summary":" Software issues contain units of work to fix, improve, or create new threads\nduring the development and facilitate communication among the team members.\nAssigning an issue to the most relevant team member and determining a category\nof an issue is a tedious and challenging task. Wrong classifications cause\ndelays and rework in the project and trouble among the team members. This paper\nproposes a set of carefully curated linguistic features for shallow machine\nlearning methods and compares the performance of shallow and ensemble methods\nwith deep language models. Unlike the state-of-the-art, we assign issues to\nfour roles (designer, developer, tester, and leader) rather than to specific\nindividuals or teams to contribute to the generality of our solution. We also\nconsider the level of experience of the developers to reflect the industrial\npractices in our solution formulation. We collect and annotate five industrial\ndata sets from one of the top three global television producers to evaluate our\nproposal and compare it with deep language models. Our data sets contain 5324\nissues in total. We show that an ensemble classifier of shallow techniques\nachieves 0.92 for issue assignment in accuracy which is statistically\ncomparable to the state-of-the-art deep language models. The contributions\ninclude the public sharing of five annotated industrial issue data sets, the\ndevelopment of a clear and comprehensive feature set, the introduction of a\nnovel label set, and the validation of the efficacy of an ensemble classifier\nof shallow machine learning techniques.\n","authors":["Büşra Tabak","Fatma Başak Aydemir"],"pdf_url":"https://arxiv.org/pdf/2307.00009v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.12423v3","updated":"2023-08-23T12:02:28Z","published":"2022-05-25T00:55:47Z","title":"Deletion and Insertion Tests in Regression Models","summary":" A basic task in explainable AI (XAI) is to identify the most important\nfeatures behind a prediction made by a black box function $f$. The insertion\nand deletion tests of Petsiuk et al. (2018) can be used to judge the quality of\nalgorithms that rank pixels from most to least important for a classification.\nMotivated by regression problems we establish a formula for their area under\nthe curve (AUC) criteria in terms of certain main effects and interactions in\nan anchored decomposition of $f$. We find an expression for the expected value\nof the AUC under a random ordering of inputs to $f$ and propose an alternative\narea above a straight line for the regression setting. We use this criterion to\ncompare feature importances computed by integrated gradients (IG) to those\ncomputed by Kernel SHAP (KS) as well as LIME, DeepLIFT, vanilla gradient and\ninput$\\times$gradient methods. KS has the best overall performance in two\ndatasets we consider but it is very expensive to compute. We find that IG is\nnearly as good as KS while being much faster. Our comparison problems include\nsome binary inputs that pose a challenge to IG because it must use values\nbetween the possible variable levels and so we consider ways to handle binary\nvariables in IG. We show that sorting variables by their Shapley value does not\nnecessarily give the optimal ordering for an insertion-deletion test. It will\nhowever do that for monotone functions of additive models, such as logistic\nregression.\n","authors":["Naofumi Hama","Masayoshi Mase","Art B. Owen"],"pdf_url":"https://arxiv.org/pdf/2205.12423v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12075v1","updated":"2023-08-23T11:48:35Z","published":"2023-08-23T11:48:35Z","title":"Stabilizing RNN Gradients through Pre-training","summary":" Numerous theories of learning suggest to prevent the gradient variance from\nexponential growth with depth or time, to stabilize and improve training.\nTypically, these analyses are conducted on feed-forward fully-connected neural\nnetworks or single-layer recurrent neural networks, given their mathematical\ntractability. In contrast, this study demonstrates that pre-training the\nnetwork to local stability can be effective whenever the architectures are too\ncomplex for an analytical initialization. Furthermore, we extend known\nstability theories to encompass a broader family of deep recurrent networks,\nrequiring minimal assumptions on data and parameter distribution, a theory that\nwe refer to as the Local Stability Condition (LSC). Our investigation reveals\nthat the classical Glorot, He, and Orthogonal initialization schemes satisfy\nthe LSC when applied to feed-forward fully-connected neural networks. However,\nanalysing deep recurrent networks, we identify a new additive source of\nexponential explosion that emerges from counting gradient paths in a\nrectangular grid in depth and time. We propose a new approach to mitigate this\nissue, that consists on giving a weight of a half to the time and depth\ncontributions to the gradient, instead of the classical weight of one. Our\nempirical results confirm that pre-training both feed-forward and recurrent\nnetworks to fulfill the LSC often results in improved final performance across\nmodels. This study contributes to the field by providing a means to stabilize\nnetworks of any complexity. Our approach can be implemented as an additional\nstep before pre-training on large augmented datasets, and as an alternative to\nfinding stable initializations analytically.\n","authors":["Luca Herranz-Celotti","Jean Rouat"],"pdf_url":"https://arxiv.org/pdf/2308.12075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12069v1","updated":"2023-08-23T11:31:50Z","published":"2023-08-23T11:31:50Z","title":"Identifying Reaction-Aware Driving Styles of Stochastic Model Predictive\n Controlled Vehicles by Inverse Reinforcement Learning","summary":" The driving style of an Autonomous Vehicle (AV) refers to how it behaves and\ninteracts with other AVs. In a multi-vehicle autonomous driving system, an AV\ncapable of identifying the driving styles of its nearby AVs can reliably\nevaluate the risk of collisions and make more reasonable driving decisions.\nHowever, there has not been a consistent definition of driving styles for an AV\nin the literature, although it is considered that the driving style is encoded\nin the AV's trajectories and can be identified using Maximum Entropy Inverse\nReinforcement Learning (ME-IRL) methods as a cost function. Nevertheless, an\nimportant indicator of the driving style, i.e., how an AV reacts to its nearby\nAVs, is not fully incorporated in the feature design of previous ME-IRL\nmethods. In this paper, we describe the driving style as a cost function of a\nseries of weighted features. We design additional novel features to capture the\nAV's reaction-aware characteristics. Then, we identify the driving styles from\nthe demonstration trajectories generated by the Stochastic Model Predictive\nControl (SMPC) using a modified ME-IRL method with our newly proposed features.\nThe proposed method is validated using MATLAB simulation and an off-the-shelf\nexperiment.\n","authors":["Ni Dang","Tao Shi","Zengjie Zhang","Wanxin Jin","Marion Leibold","Martin Buss"],"pdf_url":"https://arxiv.org/pdf/2308.12069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12067v1","updated":"2023-08-23T11:27:30Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models acquire their instruction-following\ncapabilities through a two-stage training process: pre-training on image-text\npairs and fine-tuning on supervised vision-language instruction data. Recent\nstudies have shown that large language models can achieve satisfactory results\neven with a limited amount of high-quality instruction-following data. In this\npaper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset\ncomprising only 200 examples, amounting to approximately 6% of the\ninstruction-following data used in the alignment dataset for MiniGPT-4. We\nfirst propose several metrics to access the quality of multimodal instruction\ndata. Based on these metrics, we present a simple and effective data selector\nto automatically identify and filter low-quality vision-language data. By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations (e.g., visual question answering, GPT-4 preference).\nOverall, our findings demonstrate that less but high-quality instruction tuning\ndata is efficient to enable multimodal large language models to generate better\noutput.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12066v1","updated":"2023-08-23T11:25:37Z","published":"2023-08-23T11:25:37Z","title":"Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable\n Mixture-of-Expert Inference","summary":" Large language models (LLMs) based on transformers have made significant\nstrides in recent years, the success of which is driven by scaling up their\nmodel size. Despite their high algorithmic performance, the computational and\nmemory requirements of LLMs present unprecedented challenges. To tackle the\nhigh compute requirements of LLMs, the Mixture-of-Experts (MoE) architecture\nwas introduced which is able to scale its model size without proportionally\nscaling up its computational requirements. Unfortunately, MoE's high memory\ndemands and dynamic activation of sparse experts restrict its applicability to\nreal-world problems. Previous solutions that offload MoE's memory-hungry expert\nparameters to CPU memory fall short because the latency to migrate activated\nexperts from CPU to GPU incurs high performance overhead. Our proposed\nPre-gated MoE system effectively tackles the compute and memory challenges of\nconventional MoE architectures using our algorithm-system co-design. Pre-gated\nMoE employs our novel pre-gating function which alleviates the dynamic nature\nof sparse expert activation, allowing our proposed system to address the large\nmemory footprint of MoEs while also achieving high performance. We demonstrate\nthat Pre-gated MoE is able to improve performance, reduce GPU memory\nconsumption, while also maintaining the same level of model quality. These\nfeatures allow our Pre-gated MoE system to cost-effectively deploy large-scale\nLLMs using just a single GPU with high performance.\n","authors":["Ranggi Hwang","Jianyu Wei","Shijie Cao","Changho Hwang","Xiaohu Tang","Ting Cao","Mao Yang","Minsoo Rhu"],"pdf_url":"https://arxiv.org/pdf/2308.12066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12065v1","updated":"2023-08-23T11:24:28Z","published":"2023-08-23T11:24:28Z","title":"Ensembling Uncertainty Measures to Improve Safety of Black-Box\n Classifiers","summary":" Machine Learning (ML) algorithms that perform classification may predict the\nwrong class, experiencing misclassifications. It is well-known that\nmisclassifications may have cascading effects on the encompassing system,\npossibly resulting in critical failures. This paper proposes SPROUT, a Safety\nwraPper thROugh ensembles of UncertainTy measures, which suspects\nmisclassifications by computing uncertainty measures on the inputs and outputs\nof a black-box classifier. If a misclassification is detected, SPROUT blocks\nthe propagation of the output of the classifier to the encompassing system. The\nresulting impact on safety is that SPROUT transforms erratic outputs\n(misclassifications) into data omission failures, which can be easily managed\nat the system level. SPROUT has a broad range of applications as it fits binary\nand multi-class classification, comprising image and tabular datasets. We\nexperimentally show that SPROUT always identifies a huge fraction of the\nmisclassifications of supervised classifiers, and it is able to detect all\nmisclassifications in specific cases. SPROUT implementation contains\npre-trained wrappers, it is publicly available and ready to be deployed with\nminimal effort.\n","authors":["Tommaso Zoppi","Andrea Ceccarelli","Andrea Bondavalli"],"pdf_url":"https://arxiv.org/pdf/2308.12065v1.pdf","comment":"To appear at ECAI23 in October23"},{"id":"http://arxiv.org/abs/2301.10137v2","updated":"2023-08-23T11:17:19Z","published":"2023-01-12T13:53:27Z","title":"Dirac signal processing of higher-order topological signals","summary":" Higher-order networks can sustain topological signals which are variables\nassociated not only to the nodes, but also to the links, to the triangles and\nin general to the higher dimensional simplices of simplicial complexes. These\ntopological signals can describe a large variety of real systems including\ncurrents in the ocean, synaptic currents between neurons and biological\ntransportation networks. In real scenarios topological signal data might be\nnoisy and an important task is to process these signals by improving their\nsignal to noise ratio. So far topological signals are typically processed\nindependently of each other. For instance, node signals are processed\nindependently of link signals, and algorithms that can enforce a consistent\nprocessing of topological signals across different dimensions are largely\nlacking. Here we propose Dirac signal processing, an adaptive, unsupervised\nsignal processing algorithm that learns to jointly filter topological signals\nsupported on nodes, links and triangles of simplicial complexes in a consistent\nway. The proposed Dirac signal processing algorithm is formulated in terms of\nthe discrete Dirac operator which can be interpreted as \"square root\" of a\nhigher-order Hodge Laplacian. We discuss in detail the properties of the Dirac\noperator including its spectrum and the chirality of its eigenvectors and we\nadopt this operator to formulate Dirac signal processing that can filter noisy\nsignals defined on nodes, links and triangles of simplicial complexes. We test\nour algorithms on noisy synthetic data and noisy data of drifters in the ocean\nand find that the algorithm can learn to efficiently reconstruct the true\nsignals outperforming algorithms based exclusively on the Hodge Laplacian.\n","authors":["Lucille Calmon","Michael T. Schaub","Ginestra Bianconi"],"pdf_url":"https://arxiv.org/pdf/2301.10137v2.pdf","comment":"(26 pages, 12 figures)"},{"id":"http://arxiv.org/abs/2210.13708v3","updated":"2023-08-23T11:15:21Z","published":"2022-10-11T03:11:12Z","title":"MARLlib: A Scalable and Efficient Multi-agent Reinforcement Learning\n Library","summary":" A significant challenge facing researchers in the area of multi-agent\nreinforcement learning (MARL) pertains to the identification of a library that\ncan offer fast and compatible development for multi-agent tasks and algorithm\ncombinations, while obviating the need to consider compatibility issues. In\nthis paper, we present MARLlib, a library designed to address the\naforementioned challenge by leveraging three key mechanisms: 1) a standardized\nmulti-agent environment wrapper, 2) an agent-level algorithm implementation,\nand 3) a flexible policy mapping strategy. By utilizing these mechanisms,\nMARLlib can effectively disentangle the intertwined nature of the multi-agent\ntask and the learning process of the algorithm, with the ability to\nautomatically alter the training strategy based on the current task's\nattributes. The MARLlib library's source code is publicly accessible on GitHub:\n\\url{https://github.com/Replicable-MARL/MARLlib}.\n","authors":["Siyi Hu","Yifan Zhong","Minquan Gao","Weixun Wang","Hao Dong","Xiaodan Liang","Zhihui Li","Xiaojun Chang","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2210.13708v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12061v1","updated":"2023-08-23T11:03:28Z","published":"2023-08-23T11:03:28Z","title":"HarvestNet: A Dataset for Detecting Smallholder Farming Activity Using\n Harvest Piles and Remote Sensing","summary":" Small farms contribute to a large share of the productive land in developing\ncountries. In regions such as sub-Saharan Africa, where 80% of farms are small\n(under 2 ha in size), the task of mapping smallholder cropland is an important\npart of tracking sustainability measures such as crop productivity. However,\nthe visually diverse and nuanced appearance of small farms has limited the\neffectiveness of traditional approaches to cropland mapping. Here we introduce\na new approach based on the detection of harvest piles characteristic of many\nsmallholder systems throughout the world. We present HarvestNet, a dataset for\nmapping the presence of farms in the Ethiopian regions of Tigray and Amhara\nduring 2020-2023, collected using expert knowledge and satellite images,\ntotaling 7k hand-labeled images and 2k ground collected labels. We also\nbenchmark a set of baselines including SOTA models in remote sensing with our\nbest models having around 80% classification performance on hand labelled data\nand 90%, 98% accuracy on ground truth data for Tigray, Amhara respectively. We\nalso perform a visual comparison with a widely used pre-existing coverage map\nand show that our model detects an extra 56,621 hectares of cropland in Tigray.\nWe conclude that remote sensing of harvest piles can contribute to more timely\nand accurate cropland assessments in food insecure region.\n","authors":["Jonathan Xu","Amna Elmustafa","Liya Weldegebriel","Emnet Negash","Richard Lee","Chenlin Meng","Stefano Ermon","David Lobell"],"pdf_url":"https://arxiv.org/pdf/2308.12061v1.pdf","comment":"18 pages, 22 figures"},{"id":"http://arxiv.org/abs/2308.12059v1","updated":"2023-08-23T10:59:41Z","published":"2023-08-23T10:59:41Z","title":"Manipulating Embeddings of Stable Diffusion Prompts","summary":" Generative text-to-image models such as Stable Diffusion allow users to\ngenerate images based on a textual description, the prompt. Changing the prompt\nis still the primary means for the user to change a generated image as desired.\nHowever, changing the image by reformulating the prompt remains a difficult\nprocess of trial and error, which has led to the emergence of prompt\nengineering as a new field of research. We propose and analyze methods to\nchange the embedding of a prompt directly instead of the prompt text. It allows\nfor more fine-grained and targeted control that takes into account user\nintentions. Our approach treats the generative text-to-image model as a\ncontinuous function and passes gradients between the image space and the prompt\nembedding space. By addressing different user interaction problems, we can\napply this idea in three scenarios: (1) Optimization of a metric defined in\nimage space that could measure, for example, image style. (2) Assistance of\nusers in creative tasks by enabling them to navigate the image space along a\nselection of directions of \"near\" prompt embeddings. (3) Changing the embedding\nof the prompt to include information that the user has seen in a particular\nseed but finds difficult to describe in the prompt. Our experiments demonstrate\nthe feasibility of the described methods.\n","authors":["Niklas Deckers","Julia Peters","Martin Potthast"],"pdf_url":"https://arxiv.org/pdf/2308.12059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12054v1","updated":"2023-08-23T10:51:33Z","published":"2023-08-23T10:51:33Z","title":"Sample Complexity of Robust Learning against Evasion Attacks","summary":" It is becoming increasingly important to understand the vulnerability of\nmachine learning models to adversarial attacks. One of the fundamental problems\nin adversarial machine learning is to quantify how much training data is needed\nin the presence of evasion attacks, where data is corrupted at test time. In\nthis thesis, we work with the exact-in-the-ball notion of robustness and study\nthe feasibility of adversarially robust learning from the perspective of\nlearning theory, considering sample complexity.\n We first explore the setting where the learner has access to random examples\nonly, and show that distributional assumptions are essential. We then focus on\nlearning problems with distributions on the input data that satisfy a Lipschitz\ncondition and show that robustly learning monotone conjunctions has sample\ncomplexity at least exponential in the adversary's budget (the maximum number\nof bits it can perturb on each input). However, if the adversary is restricted\nto perturbing $O(\\log n)$ bits, then one can robustly learn conjunctions and\ndecision lists w.r.t. log-Lipschitz distributions.\n We then study learning models where the learner is given more power. We first\nconsider local membership queries, where the learner can query the label of\npoints near the training sample. We show that, under the uniform distribution,\nthe exponential dependence on the adversary's budget to robustly learn\nconjunctions remains inevitable. We then introduce a local equivalence query\noracle, which returns whether the hypothesis and target concept agree in a\ngiven region around a point in the training sample, and a counterexample if it\nexists. We show that if the query radius is equal to the adversary's budget, we\ncan develop robust empirical risk minimization algorithms in the\ndistribution-free setting. We give general query complexity upper and lower\nbounds, as well as for concrete concept classes.\n","authors":["Pascale Gourdeau"],"pdf_url":"https://arxiv.org/pdf/2308.12054v1.pdf","comment":"DPhil (PhD) Thesis - University of Oxford"},{"id":"http://arxiv.org/abs/2308.12053v1","updated":"2023-08-23T10:48:28Z","published":"2023-08-23T10:48:28Z","title":"Layer-wise Feedback Propagation","summary":" In this paper, we present Layer-wise Feedback Propagation (LFP), a novel\ntraining approach for neural-network-like predictors that utilizes\nexplainability, specifically Layer-wise Relevance Propagation(LRP), to assign\nrewards to individual connections based on their respective contributions to\nsolving a given task. This differs from traditional gradient descent, which\nupdates parameters towards anestimated loss minimum. LFP distributes a reward\nsignal throughout the model without the need for gradient computations. It then\nstrengthens structures that receive positive feedback while reducingthe\ninfluence of structures that receive negative feedback. We establish the\nconvergence of LFP theoretically and empirically, and demonstrate its\neffectiveness in achieving comparable performance to gradient descent on\nvarious models and datasets. Notably, LFP overcomes certain limitations\nassociated with gradient-based methods, such as reliance on meaningful\nderivatives. We further investigate how the different LRP-rules can be extended\nto LFP, what their effects are on training, as well as potential applications,\nsuch as training models with no meaningful derivatives, e.g., step-function\nactivated Spiking Neural Networks (SNNs), or for transfer learning, to\nefficiently utilize existing knowledge.\n","authors":["Leander Weber","Jim Berend","Alexander Binder","Thomas Wiegand","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2308.12053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.11418v2","updated":"2023-08-23T10:42:09Z","published":"2022-04-25T03:32:17Z","title":"Riemannian Hamiltonian methods for min-max optimization on manifolds","summary":" In this paper, we study min-max optimization problems on Riemannian\nmanifolds. We introduce a Riemannian Hamiltonian function, minimization of\nwhich serves as a proxy for solving the original min-max problems. Under the\nRiemannian Polyak--{\\L}ojasiewicz condition on the Hamiltonian function, its\nminimizer corresponds to the desired min-max saddle point. We also provide\ncases where this condition is satisfied. For geodesic-bilinear optimization in\nparticular, solving the proxy problem leads to the correct search direction\ntowards global optimality, which becomes challenging with the min-max\nformulation. To minimize the Hamiltonian function, we propose Riemannian\nHamiltonian methods (RHM) and present their convergence analyses. We extend RHM\nto include consensus regularization and to the stochastic setting. We\nillustrate the efficacy of the proposed RHM in applications such as subspace\nrobust Wasserstein distance, robust training of neural networks, and generative\nadversarial networks.\n","authors":["Andi Han","Bamdev Mishra","Pratik Jawanpuria","Pawan Kumar","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2204.11418v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10276v2","updated":"2023-08-23T10:10:49Z","published":"2023-08-20T14:12:11Z","title":"Minimalist Traffic Prediction: Linear Layer Is All You Need","summary":" Traffic prediction is essential for the progression of Intelligent\nTransportation Systems (ITS) and the vision of smart cities. While\nSpatial-Temporal Graph Neural Networks (STGNNs) have shown promise in this\ndomain by leveraging Graph Neural Networks (GNNs) integrated with either RNNs\nor Transformers, they present challenges such as computational complexity,\ngradient issues, and resource-intensiveness. This paper addresses these\nchallenges, advocating for three main solutions: a node-embedding approach,\ntime series decomposition, and periodicity learning. We introduce STLinear, a\nminimalist model architecture designed for optimized efficiency and\nperformance. Unlike traditional STGNNs, STlinear operates fully locally,\navoiding inter-node data exchanges, and relies exclusively on linear layers,\ndrastically cutting computational demands. Our empirical studies on real-world\ndatasets confirm STLinear's prowess, matching or exceeding the accuracy of\nleading STGNNs, but with significantly reduced complexity and computation\noverhead (more than 95% reduction in MACs per epoch compared to\nstate-of-the-art STGNN baseline published in 2023). In summary, STLinear\nemerges as a potent, efficient alternative to conventional STGNNs, with\nprofound implications for the future of ITS and smart city initiatives.\n","authors":["Wenying Duan","Hong Rao","Wei Huang","Xiaoxi He"],"pdf_url":"https://arxiv.org/pdf/2308.10276v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.12044v1","updated":"2023-08-23T10:08:52Z","published":"2023-08-23T10:08:52Z","title":"A multiobjective continuation method to compute the regularization path\n of deep neural networks","summary":" Sparsity is a highly desired feature in deep neural networks (DNNs) since it\nensures numerical efficiency, improves the interpretability of models (due to\nthe smaller number of relevant features), and robustness. In machine learning\napproaches based on linear models, it is well known that there exists a\nconnecting path between the sparsest solution in terms of the $\\ell^1$ norm\n(i.e., zero weights) and the non-regularized solution, which is called the\nregularization path. Very recently, there was a first attempt to extend the\nconcept of regularization paths to DNNs by means of treating the empirical loss\nand sparsity ($\\ell^1$ norm) as two conflicting criteria and solving the\nresulting multiobjective optimization problem. However, due to the\nnon-smoothness of the $\\ell^1$ norm and the high number of parameters, this\napproach is not very efficient from a computational perspective. To overcome\nthis limitation, we present an algorithm that allows for the approximation of\nthe entire Pareto front for the above-mentioned objectives in a very efficient\nmanner. We present numerical examples using both deterministic and stochastic\ngradients. We furthermore demonstrate that knowledge of the regularization path\nallows for a well-generalizing network parametrization.\n","authors":["Augustina C. Amakor","Konstantin Sontag","Sebastian Peitz"],"pdf_url":"https://arxiv.org/pdf/2308.12044v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.12043v1","updated":"2023-08-23T10:08:10Z","published":"2023-08-23T10:08:10Z","title":"IncreLoRA: Incremental Parameter Allocation Method for\n Parameter-Efficient Fine-tuning","summary":" With the increasing size of pre-trained language models (PLMs), fine-tuning\nall the parameters in the model is not efficient, especially when there are a\nlarge number of downstream tasks, which incur significant training and storage\ncosts. Many parameter-efficient fine-tuning (PEFT) approaches have been\nproposed, among which, Low-Rank Adaptation (LoRA) is a representative approach\nthat injects trainable rank decomposition matrices into every target module.\nYet LoRA ignores the importance of parameters in different modules. To address\nthis problem, many works have been proposed to prune the parameters of LoRA.\nHowever, under limited training conditions, the upper bound of the rank of the\npruned parameter matrix is still affected by the preset values. We, therefore,\npropose IncreLoRA, an incremental parameter allocation method that adaptively\nadds trainable parameters during training based on the importance scores of\neach module. This approach is different from the pruning method as it is not\nlimited by the initial number of training parameters, and each parameter matrix\nhas a higher rank upper bound for the same training overhead. We conduct\nextensive experiments on GLUE to demonstrate the effectiveness of IncreLoRA.\nThe results show that our method owns higher parameter efficiency, especially\nwhen under the low-resource settings where our method significantly outperforms\nthe baselines. Our code is publicly available.\n","authors":["Feiyu Zhang","Liangzhi Li","Junhao Chen","Zhouqiang Jiang","Bowen Wang","Yiming Qian"],"pdf_url":"https://arxiv.org/pdf/2308.12043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15782v3","updated":"2023-08-23T10:02:15Z","published":"2023-06-27T20:09:56Z","title":"UTRNet: High-Resolution Urdu Text Recognition In Printed Documents","summary":" In this paper, we propose a novel approach to address the challenges of\nprinted Urdu text recognition using high-resolution, multi-scale semantic\nfeature extraction. Our proposed UTRNet architecture, a hybrid CNN-RNN model,\ndemonstrates state-of-the-art performance on benchmark datasets. To address the\nlimitations of previous works, which struggle to generalize to the intricacies\nof the Urdu script and the lack of sufficient annotated real-world data, we\nhave introduced the UTRSet-Real, a large-scale annotated real-world dataset\ncomprising over 11,000 lines and UTRSet-Synth, a synthetic dataset with 20,000\nlines closely resembling real-world and made corrections to the ground truth of\nthe existing IIITH dataset, making it a more reliable resource for future\nresearch. We also provide UrduDoc, a benchmark dataset for Urdu text line\ndetection in scanned documents. Additionally, we have developed an online tool\nfor end-to-end Urdu OCR from printed documents by integrating UTRNet with a\ntext detection model. Our work not only addresses the current limitations of\nUrdu OCR but also paves the way for future research in this area and\nfacilitates the continued advancement of Urdu OCR technology. The project page\nwith source code, datasets, annotations, trained models, and online tool is\navailable at abdur75648.github.io/UTRNet.\n","authors":["Abdur Rahman","Arjun Ghosh","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2306.15782v3.pdf","comment":"Accepted at The 17th International Conference on Document Analysis\n and Recognition (ICDAR 2023)"},{"id":"http://arxiv.org/abs/2308.12031v1","updated":"2023-08-23T09:44:12Z","published":"2023-08-23T09:44:12Z","title":"CACTUS: a Comprehensive Abstraction and Classification Tool for\n Uncovering Structures","summary":" The availability of large data sets is providing an impetus for driving\ncurrent artificial intelligent developments. There are, however, challenges for\ndeveloping solutions with small data sets due to practical and cost-effective\ndeployment and the opacity of deep learning models. The Comprehensive\nAbstraction and Classification Tool for Uncovering Structures called CACTUS is\npresented for improved secure analytics by effectively employing explainable\nartificial intelligence. It provides additional support for categorical\nattributes, preserving their original meaning, optimising memory usage, and\nspeeding up the computation through parallelisation. It shows to the user the\nfrequency of the attributes in each class and ranks them by their\ndiscriminative power. Its performance is assessed by application to the\nWisconsin diagnostic breast cancer and Thyroid0387 data sets.\n","authors":["Luca Gherardini","Varun Ravi Varma","Karol Capala","Roger Woods","Jose Sousa"],"pdf_url":"https://arxiv.org/pdf/2308.12031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12030v1","updated":"2023-08-23T09:43:10Z","published":"2023-08-23T09:43:10Z","title":"Prompt-Based Length Controlled Generation with Reinforcement Learning","summary":" Recently, large language models (LLMs) like ChatGPT and GPT-4 have attracted\ngreat attention given their surprising improvement and performance. Length\ncontrolled generation of LLMs emerges as an important topic, which also enables\nusers to fully leverage the capability of LLMs in more real-world scenarios\nlike generating a proper answer or essay of a desired length. In addition, the\nautoregressive generation in LLMs is extremely time-consuming, while the\nability of controlling this generated length can arbitrarily reduce the\ninference cost by limiting the length, and thus satisfy different needs.\nTherefore, we aim to propose a prompt-based length control method to achieve\nthis length controlled generation, which can also be widely applied in\nGPT-style LLMs. In particular, we adopt reinforcement learning with the reward\nsignal given by either trainable or rule-based reward model, which further\naffects the generation of LLMs via rewarding a pre-defined target length.\nExperiments show that our method significantly improves the accuracy of\nprompt-based length control for summarization task on popular datasets like\nCNNDM and NYT. We believe this length-controllable ability can provide more\npotentials towards the era of LLMs.\n","authors":["Renlong Jie","Xiaojun Meng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12029v1","updated":"2023-08-23T09:41:28Z","published":"2023-08-23T09:41:28Z","title":"A Scale-Invariant Task Balancing Approach for Multi-Task Learning","summary":" Multi-task learning (MTL), a learning paradigm to learn multiple related\ntasks simultaneously, has achieved great success in various fields. However,\ntask-balancing remains a significant challenge in MTL, with the disparity in\nloss/gradient scales often leading to performance compromises. In this paper,\nwe propose a Scale-Invariant Multi-Task Learning (SI-MTL) method to alleviate\nthe task-balancing problem from both loss and gradient perspectives.\nSpecifically, SI-MTL contains a logarithm transformation which is performed on\nall task losses to ensure scale-invariant at the loss level, and a gradient\nbalancing method, SI-G, which normalizes all task gradients to the same\nmagnitude as the maximum gradient norm. Extensive experiments conducted on\nseveral benchmark datasets consistently demonstrate the effectiveness of SI-G\nand the state-of-the-art performance of SI-MTL.\n","authors":["Baijiong Lin","Weisen Jiang","Feiyang Ye","Yu Zhang","Pengguang Chen","Ying-Cong Chen","Shu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12029v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.07758v3","updated":"2023-08-23T09:35:33Z","published":"2023-08-15T13:19:59Z","title":"Forward-Backward Reasoning in Large Language Models for Verification","summary":" Chain-of-Though (CoT) prompting has shown promising performance in various\nreasoning tasks. Recently, Self-Consistency \\citep{wang2023selfconsistency}\nproposes to sample a diverse set of reasoning chains which may lead to\ndifferent answers while the answer that receives the most votes is selected. In\nthis paper, we propose a novel method to use backward reasoning in verifying\ncandidate answers. We mask a token in the question by ${\\bf x}$ and ask the LLM\nto predict the masked token when a candidate answer is provided by \\textit{a\nsimple template}, i.e., \"\\textit{\\textbf{If we know the answer of the above\nquestion is \\{a candidate answer\\}, what is the value of unknown variable ${\\bf\nx}$?}}\" Intuitively, the LLM is expected to predict the masked token\nsuccessfully if the provided candidate answer is correct. We further propose\nFOBAR to combine forward and backward reasoning for estimating the probability\nof candidate answers. We conduct extensive experiments on six data sets and\nthree LLMs. Experimental results demonstrate that FOBAR achieves\nstate-of-the-art performance on various reasoning benchmarks.\n","authors":["Weisen Jiang","Han Shi","Longhui Yu","Zhengying Liu","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2308.07758v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2302.13991v2","updated":"2023-08-23T09:27:19Z","published":"2023-02-27T17:30:00Z","title":"Learning to Generalize towards Unseen Domains via a Content-Aware Style\n Invariant Model for Disease Detection from Chest X-rays","summary":" Performance degradation due to source domain mismatch is a longstanding\nchallenge in deep learning-based medical image analysis, particularly for chest\nX-rays (CXRs). Several methods (e.g., adversarial training, multi-domain\nmixups) have been proposed to extract domain-invariant high-level features to\naddress this domain shift. However, these methods do not explicitly regularize\nthe content and style characteristics of the extracted domain-invariant\nfeatures. Recent studies have demonstrated that CNN models exhibit a strong\nbias toward styles (e.g., uninformative textures) rather than content (e.g.,\nshape), in stark contrast to the human-vision system. Radiologists tend to\nlearn visual cues from CXRs and thus perform well across multiple domains.\nTherefore, in medical imaging for pathology diagnosis from CXR images, models\nshould extract domain-invariant features that are style-invariant and\ncontent-biased. Motivated by this, we employ the novel style randomization\nmodules (SRMs) at both image and feature levels that work together\nhierarchically to create rich style perturbed features on the fly while keeping\nthe content intact. In addition, we leverage consistency regularizations\nbetween global semantic features and predicted probability distributions,\nrespectively, for with and without style perturbed versions of the same CXR\nimage to tweak the model's sensitivity toward content markers for accurate\npredictions. Extensive experiments with three large-scale thoracic disease\ndatasets, i.e., CheXpert, MIMIC-CXR, and BRAX, demonstrate that our proposed\nframework is more robust in the presence of domain shift and achieves\nstate-of-the-art performance.\n","authors":["Mohammad Zunaed","Md. Aynal Haque","Taufiq Hasan"],"pdf_url":"https://arxiv.org/pdf/2302.13991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12018v1","updated":"2023-08-23T09:20:41Z","published":"2023-08-23T09:20:41Z","title":"Bias-Aware Minimisation: Understanding and Mitigating Estimator Bias in\n Private SGD","summary":" Differentially private SGD (DP-SGD) holds the promise of enabling the safe\nand responsible application of machine learning to sensitive datasets. However,\nDP-SGD only provides a biased, noisy estimate of a mini-batch gradient. This\nrenders optimisation steps less effective and limits model utility as a result.\nWith this work, we show a connection between per-sample gradient norms and the\nestimation bias of the private gradient oracle used in DP-SGD. Here, we propose\nBias-Aware Minimisation (BAM) that allows for the provable reduction of private\ngradient estimator bias. We show how to efficiently compute quantities needed\nfor BAM to scale to large neural networks and highlight similarities to closely\nrelated methods such as Sharpness-Aware Minimisation. Finally, we provide\nempirical evidence that BAM not only reduces bias but also substantially\nimproves privacy-utility trade-offs on the CIFAR-10, CIFAR-100, and ImageNet-32\ndatasets.\n","authors":["Moritz Knolle","Robert Dorfman","Alexander Ziller","Daniel Rueckert","Georgios Kaissis"],"pdf_url":"https://arxiv.org/pdf/2308.12018v1.pdf","comment":"Accepted to the 2023 Theory and Practice of Differential Privacy\n (TPDP) Workshop"},{"id":"http://arxiv.org/abs/2308.12016v1","updated":"2023-08-23T09:18:41Z","published":"2023-08-23T09:18:41Z","title":"MKL-$L_{0/1}$-SVM","summary":" This paper presents a Multiple Kernel Learning (abbreviated as MKL) framework\nfor the Support Vector Machine (SVM) with the $(0, 1)$ loss function. Some\nfirst-order optimality conditions are given and then exploited to develop a\nfast ADMM solver to deal with the nonconvex and nonsmooth optimization problem.\nExtensive numerical experiments on synthetic and real datasets show that the\nperformance of our MKL-$L_{0/1}$-SVM is comparable with the one of the leading\napproaches called SimpleMKL developed by Rakotomamonjy, Bach, Canu, and\nGrandvalet [Journal of Machine Learning Research, vol. 9, pp. 2491-2521, 2008].\n","authors":["Bin Zhu","Yijie Shi"],"pdf_url":"https://arxiv.org/pdf/2308.12016v1.pdf","comment":"25 pages in the JMLR template, 4 figures, and 2 tables. arXiv admin\n note: substantial text overlap with arXiv:2303.04445"},{"id":"http://arxiv.org/abs/2308.12013v1","updated":"2023-08-23T09:09:32Z","published":"2023-08-23T09:09:32Z","title":"Quantum-Noise-driven Generative Diffusion Models","summary":" Generative models realized with machine learning techniques are powerful\ntools to infer complex and unknown data distributions from a finite number of\ntraining samples in order to produce new synthetic data. Diffusion models are\nan emerging framework that have recently overcome the performance of the\ngenerative adversarial networks in creating synthetic text and high-quality\nimages. Here, we propose and discuss the quantum generalization of diffusion\nmodels, i.e., three quantum-noise-driven generative diffusion models that could\nbe experimentally tested on real quantum systems. The idea is to harness unique\nquantum features, in particular the non-trivial interplay among coherence,\nentanglement and noise that the currently available noisy quantum processors do\nunavoidably suffer from, in order to overcome the main computational burdens of\nclassical diffusion models during inference. Hence, we suggest to exploit\nquantum noise not as an issue to be detected and solved but instead as a very\nremarkably beneficial key ingredient to generate much more complex probability\ndistributions that would be difficult or even impossible to express\nclassically, and from which a quantum processor might sample more efficiently\nthan a classical one. Therefore, our results are expected to pave the way for\nnew quantum-inspired or quantum-based generative diffusion algorithms\naddressing more powerfully classical tasks as data generation/prediction with\nwidespread real-world applications ranging from climate forecasting to\nneuroscience, from traffic flow analysis to financial forecasting.\n","authors":["Marco Parigi","Stefano Martina","Filippo Caruso"],"pdf_url":"https://arxiv.org/pdf/2308.12013v1.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2301.04785v3","updated":"2023-08-23T09:06:18Z","published":"2023-01-12T02:25:22Z","title":"Phase-shifted Adversarial Training","summary":" Adversarial training has been considered an imperative component for safely\ndeploying neural network-based applications to the real world. To achieve\nstronger robustness, existing methods primarily focus on how to generate strong\nattacks by increasing the number of update steps, regularizing the models with\nthe smoothed loss function, and injecting the randomness into the attack.\nInstead, we analyze the behavior of adversarial training through the lens of\nresponse frequency. We empirically discover that adversarial training causes\nneural networks to have low convergence to high-frequency information,\nresulting in highly oscillated predictions near each data. To learn\nhigh-frequency contents efficiently and effectively, we first prove that a\nuniversal phenomenon of frequency principle, i.e., \\textit{lower frequencies\nare learned first}, still holds in adversarial training. Based on that, we\npropose phase-shifted adversarial training (PhaseAT) in which the model learns\nhigh-frequency components by shifting these frequencies to the low-frequency\nrange where the fast convergence occurs. For evaluations, we conduct the\nexperiments on CIFAR-10 and ImageNet with the adaptive attack carefully\ndesigned for reliable evaluation. Comprehensive results show that PhaseAT\nsignificantly improves the convergence for high-frequency information. This\nresults in improved adversarial robustness by enabling the model to have\nsmoothed predictions near each data.\n","authors":["Yeachan Kim","Seongyeon Kim","Ihyeok Seo","Bonggun Shin"],"pdf_url":"https://arxiv.org/pdf/2301.04785v3.pdf","comment":"Conference on Uncertainty in Artificial Intelligence, 2023 (UAI 2023)"},{"id":"http://arxiv.org/abs/2301.05763v3","updated":"2023-08-23T08:57:56Z","published":"2023-01-13T21:24:23Z","title":"A Rigorous Uncertainty-Aware Quantification Framework Is Essential for\n Reproducible and Replicable Machine Learning Workflows","summary":" The ability to replicate predictions by machine learning (ML) or artificial\nintelligence (AI) models and results in scientific workflows that incorporate\nsuch ML/AI predictions is driven by numerous factors. An uncertainty-aware\nmetric that can quantitatively assess the reproducibility of quantities of\ninterest (QoI) would contribute to the trustworthiness of results obtained from\nscientific workflows involving ML/AI models. In this article, we discuss how\nuncertainty quantification (UQ) in a Bayesian paradigm can provide a general\nand rigorous framework for quantifying reproducibility for complex scientific\nworkflows. Such as framework has the potential to fill a critical gap that\ncurrently exists in ML/AI for scientific workflows, as it will enable\nresearchers to determine the impact of ML/AI model prediction variability on\nthe predictive outcomes of ML/AI-powered workflows. We expect that the\nenvisioned framework will contribute to the design of more reproducible and\ntrustworthy workflows for diverse scientific applications, and ultimately,\naccelerate scientific discoveries.\n","authors":["Line Pouchard","Kristofer G. Reyes","Francis J. Alexander","Byung-Jun Yoon"],"pdf_url":"https://arxiv.org/pdf/2301.05763v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10522v3","updated":"2023-08-23T08:49:54Z","published":"2023-08-21T07:19:47Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning aims to capture comprehensive information\nfrom multiple views of a shared context. Recent works intuitively apply\ncontrastive learning to different views in a pairwise manner, which is still\nscalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works study the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the perspective of information theory and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided hierarchical Progressive\nMulti-view Coding (IPMC). In the distribution-tier, IPMC aligns the\ndistribution between views to reduce view-specific noise. In the set-tier, IPMC\nconstructs self-adjusted contrasting pools, which are adaptively modified by a\nview filter. Lastly, in the instance-tier, we adopt a designed unified loss to\nlearn representations and reduce the gradient interference. Theoretically and\nempirically, we demonstrate the superiority of IPMC over state-of-the-art\nmethods.\n","authors":["Jiangmeng Li","Hang Gao","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10522v3.pdf","comment":"This paper is accepted by the jourcal of Neural Networks (Elsevier)\n by 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344"},{"id":"http://arxiv.org/abs/2308.12002v1","updated":"2023-08-23T08:41:24Z","published":"2023-08-23T08:41:24Z","title":"Neural oscillators for magnetic hysteresis modeling","summary":" Hysteresis is a ubiquitous phenomenon in science and engineering; its\nmodeling and identification are crucial for understanding and optimizing the\nbehavior of various systems. We develop an ordinary differential equation-based\nrecurrent neural network (RNN) approach to model and quantify the hysteresis,\nwhich manifests itself in sequentiality and history-dependence. Our neural\noscillator, HystRNN, draws inspiration from coupled-oscillatory RNN and\nphenomenological hysteresis models to update the hidden states. The performance\nof HystRNN is evaluated to predict generalized scenarios, involving first-order\nreversal curves and minor loops. The findings show the ability of HystRNN to\ngeneralize its behavior to previously untrained regions, an essential feature\nthat hysteresis models must have. This research highlights the advantage of\nneural oscillators over the traditional RNN-based methods in capturing complex\nhysteresis patterns in magnetic materials, where traditional rate-dependent\nmethods are inadequate to capture intrinsic nonlinearity.\n","authors":["Abhishek Chandra","Taniya Kapoor","Bram Daniels","Mitrofan Curti","Koen Tiels","Daniel M. Tartakovsky","Elena A. Lomonova"],"pdf_url":"https://arxiv.org/pdf/2308.12002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12000v1","updated":"2023-08-23T08:38:53Z","published":"2023-08-23T08:38:53Z","title":"On Uniformly Optimal Algorithms for Best Arm Identification in Two-Armed\n Bandits with Fixed Budget","summary":" We study the problem of best-arm identification with fixed budget in\nstochastic two-arm bandits with Bernoulli rewards. We prove that surprisingly,\nthere is no algorithm that (i) performs as well as the algorithm sampling each\narm equally (this algorithm is referred to as the {\\it uniform sampling}\nalgorithm) on all instances, and that (ii) strictly outperforms this algorithm\non at least one instance. In short, there is no algorithm better than the\nuniform sampling algorithm. Towards this result, we introduce the natural class\nof {\\it consistent} and {\\it stable} algorithms, and show that any algorithm\nthat performs as well as the uniform sampling algorithm on all instances\nbelongs to this class. The proof is completed by deriving a lower bound on the\nerror rate satisfied by any consistent and stable algorithm, and by showing\nthat the uniform sampling algorithm matches this lower bound. Our results\nprovide a solution to the two open problems presented in \\cite{qin2022open}.\n","authors":["Po-An Wang","Kaito Ariu","Alexandre Proutiere"],"pdf_url":"https://arxiv.org/pdf/2308.12000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11991v1","updated":"2023-08-23T08:25:33Z","published":"2023-08-23T08:25:33Z","title":"Relational Concept Based Models","summary":" The design of interpretable deep learning models working in relational\ndomains poses an open challenge: interpretable deep learning methods, such as\nConcept-Based Models (CBMs), are not designed to solve relational problems,\nwhile relational models are not as interpretable as CBMs. To address this\nproblem, we propose Relational Concept-Based Models, a family of relational\ndeep learning methods providing interpretable task predictions. Our\nexperiments, ranging from image classification to link prediction in knowledge\ngraphs, show that relational CBMs (i) match generalization performance of\nexisting relational black-boxes (as opposed to non-relational CBMs), (ii)\nsupport the generation of quantified concept-based explanations, (iii)\neffectively respond to test-time interventions, and (iv) withstand demanding\nsettings including out-of-distribution scenarios, limited training data\nregimes, and scarce concept supervisions.\n","authors":["Pietro Barbiero","Francesco Giannini","Gabriele Ciravegna","Michelangelo Diligenti","Giuseppe Marra"],"pdf_url":"https://arxiv.org/pdf/2308.11991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11406v2","updated":"2023-08-23T08:00:04Z","published":"2023-08-22T12:53:09Z","title":"Designing an attack-defense game: how to increase robustness of\n financial transaction models via a competition","summary":" Given the escalating risks of malicious attacks in the finance sector and the\nconsequential severe damage, a thorough understanding of adversarial strategies\nand robust defense mechanisms for machine learning models is critical. The\nthreat becomes even more severe with the increased adoption in banks more\naccurate, but potentially fragile neural networks. We aim to investigate the\ncurrent state and dynamics of adversarial attacks and defenses for neural\nnetwork models that use sequential financial data as the input.\n To achieve this goal, we have designed a competition that allows realistic\nand detailed investigation of problems in modern financial transaction data.\nThe participants compete directly against each other, so possible attacks and\ndefenses are examined in close-to-real-life conditions. Our main contributions\nare the analysis of the competition dynamics that answers the questions on how\nimportant it is to conceal a model from malicious users, how long does it take\nto break it, and what techniques one should use to make it more robust, and\nintroduction additional way to attack models or increase their robustness.\n Our analysis continues with a meta-study on the used approaches with their\npower, numerical experiments, and accompanied ablations studies. We show that\nthe developed attacks and defenses outperform existing alternatives from the\nliterature while being practical in terms of execution, proving the validity of\nthe competition as a tool for uncovering vulnerabilities of machine learning\nmodels and mitigating them in various domains.\n","authors":["Alexey Zaytsev","Alex Natekin","Evgeni Vorsin","Valerii Smirnov","Georgii Smirnov","Oleg Sidorshin","Alexander Senin","Alexander Dudin","Dmitry Berestnev"],"pdf_url":"https://arxiv.org/pdf/2308.11406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11978v1","updated":"2023-08-23T07:57:45Z","published":"2023-08-23T07:57:45Z","title":"Will More Expressive Graph Neural Networks do Better on Generative\n Tasks?","summary":" Graph generation poses a significant challenge as it involves predicting a\ncomplete graph with multiple nodes and edges based on simply a given label.\nThis task also carries fundamental importance to numerous real-world\napplications, including de-novo drug and molecular design. In recent years,\nseveral successful methods have emerged in the field of graph generation.\nHowever, these approaches suffer from two significant shortcomings: (1) the\nunderlying Graph Neural Network (GNN) architectures used in these methods are\noften underexplored; and (2) these methods are often evaluated on only a\nlimited number of metrics. To fill this gap, we investigate the expressiveness\nof GNNs under the context of the molecular graph generation task, by replacing\nthe underlying GNNs of graph generative models with more expressive GNNs.\nSpecifically, we analyse the performance of six GNNs in two different\ngenerative frameworks (GCPN and GraphAF), on six different molecular generative\nobjectives on the ZINC-250k dataset. Through our extensive experiments, we\ndemonstrate that advanced GNNs can indeed improve the performance of GCPN and\nGraphAF on molecular generation tasks, but GNN expressiveness is not a\nnecessary condition for a good GNN-based generative model. Moreover, we show\nthat GCPN and GraphAF with advanced GNNs can achieve state-of-the-art results\nacross 17 other non-GNN-based graph generative approaches, such as variational\nautoencoders and Bayesian optimisation models, on the proposed molecular\ngenerative objectives (DRD2, Median1, Median2), which are important metrics for\nde-novo molecular design.\n","authors":["Xiandong Zou","Xiangyu Zhao","Pietro Liò","Yiren Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11975v1","updated":"2023-08-23T07:50:43Z","published":"2023-08-23T07:50:43Z","title":"Approximating Score-based Explanation Techniques Using Conformal\n Regression","summary":" Score-based explainable machine-learning techniques are often used to\nunderstand the logic behind black-box models. However, such explanation\ntechniques are often computationally expensive, which limits their application\nin time-critical contexts. Therefore, we propose and investigate the use of\ncomputationally less costly regression models for approximating the output of\nscore-based explanation techniques, such as SHAP. Moreover, validity guarantees\nfor the approximated values are provided by the employed inductive conformal\nprediction framework. We propose several non-conformity measures designed to\ntake the difficulty of approximating the explanations into account while\nkeeping the computational cost low. We present results from a large-scale\nempirical investigation, in which the approximate explanations generated by our\nproposed models are evaluated with respect to efficiency (interval size). The\nresults indicate that the proposed method can significantly improve execution\ntime compared to the fast version of SHAP, TreeSHAP. The results also suggest\nthat the proposed method can produce tight intervals, while providing validity\nguarantees. Moreover, the proposed approach allows for comparing explanations\nof different approximation methods and selecting a method based on how\ninformative (tight) are the predicted intervals.\n","authors":["Amr Alkhatib","Henrik Boström","Sofiane Ennadir","Ulf Johansson"],"pdf_url":"https://arxiv.org/pdf/2308.11975v1.pdf","comment":"20 pages, 14 figures, The 12th Symposium on Conformal and\n Probabilistic Prediction with Applications (COPA 2023)"},{"id":"http://arxiv.org/abs/2306.01792v3","updated":"2023-08-23T07:43:03Z","published":"2023-06-01T08:10:03Z","title":"Task Relation-aware Continual User Representation Learning","summary":" User modeling, which learns to represent users into a low-dimensional\nrepresentation space based on their past behaviors, got a surge of interest\nfrom the industry for providing personalized services to users. Previous\nefforts in user modeling mainly focus on learning a task-specific user\nrepresentation that is designed for a single task. However, since learning\ntask-specific user representations for every task is infeasible, recent studies\nintroduce the concept of universal user representation, which is a more\ngeneralized representation of a user that is relevant to a variety of tasks.\nDespite their effectiveness, existing approaches for learning universal user\nrepresentations are impractical in real-world applications due to the data\nrequirement, catastrophic forgetting and the limited learning capability for\ncontinually added tasks. In this paper, we propose a novel continual user\nrepresentation learning method, called TERACON, whose learning capability is\nnot limited as the number of learned tasks increases while capturing the\nrelationship between the tasks. The main idea is to introduce an embedding for\neach task, i.e., task embedding, which is utilized to generate task-specific\nsoft masks that not only allow the entire model parameters to be updated until\nthe end of training sequence, but also facilitate the relationship between the\ntasks to be captured. Moreover, we introduce a novel knowledge retention module\nwith pseudo-labeling strategy that successfully alleviates the long-standing\nproblem of continual learning, i.e., catastrophic forgetting. Extensive\nexperiments on public and proprietary real-world datasets demonstrate the\nsuperiority and practicality of TERACON. Our code is available at\nhttps://github.com/Sein-Kim/TERACON.\n","authors":["Sein Kim","Namkyeong Lee","Donghyun Kim","Minchul Yang","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2306.01792v3.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2308.11971v1","updated":"2023-08-23T07:36:30Z","published":"2023-08-23T07:36:30Z","title":"EVE: Efficient Vision-Language Pre-training with Masked Prediction and\n Modality-Aware MoE","summary":" Building scalable vision-language models to learn from diverse, multimodal\ndata remains an open challenge. In this paper, we introduce an Efficient\nVision-languagE foundation model, namely EVE, which is one unified multimodal\nTransformer pre-trained solely by one unified pre-training task. Specifically,\nEVE encodes both vision and language within a shared Transformer network\nintegrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which\ncapture modality-specific information by selectively switching to different\nexperts. To unify pre-training tasks of vision and language, EVE performs\nmasked signal modeling on image-text pairs to reconstruct masked signals, i.e.,\nimage pixels and text tokens, given visible signals. This simple yet effective\npre-training objective accelerates training by 3.5x compared to the model\npre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing\nto the combination of the unified architecture and pre-training task, EVE is\neasy to scale up, enabling better downstream performance with fewer resources\nand faster training speed. Despite its simplicity, EVE achieves\nstate-of-the-art performance on various vision-language downstream tasks,\nincluding visual question answering, visual reasoning, and image-text\nretrieval.\n","authors":["Junyi Chen","Longteng Guo","Jia Sun","Shuai Shao","Zehuan Yuan","Liang Lin","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11969v1","updated":"2023-08-23T07:30:16Z","published":"2023-08-23T07:30:16Z","title":"Anisotropic Hybrid Networks for liver tumor segmentation with\n uncertainty quantification","summary":" The burden of liver tumors is important, ranking as the fourth leading cause\nof cancer mortality. In case of hepatocellular carcinoma (HCC), the delineation\nof liver and tumor on contrast-enhanced magnetic resonance imaging (CE-MRI) is\nperformed to guide the treatment strategy. As this task is time-consuming,\nneeds high expertise and could be subject to inter-observer variability there\nis a strong need for automatic tools. However, challenges arise from the lack\nof available training data, as well as the high variability in terms of image\nresolution and MRI sequence. In this work we propose to compare two different\npipelines based on anisotropic models to obtain the segmentation of the liver\nand tumors. The first pipeline corresponds to a baseline multi-class model that\nperforms the simultaneous segmentation of the liver and tumor classes. In the\nsecond approach, we train two distinct binary models, one segmenting the liver\nonly and the other the tumors. Our results show that both pipelines exhibit\ndifferent strengths and weaknesses. Moreover we propose an uncertainty\nquantification strategy allowing the identification of potential false positive\ntumor lesions. Both solutions were submitted to the MICCAI 2023 Atlas challenge\nregarding liver and tumor segmentation.\n","authors":["Benjamin Lambert","Pauline Roca","Florence Forbes","Senan Doyle","Michel Dojat"],"pdf_url":"https://arxiv.org/pdf/2308.11969v1.pdf","comment":"Accepted for presentation at MICCAI Workshop on 2nd\n Resource-Efficient Medical Image Analysis (REMIA)"},{"id":"http://arxiv.org/abs/2302.06912v3","updated":"2023-08-23T07:27:20Z","published":"2023-02-14T08:56:50Z","title":"Regret-Based Optimization for Robust Reinforcement Learning","summary":" Deep Reinforcement Learning (DRL) policies have been shown to be vulnerable\nto small adversarial noise in observations. Such adversarial noise can have\ndisastrous consequences in safety-critical environments. For instance, a\nself-driving car receiving adversarially perturbed sensory observations about\nnearby signs (e.g., a stop sign physically altered to be perceived as a speed\nlimit sign) or objects (e.g., cars altered to be recognized as trees) can be\nfatal. Existing approaches for making RL algorithms robust to an\nobservation-perturbing adversary have focused on reactive approaches that\niteratively improve against adversarial examples generated at each iteration.\nWhile such approaches have been shown to provide improvements over regular RL\nmethods, they are reactive and can fare significantly worse if certain\ncategories of adversarial examples are not generated during training. To that\nend, we pursue a more proactive approach that relies on directly optimizing a\nwell-studied robustness measure, regret instead of expected value. We provide a\nprincipled approach that minimizes maximum regret over a \"neighborhood\" of\nobservations to the received \"observation\". Our regret criterion can be used to\nmodify existing value- and policy-based Deep RL methods. We demonstrate that\nour approaches provide a significant improvement in performance across a wide\nvariety of benchmarks against leading approaches for robust Deep RL.\n","authors":["Roman Belaire","Pradeep Varakantham","Thanh Nguyen","David Lo"],"pdf_url":"https://arxiv.org/pdf/2302.06912v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.09091v2","updated":"2023-08-23T07:23:17Z","published":"2023-01-22T10:17:02Z","title":"BallGAN: 3D-aware Image Synthesis with a Spherical Background","summary":" 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be\nrendered in arbitrary perspectives to produce images. Although previous methods\nproduce realistic images, they suffer from unstable training or degenerate\nsolutions where the 3D geometry is unnatural. We hypothesize that the 3D\ngeometry is underdetermined due to the insufficient constraint, i.e., being\nclassified as real image to the discriminator is not enough. To solve this\nproblem, we propose to approximate the background as a spherical surface and\nrepresent a scene as a union of the foreground placed in the sphere and the\nthin spherical background. It reduces the degree of freedom in the background\nfield. Accordingly, we modify the volume rendering equation and incorporate\ndedicated constraints to design a novel 3D-aware GAN framework named BallGAN.\nBallGAN has multiple advantages as follows. 1) It produces more reasonable 3D\ngeometry; the images of a scene across different viewpoints have better\nphotometric consistency and fidelity than the state-of-the-art methods. 2) The\ntraining becomes much more stable. 3) The foreground can be separately rendered\non top of different arbitrary backgrounds.\n","authors":["Minjung Shin","Yunji Seo","Jeongmin Bae","Young Sun Choi","Hyunsu Kim","Hyeran Byun","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2301.09091v2.pdf","comment":"ICCV 2023, Project Page: https://minjung-s.github.io/ballgan"},{"id":"http://arxiv.org/abs/2307.06857v2","updated":"2023-08-23T07:06:53Z","published":"2023-07-11T17:51:48Z","title":"Self-consistency for open-ended generations","summary":" Large Language Models (LLMs) can exhibit considerable variation in the\nquality of their sampled outputs. Reranking and selecting the best generation\nfrom the sampled set is a popular way of obtaining strong gains in generation\nquality. In this paper, we present a novel approach for reranking LLM\ngenerations. Unlike other techniques that might involve additional inferences\nor training a specialized reranker, our approach relies on easy to compute\npairwise statistics between the generations that have minimal compute overhead.\nWe show that our approach can be formalized as an extension of self-consistency\nand analyze its performance in that framework, theoretically as well as via\nsimulations. We show strong improvements for selecting the best $k$ generations\nfor code generation tasks as well as robust improvements for best generation\nfor the tasks of autoformalization, and summarization. While our approach only\nassumes black-box access to LLMs, we show that additional access to token\nprobabilities can improve performance even further.\n","authors":["Siddhartha Jain","Xiaofei Ma","Anoop Deoras","Bing Xiang"],"pdf_url":"https://arxiv.org/pdf/2307.06857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11958v1","updated":"2023-08-23T06:57:05Z","published":"2023-08-23T06:57:05Z","title":"Maintaining Plasticity via Regenerative Regularization","summary":" In continual learning, plasticity refers to the ability of an agent to\nquickly adapt to new information. Neural networks are known to lose plasticity\nwhen processing non-stationary data streams. In this paper, we propose L2 Init,\na very simple approach for maintaining plasticity by incorporating in the loss\nfunction L2 regularization toward initial parameters. This is very similar to\nstandard L2 regularization (L2), the only difference being that L2 regularizes\ntoward the origin. L2 Init is simple to implement and requires selecting only a\nsingle hyper-parameter. The motivation for this method is the same as that of\nmethods that reset neurons or parameter values. Intuitively, when recent losses\nare insensitive to particular parameters, these parameters drift toward their\ninitial values. This prepares parameters to adapt quickly to new tasks. On\nsimple problems representative of different types of nonstationarity in\ncontinual learning, we demonstrate that L2 Init consistently mitigates\nplasticity loss. We additionally find that our regularization term reduces\nparameter magnitudes and maintains a high effective feature rank.\n","authors":["Saurabh Kumar","Henrik Marklund","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2308.11958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11953v1","updated":"2023-08-23T06:51:22Z","published":"2023-08-23T06:51:22Z","title":"When MiniBatch SGD Meets SplitFed Learning:Convergence Analysis and\n Performance Evaluation","summary":" Federated learning (FL) enables collaborative model training across\ndistributed clients (e.g., edge devices) without sharing raw data. Yet, FL can\nbe computationally expensive as the clients need to train the entire model\nmultiple times. SplitFed learning (SFL) is a recent distributed approach that\nalleviates computation workload at the client device by splitting the model at\na cut layer into two parts, where clients only need to train part of the model.\nHowever, SFL still suffers from the \\textit{client drift} problem when clients'\ndata are highly non-IID. To address this issue, we propose MiniBatch-SFL. This\nalgorithm incorporates MiniBatch SGD into SFL, where the clients train the\nclient-side model in an FL fashion while the server trains the server-side\nmodel similar to MiniBatch SGD. We analyze the convergence of MiniBatch-SFL and\nshow that the bound of the expected loss can be obtained by analyzing the\nexpected server-side and client-side model updates, respectively. The\nserver-side updates do not depend on the non-IID degree of the clients'\ndatasets and can potentially mitigate client drift. However, the client-side\nmodel relies on the non-IID degree and can be optimized by properly choosing\nthe cut layer. Perhaps counter-intuitive, our empirical result shows that a\nlatter position of the cut layer leads to a smaller average gradient divergence\nand a better algorithm performance. Moreover, numerical results show that\nMiniBatch-SFL achieves higher accuracy than conventional SFL and FL. The\naccuracy improvement can be up to 24.1\\% and 17.1\\% with highly non-IID data,\nrespectively.\n","authors":["Chao Huang","Geng Tian","Ming Tang"],"pdf_url":"https://arxiv.org/pdf/2308.11953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.08434v2","updated":"2023-08-23T06:48:52Z","published":"2022-03-16T07:12:42Z","title":"Deep Residual Error and Bag-of-Tricks Learning for Gravitational Wave\n Surrogate Modeling","summary":" Deep learning methods have been employed in gravitational-wave astronomy to\naccelerate the construction of surrogate waveforms for the inspiral of\nspin-aligned black hole binaries, among other applications. We face the\nchallenge of modeling the residual error of an artificial neural network that\nmodels the coefficients of the surrogate waveform expansion (especially those\nof the phase of the waveform) which we demonstrate has sufficient structure to\nbe learnable by a second network. Adding this second network, we were able to\nreduce the maximum mismatch for waveforms in a validation set by 13.4 times. We\nalso explored several other ideas for improving the accuracy of the surrogate\nmodel, such as the exploitation of similarities between waveforms, the\naugmentation of the training set, the dissection of the input space, using\ndedicated networks per output coefficient and output augmentation. In several\ncases, small improvements can be observed, but the most significant improvement\nstill comes from the addition of a second network that models the residual\nerror. Since the residual error for more general surrogate waveform models\n(when e.g., eccentricity is included) may also have a specific structure, one\ncan expect our method to be applicable to cases where the gain in accuracy\ncould lead to significant gains in computational time.\n","authors":["Styliani-Christina Fragkouli","Paraskevi Nousi","Nikolaos Passalis","Panagiotis Iosif","Nikolaos Stergioulas","Anastasios Tefas"],"pdf_url":"https://arxiv.org/pdf/2203.08434v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10632v2","updated":"2023-08-23T06:41:42Z","published":"2023-08-21T11:07:27Z","title":"Foundation Model-oriented Robustness: Robust Image Model Evaluation with\n Pretrained Models","summary":" Machine learning has demonstrated remarkable performance over finite\ndatasets, yet whether the scores over the fixed benchmarks can sufficiently\nindicate the model's performance in the real world is still in discussion. In\nreality, an ideal robust model will probably behave similarly to the oracle\n(e.g., the human users), thus a good evaluation protocol is probably to\nevaluate the models' behaviors in comparison to the oracle. In this paper, we\nintroduce a new robustness measurement that directly measures the image\nclassification model's performance compared with a surrogate oracle (i.e., a\nfoundation model). Besides, we design a simple method that can accomplish the\nevaluation beyond the scope of the benchmarks. Our method extends the image\ndatasets with new samples that are sufficiently perturbed to be distinct from\nthe ones in the original sets, but are still bounded within the same\nimage-label structure the original test image represents, constrained by a\nfoundation model pretrained with a large amount of samples. As a result, our\nnew method will offer us a new way to evaluate the models' robustness\nperformance, free of limitations of fixed benchmarks or constrained\nperturbations, although scoped by the power of the oracle. In addition to the\nevaluation results, we also leverage our generated data to understand the\nbehaviors of the model and our new evaluation strategies.\n","authors":["Peiyan Zhang","Haoyang Liu","Chaozhuo Li","Xing Xie","Sunghun Kim","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11946v1","updated":"2023-08-23T06:40:05Z","published":"2023-08-23T06:40:05Z","title":"Multi-scale Transformer Pyramid Networks for Multivariate Time Series\n Forecasting","summary":" Multivariate Time Series (MTS) forecasting involves modeling temporal\ndependencies within historical records. Transformers have demonstrated\nremarkable performance in MTS forecasting due to their capability to capture\nlong-term dependencies. However, prior work has been confined to modeling\ntemporal dependencies at either a fixed scale or multiple scales that\nexponentially increase (most with base 2). This limitation hinders their\neffectiveness in capturing diverse seasonalities, such as hourly and daily\npatterns. In this paper, we introduce a dimension invariant embedding technique\nthat captures short-term temporal dependencies and projects MTS data into a\nhigher-dimensional space, while preserving the dimensions of time steps and\nvariables in MTS data. Furthermore, we present a novel Multi-scale Transformer\nPyramid Network (MTPNet), specifically designed to effectively capture temporal\ndependencies at multiple unconstrained scales. The predictions are inferred\nfrom multi-scale latent representations obtained from transformers at various\nscales. Extensive experiments on nine benchmark datasets demonstrate that the\nproposed MTPNet outperforms recent state-of-the-art methods.\n","authors":["Yifan Zhang","Rui Wu","Sergiu M. Dascalu","Frederick C. Harris Jr"],"pdf_url":"https://arxiv.org/pdf/2308.11946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.07771v2","updated":"2023-08-23T06:37:32Z","published":"2022-12-15T12:47:59Z","title":"Temporal Saliency Detection Towards Explainable Transformer-based\n Timeseries Forecasting","summary":" Despite the notable advancements in numerous Transformer-based models, the\ntask of long multi-horizon time series forecasting remains a persistent\nchallenge, especially towards explainability. Focusing on commonly used\nsaliency maps in explaining DNN in general, our quest is to build\nattention-based architecture that can automatically encode saliency-related\ntemporal patterns by establishing connections with appropriate attention heads.\nHence, this paper introduces Temporal Saliency Detection (TSD), an effective\napproach that builds upon the attention mechanism and applies it to\nmulti-horizon time series prediction. While our proposed architecture adheres\nto the general encoder-decoder structure, it undergoes a significant renovation\nin the encoder component, wherein we incorporate a series of information\ncontracting and expanding blocks inspired by the U-Net style architecture. The\nTSD approach facilitates the multiresolution analysis of saliency patterns by\ncondensing multi-heads, thereby progressively enhancing the forecasting of\ncomplex time series data. Empirical evaluations illustrate the superiority of\nour proposed approach compared to other models across multiple standard\nbenchmark datasets in diverse far-horizon forecasting settings. The initial TSD\nachieves substantial relative improvements of 31% and 46% over several models\nin the context of multivariate and univariate prediction. We believe the\ncomprehensive investigations presented in this study will offer valuable\ninsights and benefits to future research endeavors.\n","authors":["Nghia Duong-Trung","Duc-Manh Nguyen","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2212.07771v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2308.11943v1","updated":"2023-08-23T06:32:14Z","published":"2023-08-23T06:32:14Z","title":"RamseyRL: A Framework for Intelligent Ramsey Number Counterexample\n Searching","summary":" The Ramsey number is the minimum number of nodes, $n = R(s, t)$, such that\nall undirected simple graphs of order $n$, contain a clique of order $s$, or an\nindependent set of order $t$. This paper explores the application of a best\nfirst search algorithm and reinforcement learning (RL) techniques to find\ncounterexamples to specific Ramsey numbers. We incrementally improve over prior\nsearch methods such as random search by introducing a graph vectorization and\ndeep neural network (DNN)-based heuristic, which gauge the likelihood of a\ngraph being a counterexample. The paper also proposes algorithmic optimizations\nto confine a polynomial search runtime. This paper does not aim to present new\ncounterexamples but rather introduces and evaluates a framework supporting\nRamsey counterexample exploration using other heuristics. Code and methods are\nmade available through a PyPI package and GitHub repository.\n","authors":["Steve Vott","Adam M. Lehavi"],"pdf_url":"https://arxiv.org/pdf/2308.11943v1.pdf","comment":"8 pages, 4 figures, submitted to AAAI2024"},{"id":"http://arxiv.org/abs/2308.11940v1","updated":"2023-08-23T06:21:46Z","published":"2023-08-23T06:21:46Z","title":"Audio Generation with Multiple Conditional Diffusion Model","summary":" Text-based audio generation models have limitations as they cannot encompass\nall the information in audio, leading to restricted controllability when\nrelying solely on text. To address this issue, we propose a novel model that\nenhances the controllability of existing pre-trained text-to-audio models by\nincorporating additional conditions including content (timestamp) and style\n(pitch contour and energy contour) as supplements to the text. This approach\nachieves fine-grained control over the temporal order, pitch, and energy of\ngenerated audio. To preserve the diversity of generation, we employ a trainable\ncontrol condition encoder that is enhanced by a large language model and a\ntrainable Fusion-Net to encode and fuse the additional conditions while keeping\nthe weights of the pre-trained text-to-audio model frozen. Due to the lack of\nsuitable datasets and evaluation metrics, we consolidate existing datasets into\na new dataset comprising the audio and corresponding conditions and use a\nseries of evaluation metrics to evaluate the controllability performance.\nExperimental results demonstrate that our model successfully achieves\nfine-grained control to accomplish controllable audio generation. Audio samples\nand our dataset are publicly available at\nhttps://conditionaudiogen.github.io/conditionaudiogen/\n","authors":["Zhifang Guo","Jianguo Mao","Rui Tao","Long Yan","Kazushige Ouchi","Hong Liu","Xiangdong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11940v1.pdf","comment":"Submitted to AAAI 2024"},{"id":"http://arxiv.org/abs/2308.11217v2","updated":"2023-08-23T06:17:21Z","published":"2023-08-22T06:05:11Z","title":"Federated Learning in Big Model Era: Domain-Specific Multimodal Large\n Models","summary":" Multimodal data, which can comprehensively perceive and recognize the\nphysical world, has become an essential path towards general artificial\nintelligence. However, multimodal large models trained on public datasets often\nunderperform in specific industrial domains. This paper proposes a multimodal\nfederated learning framework that enables multiple enterprises to utilize\nprivate domain data to collaboratively train large models for vertical domains,\nachieving intelligent services across scenarios. The authors discuss in-depth\nthe strategic transformation of federated learning in terms of intelligence\nfoundation and objectives in the era of big model, as well as the new\nchallenges faced in heterogeneous data, model aggregation, performance and cost\ntrade-off, data privacy, and incentive mechanism. The paper elaborates a case\nstudy of leading enterprises contributing multimodal data and expert knowledge\nto city safety operation management , including distributed deployment and\nefficient coordination of the federated learning platform, technical\ninnovations on data quality improvement based on large model capabilities and\nefficient joint fine-tuning approaches. Preliminary experiments show that\nenterprises can enhance and accumulate intelligent capabilities through\nmultimodal model federated learning, thereby jointly creating an smart city\nmodel that provides high-quality intelligent services covering energy\ninfrastructure safety, residential community security, and urban operation\nmanagement. The established federated learning cooperation ecosystem is\nexpected to further aggregate industry, academia, and research resources,\nrealize large models in multiple vertical domains, and promote the large-scale\nindustrial application of artificial intelligence and cutting-edge research on\nmultimodal federated learning.\n","authors":["Zengxiang Li","Zhaoxiang Hou","Hui Liu","Ying Wang","Tongzhi Li","Longfei Xie","Chao Shi","Chengyi Yang","Weishan Zhang","Zelei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11939v1","updated":"2023-08-23T06:14:02Z","published":"2023-08-23T06:14:02Z","title":"Retail Demand Forecasting: A Comparative Study for Multivariate Time\n Series","summary":" Accurate demand forecasting in the retail industry is a critical determinant\nof financial performance and supply chain efficiency. As global markets become\nincreasingly interconnected, businesses are turning towards advanced prediction\nmodels to gain a competitive edge. However, existing literature mostly focuses\non historical sales data and ignores the vital influence of macroeconomic\nconditions on consumer spending behavior. In this study, we bridge this gap by\nenriching time series data of customer demand with macroeconomic variables,\nsuch as the Consumer Price Index (CPI), Index of Consumer Sentiment (ICS), and\nunemployment rates. Leveraging this comprehensive dataset, we develop and\ncompare various regression and machine learning models to predict retail demand\naccurately.\n","authors":["Md Sabbirul Haque","Md Shahedul Amin","Jonayet Miah"],"pdf_url":"https://arxiv.org/pdf/2308.11939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11933v1","updated":"2023-08-23T05:53:13Z","published":"2023-08-23T05:53:13Z","title":"System Identification for Continuous-time Linear Dynamical Systems","summary":" The problem of system identification for the Kalman filter, relying on the\nexpectation-maximization (EM) procedure to learn the underlying parameters of a\ndynamical system, has largely been studied assuming that observations are\nsampled at equally-spaced time points. However, in many applications this is a\nrestrictive and unrealistic assumption. This paper addresses system\nidentification for the continuous-discrete filter, with the aim of generalizing\nlearning for the Kalman filter by relying on a solution to a continuous-time\nIt\\^o stochastic differential equation (SDE) for the latent state and\ncovariance dynamics. We introduce a novel two-filter, analytical form for the\nposterior with a Bayesian derivation, which yields analytical updates which do\nnot require the forward-pass to be pre-computed. Using this analytical and\nefficient computation of the posterior, we provide an EM procedure which\nestimates the parameters of the SDE, naturally incorporating irregularly\nsampled measurements. Generalizing the learning of latent linear dynamical\nsystems (LDS) to continuous-time may extend the use of the hybrid Kalman filter\nto data which is not regularly sampled or has intermittent missing values, and\ncan extend the power of non-linear system identification methods such as\nswitching LDS (SLDS), which rely on EM for the linear discrete-time Kalman\nfilter as a sub-unit for learning locally linearized behavior of a non-linear\nsystem. We apply the method by learning the parameters of a latent,\nmultivariate Fokker-Planck SDE representing a toggle-switch genetic circuit\nusing biologically realistic parameters, and compare the efficacy of learning\nrelative to the discrete-time Kalman filter as the step-size irregularity and\nspectral-radius of the dynamics-matrix increases.\n","authors":["Peter Halmos","Jonathan Pillow","David A. Knowles"],"pdf_url":"https://arxiv.org/pdf/2308.11933v1.pdf","comment":"32 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.11929v1","updated":"2023-08-23T05:33:03Z","published":"2023-08-23T05:33:03Z","title":"Dynamic landslide susceptibility mapping over recent three decades to\n uncover variations in landslide causes in subtropical urban mountainous areas","summary":" Landslide susceptibility assessment (LSA) is of paramount importance in\nmitigating landslide risks. Recently, there has been a surge in the utilization\nof data-driven methods for predicting landslide susceptibility due to the\ngrowing availability of aerial and satellite data. Nonetheless, the rapid\noscillations within the landslide-inducing environment (LIE), primarily due to\nsignificant changes in external triggers such as rainfall, pose difficulties\nfor contemporary data-driven LSA methodologies to accommodate LIEs over diverse\ntimespans. This study presents dynamic landslide susceptibility mapping that\nsimply employs multiple predictive models for annual LSA. In practice, this\nwill inevitably encounter small sample problems due to the limited number of\nlandslide samples in certain years. Another concern arises owing to the\nmajority of the existing LSA approaches train black-box models to fit distinct\ndatasets, yet often failing in generalization and providing comprehensive\nexplanations concerning the interactions between input features and\npredictions. Accordingly, we proposed to meta-learn representations with fast\nadaptation ability using a few samples and gradient updates; and apply SHAP for\neach model interpretation and landslide feature permutation. Additionally, we\napplied MT-InSAR for LSA result enhancement and validation. The chosen study\narea is Lantau Island, Hong Kong, where we conducted a comprehensive dynamic\nLSA spanning from 1992 to 2019. The model interpretation results demonstrate\nthat the primary factors responsible for triggering landslides in Lantau Island\nare terrain slope and extreme rainfall. The results also indicate that the\nvariation in landslide causes can be primarily attributed to extreme rainfall\nevents, which result from global climate change, and the implementation of the\nLandslip Prevention and Mitigation Programme (LPMitP) by the Hong Kong\ngovernment.\n","authors":["Peifeng Ma","Li Chen","Chang Yu","Qing Zhu","Yulin Ding"],"pdf_url":"https://arxiv.org/pdf/2308.11929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05237v3","updated":"2023-08-23T05:25:09Z","published":"2023-05-09T07:56:26Z","title":"Traffic Forecasting on New Roads Unseen in the Training Data Using\n Spatial Contrastive Pre-Training","summary":" New roads are being constructed all the time. However, the capabilities of\nprevious deep forecasting models to generalize to new roads not seen in the\ntraining data (unseen roads) are rarely explored. In this paper, we introduce a\nnovel setup called a spatio-temporal (ST) split to evaluate the models'\ncapabilities to generalize to unseen roads. In this setup, the models are\ntrained on data from a sample of roads, but tested on roads not seen in the\ntraining data. Moreover, we also present a novel framework called Spatial\nContrastive Pre-Training (SCPT) where we introduce a spatial encoder module to\nextract latent features from unseen roads during inference time. This spatial\nencoder is pre-trained using contrastive learning. During inference, the\nspatial encoder only requires two days of traffic data on the new roads and\ndoes not require any re-training. We also show that the output from the spatial\nencoder can be used effectively to infer latent node embeddings on unseen roads\nduring inference time. The SCPT framework also incorporates a new layer, named\nthe spatially gated addition (SGA) layer, to effectively combine the latent\nfeatures from the output of the spatial encoder to existing backbones.\nAdditionally, since there is limited data on the unseen roads, we argue that it\nis better to decouple traffic signals to trivial-to-capture periodic signals\nand difficult-to-capture Markovian signals, and for the spatial encoder to only\nlearn the Markovian signals. Finally, we empirically evaluated SCPT using the\nST split setup on four real-world datasets. The results showed that adding SCPT\nto a backbone consistently improves forecasting performance on unseen roads.\nMore importantly, the improvements are greater when forecasting further into\nthe future. The codes are available on GitHub:\nhttps://github.com/cruiseresearchgroup/forecasting-on-new-roads .\n","authors":["Arian Prabowo","Wei Shao","Hao Xue","Piotr Koniusz","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2305.05237v3.pdf","comment":"25 pages including reference, an additional 3 pages of appendix, 8\n figures. ECML PKDD 2023 Journal track special issue: Data Mining and\n Knowledge Discovery (DAMI)"},{"id":"http://arxiv.org/abs/2308.11925v1","updated":"2023-08-23T05:18:19Z","published":"2023-08-23T05:18:19Z","title":"Solving Elliptic Optimal Control Problems using Physics Informed Neural\n Networks","summary":" In this work, we present and analyze a numerical solver for optimal control\nproblems (without / with box constraint) for linear and semilinear second-order\nelliptic problems. The approach is based on a coupled system derived from the\nfirst-order optimality system of the optimal control problem, and applies\nphysics informed neural networks (PINNs) to solve the coupled system. We\npresent an error analysis of the numerical scheme, and provide $L^2(\\Omega)$\nerror bounds on the state, control and adjoint state in terms of deep neural\nnetwork parameters (e.g., depth, width, and parameter bounds) and the number of\nsampling points in the domain and on the boundary. The main tools in the\nanalysis include offset Rademacher complexity and boundedness and Lipschitz\ncontinuity of neural network functions. We present several numerical examples\nto illustrate the approach and compare it with three existing approaches.\n","authors":["Bangti Jin","Ramesh Sau","Luowei Yin","Zhi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11925v1.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2205.03977v3","updated":"2023-08-23T05:18:04Z","published":"2022-05-08T23:58:40Z","title":"A Structured Span Selector","summary":" Many natural language processing tasks, e.g., coreference resolution and\nsemantic role labeling, require selecting text spans and making decisions about\nthem. A typical approach to such tasks is to score all possible spans and\ngreedily select spans for task-specific downstream processing. This approach,\nhowever, does not incorporate any inductive bias about what sort of spans ought\nto be selected, e.g., that selected spans tend to be syntactic constituents. In\nthis paper, we propose a novel grammar-based structured span selection model\nwhich learns to make use of the partial span-level annotation provided for such\nproblems. Compared to previous approaches, our approach gets rid of the\nheuristic greedy span selection scheme, allowing us to model the downstream\ntask on an optimal set of spans. We evaluate our model on two popular span\nprediction tasks: coreference resolution and semantic role labeling. We show\nempirical improvements on both.\n","authors":["Tianyu Liu","Yuchen Eleanor Jiang","Ryan Cotterell","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2205.03977v3.pdf","comment":"NAACL 2022 camera-ready"},{"id":"http://arxiv.org/abs/2308.11924v1","updated":"2023-08-23T05:17:51Z","published":"2023-08-23T05:17:51Z","title":"Diverse Policies Converge in Reward-free Markov Decision Processe","summary":" Reinforcement learning has achieved great success in many decision-making\ntasks, and traditional reinforcement learning algorithms are mainly designed\nfor obtaining a single optimal solution. However, recent works show the\nimportance of developing diverse policies, which makes it an emerging research\ntopic. Despite the variety of diversity reinforcement learning algorithms that\nhave emerged, none of them theoretically answer the question of how the\nalgorithm converges and how efficient the algorithm is. In this paper, we\nprovide a unified diversity reinforcement learning framework and investigate\nthe convergence of training diverse policies. Under such a framework, we also\npropose a provably efficient diversity reinforcement learning algorithm.\nFinally, we verify the effectiveness of our method through numerical\nexperiments.\n","authors":["Fanqi Lin","Shiyu Huang","Weiwei Tu"],"pdf_url":"https://arxiv.org/pdf/2308.11924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11923v1","updated":"2023-08-23T05:13:25Z","published":"2023-08-23T05:13:25Z","title":"Audio Difference Captioning Utilizing Similarity-Discrepancy\n Disentanglement","summary":" We proposed Audio Difference Captioning (ADC) as a new extension task of\naudio captioning for describing the semantic differences between input pairs of\nsimilar but slightly different audio clips. The ADC solves the problem that\nconventional audio captioning sometimes generates similar captions for similar\naudio clips, failing to describe the difference in content. We also propose a\ncross-attention-concentrated transformer encoder to extract differences by\ncomparing a pair of audio clips and a similarity-discrepancy disentanglement to\nemphasize the difference in the latent space. To evaluate the proposed methods,\nwe built an AudioDiffCaps dataset consisting of pairs of similar but slightly\ndifferent audio clips with human-annotated descriptions of their differences.\nThe experiment with the AudioDiffCaps dataset showed that the proposed methods\nsolve the ADC task effectively and improve the attention weights to extract the\ndifference by visualizing them in the transformer encoder.\n","authors":["Daiki Takeuchi","Yasunori Ohishi","Daisuke Niizumi","Noboru Harada","Kunio Kashino"],"pdf_url":"https://arxiv.org/pdf/2308.11923v1.pdf","comment":"Accepted to DCASE2023 Workshop"},{"id":"http://arxiv.org/abs/2308.11912v1","updated":"2023-08-23T04:57:21Z","published":"2023-08-23T04:57:21Z","title":"Addressing Selection Bias in Computerized Adaptive Testing: A User-Wise\n Aggregate Influence Function Approach","summary":" Computerized Adaptive Testing (CAT) is a widely used, efficient test mode\nthat adapts to the examinee's proficiency level in the test domain. CAT\nrequires pre-trained item profiles, for CAT iteratively assesses the student\nreal-time based on the registered items' profiles, and selects the next item to\nadminister using candidate items' profiles. However, obtaining such item\nprofiles is a costly process that involves gathering a large, dense\nitem-response data, then training a diagnostic model on the collected data. In\nthis paper, we explore the possibility of leveraging response data collected in\nthe CAT service. We first show that this poses a unique challenge due to the\ninherent selection bias introduced by CAT, i.e., more proficient students will\nreceive harder questions. Indeed, when naively training the diagnostic model\nusing CAT response data, we observe that item profiles deviate significantly\nfrom the ground-truth. To tackle the selection bias issue, we propose the\nuser-wise aggregate influence function method. Our intuition is to filter out\nusers whose response data is heavily biased in an aggregate manner, as judged\nby how much perturbation the added data will introduce during parameter\nestimation. This way, we may enhance the performance of CAT while introducing\nminimal bias to the item profiles. We provide extensive experiments to\ndemonstrate the superiority of our proposed method based on the three public\ndatasets and one dataset that contains real-world CAT response data.\n","authors":["Soonwoo Kwon","Sojung Kim","Seunghyun Lee","Jin-Young Kim","Suyeong An","Kyuseok Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11912v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2302.05601v3","updated":"2023-08-23T04:55:20Z","published":"2023-02-11T04:52:20Z","title":"Pruning Deep Neural Networks from a Sparsity Perspective","summary":" In recent years, deep network pruning has attracted significant attention in\norder to enable the rapid deployment of AI into small devices with computation\nand memory constraints. Pruning is often achieved by dropping redundant\nweights, neurons, or layers of a deep network while attempting to retain a\ncomparable test performance. Many deep pruning algorithms have been proposed\nwith impressive empirical success. However, existing approaches lack a\nquantifiable measure to estimate the compressibility of a sub-network during\neach pruning iteration and thus may under-prune or over-prune the model. In\nthis work, we propose PQ Index (PQI) to measure the potential compressibility\nof deep neural networks and use this to develop a Sparsity-informed Adaptive\nPruning (SAP) algorithm. Our extensive experiments corroborate the hypothesis\nthat for a generic pruning procedure, PQI decreases first when a large model is\nbeing effectively regularized and then increases when its compressibility\nreaches a limit that appears to correspond to the beginning of underfitting.\nSubsequently, PQI decreases again when the model collapse and significant\ndeterioration in the performance of the model start to occur. Additionally, our\nexperiments demonstrate that the proposed adaptive pruning algorithm with\nproper choice of hyper-parameters is superior to the iterative pruning\nalgorithms such as the lottery ticket-based pruning methods, in terms of both\ncompression efficiency and robustness.\n","authors":["Enmao Diao","Ganghua Wang","Jiawei Zhan","Yuhong Yang","Jie Ding","Vahid Tarokh"],"pdf_url":"https://arxiv.org/pdf/2302.05601v3.pdf","comment":"ICLR 2023"},{"id":"http://arxiv.org/abs/2201.06714v3","updated":"2023-08-23T04:54:07Z","published":"2022-01-18T03:13:19Z","title":"AdaTerm: Adaptive T-Distribution Estimated Robust Moments for\n Noise-Robust Stochastic Gradient Optimization","summary":" With the increasing practicality of deep learning applications, practitioners\nare inevitably faced with datasets corrupted by noise from various sources such\nas measurement errors, mislabeling, and estimated surrogate inputs/outputs that\ncan adversely impact the optimization results. It is a common practice to\nimprove the optimization algorithm's robustness to noise, since this algorithm\nis ultimately in charge of updating the network parameters. Previous studies\nrevealed that the first-order moment used in Adam-like stochastic gradient\ndescent optimizers can be modified based on the Student's t-distribution. While\nthis modification led to noise-resistant updates, the other associated\nstatistics remained unchanged, resulting in inconsistencies in the assumed\nmodels. In this paper, we propose AdaTerm, a novel approach that incorporates\nthe Student's t-distribution to derive not only the first-order moment but also\nall the associated statistics. This provides a unified treatment of the\noptimization process, offering a comprehensive framework under the statistical\nmodel of the t-distribution for the first time. The proposed approach offers\nseveral advantages over previously proposed approaches, including reduced\nhyperparameters and improved robustness and adaptability. This noise-adaptive\nbehavior contributes to AdaTerm's exceptional learning performance, as\ndemonstrated through various optimization problems with different and/or\nunknown noise ratios. Furthermore, we introduce a new technique for deriving a\ntheoretical regret bound without relying on AMSGrad, providing a valuable\ncontribution to the field\n","authors":["Wendyam Eric Lionel Ilboudo","Taisuke Kobayashi","Takamitsu Matsubara"],"pdf_url":"https://arxiv.org/pdf/2201.06714v3.pdf","comment":"27 pages; Final version accepted by Elsevier Neurocomputing Journal\n (2023-08; https://doi.org/10.1016/j.neucom.2023.126692)"},{"id":"http://arxiv.org/abs/2308.11905v1","updated":"2023-08-23T04:14:45Z","published":"2023-08-23T04:14:45Z","title":"Utilizing Admissible Bounds for Heuristic Learning","summary":" While learning a heuristic function for forward search algorithms with modern\nmachine learning techniques has been gaining interest in recent years, there\nhas been little theoretical understanding of \\emph{what} they should learn,\n\\emph{how} to train them, and \\emph{why} we do so. This lack of understanding\nleads to various literature performing an ad-hoc selection of datasets\n(suboptimal vs optimal costs or admissible vs inadmissible heuristics) and\noptimization metrics (e.g., squared vs absolute errors). Moreover, due to the\nlack of admissibility of the resulting trained heuristics, little focus has\nbeen put on the role of admissibility \\emph{during} learning. This paper\narticulates the role of admissible heuristics in supervised heuristic learning\nusing them as parameters of Truncated Gaussian distributions, which tightens\nthe hypothesis space compared to ordinary Gaussian distributions. We argue that\nthis mathematical model faithfully follows the principle of maximum entropy and\nempirically show that, as a result, it yields more accurate heuristics and\nconverges faster during training.\n","authors":["Carlos Núñez-Molina","Masataro Asai"],"pdf_url":"https://arxiv.org/pdf/2308.11905v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.11903v1","updated":"2023-08-23T04:08:53Z","published":"2023-08-23T04:08:53Z","title":"Rethinking Data Perturbation and Model Stabilization for Semi-supervised\n Medical Image Segmentation","summary":" Studies on semi-supervised medical image segmentation (SSMIS) have seen fast\nprogress recently. Due to the limited labelled data, SSMIS methods mainly focus\non effectively leveraging unlabeled data to enhance the segmentation\nperformance. However, despite their promising performance, current\nstate-of-the-art methods often prioritize integrating complex techniques and\nloss terms rather than addressing the core challenges of semi-supervised\nscenarios directly. We argue that the key to SSMIS lies in generating\nsubstantial and appropriate prediction disagreement on unlabeled data. To this\nend, we emphasize the crutiality of data perturbation and model stabilization\nin semi-supervised segmentation, and propose a simple yet effective approach to\nboost SSMIS performance significantly, dubbed DPMS. Specifically, we first\nrevisit SSMIS from three distinct perspectives: the data, the model, and the\nloss, and conduct a comprehensive study of corresponding strategies to examine\ntheir effectiveness. Based on these examinations, we then propose DPMS, which\nadopts a plain teacher-student framework with a standard supervised loss and\nunsupervised consistency loss. To produce appropriate prediction disagreements,\nDPMS perturbs the unlabeled data via strong augmentations to enlarge prediction\ndisagreements considerably. On the other hand, using EMA teacher when strong\naugmentation is applied does not necessarily improve performance. DPMS further\nutilizes a forwarding-twice and momentum updating strategies for normalization\nstatistics to stabilize the training on unlabeled data effectively. Despite its\nsimplicity, DPMS can obtain new state-of-the-art performance on the public 2D\nACDC and 3D LA datasets across various semi-supervised settings, e.g. obtaining\na remarkable 22.62% improvement against previous SOTA on ACDC with 5% labels.\n","authors":["Zhen Zhao","Ye Liu","Meng Zhao","Di Yin","Yixuan Yuan","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11903v1.pdf","comment":"Code and logs are available at https://github.com/ZhenZHAO/DPMS"},{"id":"http://arxiv.org/abs/2307.00252v2","updated":"2023-08-23T03:59:48Z","published":"2023-07-01T07:17:33Z","title":"An ML approach to resolution of singularities","summary":" The solution set of a system of polynomial equations typically contains\nill-behaved, singular points. Resolution is a fundamental process in geometry\nin which we replace singular points with smooth points, while keeping the rest\nof the solution set unchanged. Resolutions are not unique: the usual way to\ndescribe them involves repeatedly performing a fundamental operation known as\n\"blowing-up\", and the complexity of the resolution highly depends on certain\nchoices. The process can be translated into various versions of a 2-player\ngame, the so-called Hironaka game, and a winning strategy for the first player\nprovides a solution to the resolution problem. In this paper we introduce a new\napproach to the Hironaka game that uses reinforcement learning agents to find\noptimal resolutions of singularities. In certain domains, the trained model\noutperforms state-of-the-art selection heuristics in total number of polynomial\nadditions performed, which provides a proof-of-concept that recent developments\nin machine learning have the potential to improve performance of algorithms in\nsymbolic computation.\n","authors":["Gergely Bérczi","Honglu Fan","Mingcong Zeng"],"pdf_url":"https://arxiv.org/pdf/2307.00252v2.pdf","comment":"To appear in Proceedings of the 40th International Conference on\n Machine Learning TAG Workshop (ICML-TAG 2023)"},{"id":"http://arxiv.org/abs/2307.16680v4","updated":"2023-08-23T03:28:30Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v4.pdf","comment":"Draft Version"},{"id":"http://arxiv.org/abs/2308.11890v1","updated":"2023-08-23T03:23:07Z","published":"2023-08-23T03:23:07Z","title":"Shape-conditioned 3D Molecule Generation via Equivariant Diffusion\n Models","summary":" Ligand-based drug design aims to identify novel drug candidates of similar\nshapes with known active molecules. In this paper, we formulated an in silico\nshape-conditioned molecule generation problem to generate 3D molecule\nstructures conditioned on the shape of a given molecule. To address this\nproblem, we developed a translation- and rotation-equivariant shape-guided\ngenerative model ShapeMol. ShapeMol consists of an equivariant shape encoder\nthat maps molecular surface shapes into latent embeddings, and an equivariant\ndiffusion model that generates 3D molecules based on these embeddings.\nExperimental results show that ShapeMol can generate novel, diverse, drug-like\nmolecules that retain 3D molecular shapes similar to the given shape condition.\nThese results demonstrate the potential of ShapeMol in designing drug\ncandidates of desired 3D shapes binding to protein target pockets.\n","authors":["Ziqi Chen","Bo Peng","Srinivasan Parthasarathy","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2308.11890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11881v1","updated":"2023-08-23T02:58:02Z","published":"2023-08-23T02:58:02Z","title":"Adversarial Training Using Feedback Loops","summary":" Deep neural networks (DNN) have found wide applicability in numerous fields\ndue to their ability to accurately learn very complex input-output relations.\nDespite their accuracy and extensive use, DNNs are highly susceptible to\nadversarial attacks due to limited generalizability. For future progress in the\nfield, it is essential to build DNNs that are robust to any kind of\nperturbations to the data points. In the past, many techniques have been\nproposed to robustify DNNs using first-order derivative information of the\nnetwork.\n This paper proposes a new robustification approach based on control theory. A\nneural network architecture that incorporates feedback control, named Feedback\nNeural Networks, is proposed. The controller is itself a neural network, which\nis trained using regular and adversarial data such as to stabilize the system\noutputs. The novel adversarial training approach based on the feedback control\narchitecture is called Feedback Looped Adversarial Training (FLAT). Numerical\nresults on standard test problems empirically show that our FLAT method is more\neffective than the state-of-the-art to guard against adversarial attacks.\n","authors":["Ali Haisam Muhammad Rafid","Adrian Sandu"],"pdf_url":"https://arxiv.org/pdf/2308.11881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11880v1","updated":"2023-08-23T02:57:58Z","published":"2023-08-23T02:57:58Z","title":"SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal\n Targets","summary":" Scene understanding using multi-modal data is necessary in many applications,\ne.g., autonomous navigation. To achieve this in a variety of situations,\nexisting models must be able to adapt to shifting data distributions without\narduous data annotation. Current approaches assume that the source data is\navailable during adaptation and that the source consists of paired multi-modal\ndata. Both these assumptions may be problematic for many applications. Source\ndata may not be available due to privacy, security, or economic concerns.\nAssuming the existence of paired multi-modal data for training also entails\nsignificant data collection costs and fails to take advantage of widely\navailable freely distributed pre-trained uni-modal models. In this work, we\nrelax both of these assumptions by addressing the problem of adapting a set of\nmodels trained independently on uni-modal data to a target domain consisting of\nunlabeled multi-modal data, without having access to the original source\ndataset. Our proposed approach solves this problem through a switching\nframework which automatically chooses between two complementary methods of\ncross-modal pseudo-label fusion -- agreement filtering and entropy weighting --\nbased on the estimated domain gap. We demonstrate our work on the semantic\nsegmentation problem. Experiments across seven challenging adaptation scenarios\nverify the efficacy of our approach, achieving results comparable to, and in\nsome cases outperforming, methods which assume access to source data. Our\nmethod achieves an improvement in mIoU of up to 12% over competing baselines.\nOur code is publicly available at https://github.com/csimo005/SUMMIT.\n","authors":["Cody Simons","Dripta S. Raychaudhuri","Sk Miraj Ahmed","Suya You","Konstantinos Karydis","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2308.11880v1.pdf","comment":"12 pages, 5 figures, 9 tables, ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11878v1","updated":"2023-08-23T02:49:35Z","published":"2023-08-23T02:49:35Z","title":"Cabrita: closing the gap for foreign languages","summary":" The strategy of training the model from scratch in a specific language or\ndomain serves two essential purposes: i) enhancing performance in the\nparticular linguistic or domain context, and ii) ensuring effective\ntokenization. The main limitation inherent to this approach lies in the\nassociated cost, which can reach six to seven-digit dollar values, depending on\nthe model size and the number of parameters involved.\n The main solution to overcome the cost challenge is to rely on available\npre-trained models, which, despite recent advancements such as the LLaMA and\nLLaMA-2 models, still demonstrate inefficiency for certain specific domain\nproblems or prove ineffective in scenarios involving conversational memory\nresources, given the large number of tokens required to represent text.\n To overcome this issue, we present a methodology named Cabrita, which, as our\nresearch demonstrates, successfully addresses the performance and efficient\ntokenization problem, all at an affordable cost. We believe that this\nmethodology can be applied to any transformer-like architecture model. To\nvalidate the study, we conducted continuous pre-training exclusively using\nPortuguese text on a 3-billion-parameter model known as OpenLLaMA, resulting in\na model named openCabrita 3B. The openCabrita 3B also features a new tokenizer\nthat results in a significant reduction in the number of tokens required to\nrepresent the text. In our assessment, for few-shot learning tasks, we achieved\nsimilar results with this 3B model compared to a traditional continuous\npre-training approach as well as to 7B models English pre-trained models.\n","authors":["Celio Larcher","Marcos Piau","Paulo Finardi","Pedro Gengo","Piero Esposito","Vinicius Caridá"],"pdf_url":"https://arxiv.org/pdf/2308.11878v1.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.11873v1","updated":"2023-08-23T02:36:19Z","published":"2023-08-23T02:36:19Z","title":"Integrating Large Language Models into the Debugging C Compiler for\n generating contextual error explanations","summary":" This paper introduces a method for Large Language Models (LLM) to produce\nenhanced compiler error explanations, in simple language, within our Debugging\nC Compiler (DCC). It is well documented that compiler error messages have been\nknown to present a barrier for novices learning how to program. Although our\ninitial use of DCC in introductory programming (CS1) has been instrumental in\nteaching C to novice programmers by providing safeguards to commonly occurring\nerrors and translating the usually cryptic compiler error messages at both\ncompile- and run-time, we proposed that incorporating LLM-generated\nexplanations would further enhance the learning experience for novice\nprogrammers. Through an expert evaluation, we observed that LLM-generated\nexplanations for compiler errors were conceptually accurate in 90% of\ncompile-time errors, and 75% of run-time errors. Additionally, the new DCC-help\ntool has been increasingly adopted by students, with an average of 1047 unique\nruns per week, demonstrating a promising initial assessment of using LLMs to\ncomplement compiler output to enhance programming education for beginners. We\nrelease our tool as open-source to the community.\n","authors":["Andrew Taylor","Alexandra Vassar","Jake Renzella","Hammond Pearce"],"pdf_url":"https://arxiv.org/pdf/2308.11873v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2207.03678v2","updated":"2023-08-23T02:22:37Z","published":"2022-07-08T03:54:52Z","title":"Stability of Aggregation Graph Neural Networks","summary":" In this paper we study the stability properties of aggregation graph neural\nnetworks (Agg-GNNs) considering perturbations of the underlying graph. An\nAgg-GNN is a hybrid architecture where information is defined on the nodes of a\ngraph, but it is processed block-wise by Euclidean CNNs on the nodes after\nseveral diffusions on the graph shift operator. We derive stability bounds for\nthe mapping operator associated to a generic Agg-GNN, and we specify conditions\nunder which such operators can be stable to deformations. We prove that the\nstability bounds are defined by the properties of the filters in the first\nlayer of the CNN that acts on each node. Additionally, we show that there is a\nclose relationship between the number of aggregations, the filter's\nselectivity, and the size of the stability constants. We also conclude that in\nAgg-GNNs the selectivity of the mapping operators is tied to the properties of\nthe filters only in the first layer of the CNN stage. This shows a substantial\ndifference with respect to the stability properties of selection GNNs, where\nthe selectivity of the filters in all layers is constrained by their stability.\nWe provide numerical evidence corroborating the results derived, testing the\nbehavior of Agg-GNNs in real life application scenarios considering\nperturbations of different magnitude.\n","authors":["Alejandro Parada-Mayorga","Zhiyang Wang","Fernando Gama","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2207.03678v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19301v2","updated":"2023-08-23T02:18:51Z","published":"2023-05-30T14:24:40Z","title":"On the Choice of Perception Loss Function for Learned Video Compression","summary":" We study causal, low-latency, sequential video compression when the output is\nsubjected to both a mean squared-error (MSE) distortion loss as well as a\nperception loss to target realism. Motivated by prior approaches, we consider\ntwo different perception loss functions (PLFs). The first, PLF-JD, considers\nthe joint distribution (JD) of all the video frames up to the current one,\nwhile the second metric, PLF-FMD, considers the framewise marginal\ndistributions (FMD) between the source and reconstruction. Using information\ntheoretic analysis and deep-learning based experiments, we demonstrate that the\nchoice of PLF can have a significant effect on the reconstruction, especially\nat low-bit rates. In particular, while the reconstruction based on PLF-JD can\nbetter preserve the temporal correlation across frames, it also imposes a\nsignificant penalty in distortion compared to PLF-FMD and further makes it more\ndifficult to recover from errors made in the earlier output frames. Although\nthe choice of PLF decisively affects reconstruction quality, we also\ndemonstrate that it may not be essential to commit to a particular PLF during\nencoding and the choice of PLF can be delegated to the decoder. In particular,\nencoded representations generated by training a system to minimize the MSE\n(without requiring either PLF) can be {\\em near universal} and can generate\nclose to optimal reconstructions for either choice of PLF at the decoder. We\nvalidate our results using (one-shot) information-theoretic analysis, detailed\nstudy of the rate-distortion-perception tradeoff of the Gauss-Markov source\nmodel as well as deep-learning based experiments on moving MNIST and KTH\ndatasets.\n","authors":["Sadaf Salehkalaibar","Buu Phan","Jun Chen","Wei Yu","Ashish Khisti"],"pdf_url":"https://arxiv.org/pdf/2305.19301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06980v3","updated":"2023-08-23T02:11:53Z","published":"2023-03-13T10:30:02Z","title":"Self-supervised learning based general laboratory progress pretrained\n model for cardiovascular event detection","summary":" The inherent nature of patient data poses several challenges. Prevalent cases\namass substantial longitudinal data owing to their patient volume and\nconsistent follow-ups, however, longitudinal laboratory data are renowned for\ntheir irregularity, temporality, absenteeism, and sparsity; In contrast,\nrecruitment for rare or specific cases is often constrained due to their\nlimited patient size and episodic observations. This study employed\nself-supervised learning (SSL) to pretrain a generalized laboratory progress\n(GLP) model that captures the overall progression of six common laboratory\nmarkers in prevalent cardiovascular cases, with the intention of transferring\nthis knowledge to aid in the detection of specific cardiovascular event. GLP\nimplemented a two-stage training approach, leveraging the information embedded\nwithin interpolated data and amplify the performance of SSL. After GLP\npretraining, it is transferred for TVR detection. The proposed two-stage\ntraining improved the performance of pure SSL, and the transferability of GLP\nexhibited distinctiveness. After GLP processing, the classification exhibited a\nnotable enhancement, with averaged accuracy rising from 0.63 to 0.90. All\nevaluated metrics demonstrated substantial superiority (p < 0.01) compared to\nprior GLP processing. Our study effectively engages in translational\nengineering by transferring patient progression of cardiovascular laboratory\nparameters from one patient group to another, transcending the limitations of\ndata availability. The transferability of disease progression optimized the\nstrategies of examinations and treatments, and improves patient prognosis while\nusing commonly available laboratory parameters. The potential for expanding\nthis approach to encompass other diseases holds great promise.\n","authors":["Li-Chin Chen","Kuo-Hsuan Hung","Yi-Ju Tseng","Hsin-Yao Wang","Tse-Min Lu","Wei-Chieh Huang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2303.06980v3.pdf","comment":"published in IEEE Journal of Translational Engineering in Health &\n Medicine"},{"id":"http://arxiv.org/abs/2212.08171v2","updated":"2023-08-23T01:52:33Z","published":"2022-12-15T22:11:34Z","title":"Graphon Pooling for Reducing Dimensionality of Signals and Convolutional\n Operators on Graphs","summary":" In this paper we propose a pooling approach for convolutional information\nprocessing on graphs relying on the theory of graphons and limits of dense\ngraph sequences. We present three methods that exploit the induced graphon\nrepresentation of graphs and graph signals on partitions of [0, 1]2 in the\ngraphon space. As a result we derive low dimensional representations of the\nconvolutional operators, while a dimensionality reduction of the signals is\nachieved by simple local interpolation of functions in L2([0, 1]). We prove\nthat those low dimensional representations constitute a convergent sequence of\ngraphs and graph signals, respectively. The methods proposed and the\ntheoretical guarantees that we provide show that the reduced graphs and signals\ninherit spectral-structural properties of the original quantities. We evaluate\nour approach with a set of numerical experiments performed on graph neural\nnetworks (GNNs) that rely on graphon pooling. We observe that graphon pooling\nperforms significantly better than other approaches proposed in the literature\nwhen dimensionality reduction ratios between layers are large. We also observe\nthat when graphon pooling is used we have, in general, less overfitting and\nlower computational cost.\n","authors":["Alejandro Parada-Mayorga","Zhiyang Wang","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2212.08171v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11863v1","updated":"2023-08-23T01:44:28Z","published":"2023-08-23T01:44:28Z","title":"KinSPEAK: Improving speech recognition for Kinyarwanda via\n semi-supervised learning methods","summary":" Despite recent availability of large transcribed Kinyarwanda speech data,\nachieving robust speech recognition for Kinyarwanda is still challenging. In\nthis work, we show that using self-supervised pre-training, following a simple\ncurriculum schedule during fine-tuning and using semi-supervised learning to\nleverage large unlabelled speech data significantly improve speech recognition\nperformance for Kinyarwanda. Our approach focuses on using public domain data\nonly. A new studio-quality speech dataset is collected from a public website,\nthen used to train a clean baseline model. The clean baseline model is then\nused to rank examples from a more diverse and noisy public dataset, defining a\nsimple curriculum training schedule. Finally, we apply semi-supervised learning\nto label and learn from large unlabelled data in four successive generations.\nOur final model achieves 3.2% word error rate (WER) on the new dataset and\n15.9% WER on Mozilla Common Voice benchmark, which is state-of-the-art to the\nbest of our knowledge. Our experiments also indicate that using syllabic rather\nthan character-based tokenization results in better speech recognition\nperformance for Kinyarwanda.\n","authors":["Antoine Nzeyimana"],"pdf_url":"https://arxiv.org/pdf/2308.11863v1.pdf","comment":"9 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2207.03364v3","updated":"2023-08-23T01:41:28Z","published":"2022-07-07T15:12:02Z","title":"Group Equality in Adaptive Submodular Maximization","summary":" In this paper, we study the classic submodular maximization problem subject\nto a group equality constraint under both non-adaptive and adaptive settings.\nIt has been shown that the utility function of many machine learning\napplications, including data summarization, influence maximization in social\nnetworks, and personalized recommendation, satisfies the property of\nsubmodularity. Hence, maximizing a submodular function subject to various\nconstraints can be found at the heart of many of those applications. On a high\nlevel, submodular maximization aims to select a group of most representative\nitems (e.g., data points). However, the design of most existing algorithms does\nnot incorporate the fairness constraint, leading to under- or\nover-representation of some particular groups. This motivates us to study the\nsubmodular maximization problem with group equality, where we aim to select a\ngroup of items to maximize a (possibly non-monotone) submodular utility\nfunction subject to a group equality constraint. To this end, we develop the\nfirst constant-factor approximation algorithm for this problem. The design of\nour algorithm is robust enough to be extended to solving the submodular\nmaximization problem under a more complicated adaptive setting. Moreover, we\nfurther extend our study to incorporating a global cardinality constraint and\nother fairness notations.\n","authors":["Shaojie Tang","Jing Yuan"],"pdf_url":"https://arxiv.org/pdf/2207.03364v3.pdf","comment":"This paper has been accepted by INFORMS Journal on Computing"},{"id":"http://arxiv.org/abs/2308.11854v1","updated":"2023-08-23T01:08:01Z","published":"2023-08-23T01:08:01Z","title":"Finding the Perfect Fit: Applying Regression Models to ClimateBench v1.0","summary":" Climate projections using data driven machine learning models acting as\nemulators, is one of the prevailing areas of research to enable policy makers\nmake informed decisions. Use of machine learning emulators as surrogates for\ncomputationally heavy GCM simulators reduces time and carbon footprints. In\nthis direction, ClimateBench [1] is a recently curated benchmarking dataset for\nevaluating the performance of machine learning emulators designed for climate\ndata. Recent studies have reported that despite being considered fundamental,\nregression models offer several advantages pertaining to climate emulations. In\nparticular, by leveraging the kernel trick, regression models can capture\ncomplex relationships and improve their predictive capabilities. This study\nfocuses on evaluating non-linear regression models using the aforementioned\ndataset. Specifically, we compare the emulation capabilities of three\nnon-linear regression models. Among them, Gaussian Process Regressor\ndemonstrates the best-in-class performance against standard evaluation metrics\nused for climate field emulation studies. However, Gaussian Process Regression\nsuffers from being computational resource hungry in terms of space and time\ncomplexity. Alternatively, Support Vector and Kernel Ridge models also deliver\ncompetitive results and but there are certain trade-offs to be addressed.\nAdditionally, we are actively investigating the performance of composite\nkernels and techniques such as variational inference to further enhance the\nperformance of the regression models and effectively model complex non-linear\npatterns, including phenomena like precipitation.\n","authors":["Anmol Chaure","Ashok Kumar Behera","Sudip Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.11854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08973v6","updated":"2023-08-23T01:05:39Z","published":"2023-02-17T16:19:26Z","title":"Measuring Equality in Machine Learning Security Defenses: A Case Study\n in Speech Recognition","summary":" Over the past decade, the machine learning security community has developed a\nmyriad of defenses for evasion attacks. An understudied question in that\ncommunity is: for whom do these defenses defend? This work considers common\napproaches to defending learned systems and how security defenses result in\nperformance inequities across different sub-populations. We outline appropriate\nparity metrics for analysis and begin to answer this question through empirical\nresults of the fairness implications of machine learning security methods. We\nfind that many methods that have been proposed can cause direct harm, like\nfalse rejection and unequal benefits from robustness training. The framework we\npropose for measuring defense equality can be applied to robustly trained\nmodels, preprocessing-based defenses, and rejection methods. We identify a set\nof datasets with a user-centered application and a reasonable computational\ncost suitable for case studies in measuring the equality of defenses. In our\ncase study of speech command recognition, we show how such adversarial training\nand augmentation have non-equal but complex protections for social subgroups\nacross gender, accent, and age in relation to user coverage. We present a\ncomparison of equality between two rejection-based defenses: randomized\nsmoothing and neural rejection, finding randomized smoothing more equitable due\nto the sampling mechanism for minority groups. This represents the first work\nexamining the disparity in the adversarial robustness in the speech domain and\nthe fairness evaluation of rejection-based defenses.\n","authors":["Luke E. Richards","Edward Raff","Cynthia Matuszek"],"pdf_url":"https://arxiv.org/pdf/2302.08973v6.pdf","comment":"Accepted to AISec'23"},{"id":"http://arxiv.org/abs/2308.11849v1","updated":"2023-08-23T00:55:39Z","published":"2023-08-23T00:55:39Z","title":"A deep reinforcement learning approach for real-time demand-responsive\n railway rescheduling to mitigate station overcrowding using mobile data","summary":" Real-time railway rescheduling is a timely and flexible technique to\nautomatically alter the operation schedule in response to time-varying\nconditions. Current research lacks data-driven approaches that capture\nreal-time passenger mobility during railway disruptions, relying mostly on\nOD-based data and model-based methods for estimating demands of trains.\nMeanwhile, the schedule-updating principles for a long-term disruption overlook\nthe uneven distribution of demand over time. To fill this gap, this paper\nproposes a demand-responsive approach by inferring real-world passenger\nmobility from mobile data (MD) to facilitate real-time rescheduling. Unlike\nnetwork-level approaches, this paper focuses on a heavy-demand station upstream\nof the disrupted area. The objective is to reschedule all trains on multiple\nroutes passing through this target station, which have been affected by a\nsevere emergency event such as a natural disaster. Particular attention should\nbe given to avoiding the accumulation of overcrowded passengers at this\nstation, to prevent additional accidents arising from overcrowding. This\nresearch addresses the challenges associated with this scenario, including the\ndynamics of arriving and leaving of passengers, station overcrowding, rolling\nstock shortage, open-ended disruption duration, integrated rescheduling on\nmultiple routes, and delays due to detours. A deep reinforcement learning (DRL)\nframework is proposed to determine the optimal rescheduled timetable, route\nstops, and rolling stock allocation, while considering real-time demand\nsatisfaction, station overcrowding, train capacity utilization, and headway\nsafety.\n","authors":["Enze Liu","Zhiyuan Lin","Judith Y. T. Wang","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11849v1.pdf","comment":"36 pages,16 figures"},{"id":"http://arxiv.org/abs/2308.11845v1","updated":"2023-08-23T00:49:29Z","published":"2023-08-23T00:49:29Z","title":"SEA: Shareable and Explainable Attribution for Query-based Black-box\n Attacks","summary":" Machine Learning (ML) systems are vulnerable to adversarial examples,\nparticularly those from query-based black-box attacks. Despite various efforts\nto detect and prevent such attacks, there is a need for a more comprehensive\napproach to logging, analyzing, and sharing evidence of attacks. While classic\nsecurity benefits from well-established forensics and intelligence sharing,\nMachine Learning is yet to find a way to profile its attackers and share\ninformation about them. In response, this paper introduces SEA, a novel ML\nsecurity system to characterize black-box attacks on ML systems for forensic\npurposes and to facilitate human-explainable intelligence sharing. SEA\nleverages the Hidden Markov Models framework to attribute the observed query\nsequence to known attacks. It thus understands the attack's progression rather\nthan just focusing on the final adversarial examples. Our evaluations reveal\nthat SEA is effective at attack attribution, even on their second occurrence,\nand is robust to adaptive strategies designed to evade forensics analysis.\nInterestingly, SEA's explanations of the attack behavior allow us even to\nfingerprint specific minor implementation bugs in attack libraries. For\nexample, we discover that the SignOPT and Square attacks implementation in ART\nv1.14 sends over 50% specific zero difference queries. We thoroughly evaluate\nSEA on a variety of settings and demonstrate that it can recognize the same\nattack's second occurrence with 90+% Top-1 and 95+% Top-3 accuracy.\n","authors":["Yue Gao","Ilia Shumailov","Kassem Fawaz"],"pdf_url":"https://arxiv.org/pdf/2308.11845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11842v1","updated":"2023-08-23T00:18:17Z","published":"2023-08-23T00:18:17Z","title":"${\\rm E}(3)$-Equivariant Actor-Critic Methods for Cooperative\n Multi-Agent Reinforcement Learning","summary":" Identification and analysis of symmetrical patterns in the natural world have\nled to significant discoveries across various scientific fields, such as the\nformulation of gravitational laws in physics and advancements in the study of\nchemical structures. In this paper, we focus on exploiting Euclidean symmetries\ninherent in certain cooperative multi-agent reinforcement learning (MARL)\nproblems and prevalent in many applications. We begin by formally\ncharacterizing a subclass of Markov games with a general notion of symmetries\nthat admits the existence of symmetric optimal values and policies. Motivated\nby these properties, we design neural network architectures with symmetric\nconstraints embedded as an inductive bias for multi-agent actor-critic methods.\nThis inductive bias results in superior performance in various cooperative MARL\nbenchmarks and impressive generalization capabilities such as zero-shot\nlearning and transfer learning in unseen scenarios with repeated symmetric\npatterns. The code is available at: https://github.com/dchen48/E3AC.\n","authors":["Dingyang Chen","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11841v1","updated":"2023-08-23T00:17:51Z","published":"2023-08-23T00:17:51Z","title":"A Survey for Federated Learning Evaluations: Goals and Measures","summary":" Evaluation is a systematic approach to assessing how well a system achieves\nits intended purpose. Federated learning (FL) is a novel paradigm for\nprivacy-preserving machine learning that allows multiple parties to\ncollaboratively train models without sharing sensitive data. However,\nevaluating FL is challenging due to its interdisciplinary nature and diverse\ngoals, such as utility, efficiency, and security. In this survey, we first\nreview the major evaluation goals adopted in the existing studies and then\nexplore the evaluation metrics used for each goal. We also introduce FedEval,\nan open-source platform that provides a standardized and comprehensive\nevaluation framework for FL algorithms in terms of their utility, efficiency,\nand security. Finally, we discuss several challenges and future research\ndirections for FL evaluation.\n","authors":["Di Chai","Leye Wang","Liu Yang","Junxue Zhang","Kai Chen","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2308.11841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11838v1","updated":"2023-08-23T00:10:29Z","published":"2023-08-23T00:10:29Z","title":"A Benchmark Study on Calibration","summary":" Deep neural networks are increasingly utilized in various machine learning\ntasks. However, as these models grow in complexity, they often face calibration\nissues, despite enhanced prediction accuracy. Many studies have endeavored to\nimprove calibration performance through data preprocessing, the use of specific\nloss functions, and training frameworks. Yet, investigations into calibration\nproperties have been somewhat overlooked. Our study leverages the Neural\nArchitecture Search (NAS) search space, offering an exhaustive model\narchitecture space for thorough calibration properties exploration. We\nspecifically create a model calibration dataset. This dataset evaluates 90\nbin-based and 12 additional calibration measurements across 117,702 unique\nneural networks within the widely employed NATS-Bench search space. Our\nanalysis aims to answer several longstanding questions in the field, using our\nproposed dataset: (i) Can model calibration be generalized across different\ntasks? (ii) Can robustness be used as a calibration measurement? (iii) How\nreliable are calibration metrics? (iv) Does a post-hoc calibration method\naffect all models uniformly? (v) How does calibration interact with accuracy?\n(vi) What is the impact of bin size on calibration measurement? (vii) Which\narchitectural designs are beneficial for calibration? Additionally, our study\nbridges an existing gap by exploring calibration within NAS. By providing this\ndataset, we enable further research into NAS calibration. As far as we are\naware, our research represents the first large-scale investigation into\ncalibration properties and the premier study of calibration issues within NAS.\n","authors":["Linwei Tao","Younan Zhu","Haolan Guo","Minjing Dong","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.11838v1.pdf","comment":"39 pages, 35 figures"},{"id":"http://arxiv.org/abs/2206.07240v2","updated":"2023-08-23T22:54:40Z","published":"2022-06-15T01:57:12Z","title":"Test-Time Adaptation for Visual Document Understanding","summary":" For visual document understanding (VDU), self-supervised pretraining has been\nshown to successfully generate transferable representations, yet, effective\nadaptation of such representations to distribution shifts at test-time remains\nto be an unexplored area. We propose DocTTA, a novel test-time adaptation\nmethod for documents, that does source-free domain adaptation using unlabeled\ntarget document data. DocTTA leverages cross-modality self-supervised learning\nvia masked visual language modeling, as well as pseudo labeling to adapt models\nlearned on a \\textit{source} domain to an unlabeled \\textit{target} domain at\ntest time. We introduce new benchmarks using existing public datasets for\nvarious VDU tasks, including entity recognition, key-value extraction, and\ndocument visual question answering. DocTTA shows significant improvements on\nthese compared to the source model performance, up to 1.89\\% in (F1 score),\n3.43\\% (F1 score), and 17.68\\% (ANLS score), respectively. Our benchmark\ndatasets are available at \\url{https://saynaebrahimi.github.io/DocTTA.html}.\n","authors":["Sayna Ebrahimi","Sercan O. Arik","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2206.07240v2.pdf","comment":"Accepted at TMLR 2023"},{"id":"http://arxiv.org/abs/2308.12459v1","updated":"2023-08-23T22:50:52Z","published":"2023-08-23T22:50:52Z","title":"Zero-delay Consistent Signal Reconstruction from Streamed Multivariate\n Time Series","summary":" Digitalizing real-world analog signals typically involves sampling in time\nand discretizing in amplitude. Subsequent signal reconstructions inevitably\nincur an error that depends on the amplitude resolution and the temporal\ndensity of the acquired samples. From an implementation viewpoint, consistent\nsignal reconstruction methods have proven a profitable error-rate decay as the\nsampling rate increases. Despite that, these results are obtained under offline\nsettings. Therefore, a research gap exists regarding methods for consistent\nsignal reconstruction from data streams. This paper presents a method that\nconsistently reconstructs streamed multivariate time series of quantization\nintervals under a zero-delay response requirement. On the other hand, previous\nwork has shown that the temporal dependencies within univariate time series can\nbe exploited to reduce the roughness of zero-delay signal reconstructions. This\nwork shows that the spatiotemporal dependencies within multivariate time series\ncan also be exploited to achieve improved results. Specifically, the\nspatiotemporal dependencies of the multivariate time series are learned, with\nthe assistance of a recurrent neural network, to reduce the roughness of the\nsignal reconstruction on average while ensuring consistency. Our experiments\nshow that our proposed method achieves a favorable error-rate decay with the\nsampling rate compared to a similar but non-consistent reconstruction.\n","authors":["Emilio Ruiz-Moreno","Luis Miguel López-Ramos","Baltasar Beferull-Lozano"],"pdf_url":"https://arxiv.org/pdf/2308.12459v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.12454v1","updated":"2023-08-23T22:38:35Z","published":"2023-08-23T22:38:35Z","title":"PFL-GAN: When Client Heterogeneity Meets Generative Models in\n Personalized Federated Learning","summary":" Recent advances of generative learning models are accompanied by the growing\ninterest in federated learning (FL) based on generative adversarial network\n(GAN) models. In the context of FL, GAN can capture the underlying client data\nstructure, and regenerate samples resembling the original data distribution\nwithout compromising the private raw data. Although most existing GAN-based FL\nworks focus on training a global model, Personalized FL (PFL) sometimes can be\nmore effective in view of client data heterogeneity in terms of distinct data\nsample distributions, feature spaces, and labels. To cope with client\nheterogeneity in GAN-based FL, we propose a novel GAN sharing and aggregation\nstrategy for PFL. The proposed PFL-GAN addresses the client heterogeneity in\ndifferent scenarios. More specially, we first learn the similarity among\nclients and then develop an weighted collaborative data aggregation. The\nempirical results through the rigorous experimentation on several well-known\ndatasets demonstrate the effectiveness of PFL-GAN.\n","authors":["Achintha Wijesinghe","Songyang Zhang","Zhi Ding"],"pdf_url":"https://arxiv.org/pdf/2308.12454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12453v1","updated":"2023-08-23T22:34:49Z","published":"2023-08-23T22:34:49Z","title":"Augmenting medical image classifiers with synthetic data from latent\n diffusion models","summary":" While hundreds of artificial intelligence (AI) algorithms are now approved or\ncleared by the US Food and Drugs Administration (FDA), many studies have shown\ninconsistent generalization or latent bias, particularly for underrepresented\npopulations. Some have proposed that generative AI could reduce the need for\nreal data, but its utility in model development remains unclear. Skin disease\nserves as a useful case study in synthetic image generation due to the\ndiversity of disease appearance, particularly across the protected attribute of\nskin tone. Here we show that latent diffusion models can scalably generate\nimages of skin disease and that augmenting model training with these data\nimproves performance in data-limited settings. These performance gains saturate\nat synthetic-to-real image ratios above 10:1 and are substantially smaller than\nthe gains obtained from adding real images. As part of our analysis, we\ngenerate and analyze a new dataset of 458,920 synthetic images produced using\nseveral generation strategies. Our results suggest that synthetic data could\nserve as a force-multiplier for model development, but the collection of\ndiverse real-world data remains the most important step to improve medical AI\nalgorithms.\n","authors":["Luke W. Sagers","James A. Diao","Luke Melas-Kyriazi","Matthew Groh","Pranav Rajpurkar","Adewole S. Adamson","Veronica Rotemberg","Roxana Daneshjou","Arjun K. Manrai"],"pdf_url":"https://arxiv.org/pdf/2308.12453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.04118v3","updated":"2023-08-23T22:23:32Z","published":"2022-10-08T22:44:14Z","title":"Convergence of the Backward Deep BSDE Method with Applications to\n Optimal Stopping Problems","summary":" The optimal stopping problem is one of the core problems in financial\nmarkets, with broad applications such as pricing American and Bermudan options.\nThe deep BSDE method [Han, Jentzen and E, PNAS, 115(34):8505-8510, 2018] has\nshown great power in solving high-dimensional forward-backward stochastic\ndifferential equations (FBSDEs), and inspired many applications. However, the\nmethod solves backward stochastic differential equations (BSDEs) in a forward\nmanner, which can not be used for optimal stopping problems that in general\nrequire running BSDE backwardly. To overcome this difficulty, a recent paper\n[Wang, Chen, Sudjianto, Liu and Shen, arXiv:1807.06622, 2018] proposed the\nbackward deep BSDE method to solve the optimal stopping problem. In this paper,\nwe provide the rigorous theory for the backward deep BSDE method. Specifically,\n1. We derive the a posteriori error estimation, i.e., the error of the\nnumerical solution can be bounded by the training loss function; and; 2. We\ngive an upper bound of the loss function, which can be sufficiently small\nsubject to universal approximations. We give two numerical examples, which\npresent consistent performance with the proved theory.\n","authors":["Chengfan Gao","Siping Gao","Ruimeng Hu","Zimu Zhu"],"pdf_url":"https://arxiv.org/pdf/2210.04118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01770v2","updated":"2023-08-23T22:14:51Z","published":"2023-03-03T08:22:51Z","title":"Quantized Radio Map Estimation Using Tensor and Deep Generative Models","summary":" Spectrum cartography (SC), also known as radio map estimation (RME), aims at\ncrafting multi-domain (e.g., frequency and space) radio power propagation maps\nfrom limited sensor measurements. While early methods often lacked theoretical\nsupport, recent works have demonstrated that radio maps can be provably\nrecovered using low-dimensional models -- such as the block-term tensor\ndecomposition (BTD) model and certain deep generative models (DGMs) -- of the\nhigh-dimensional multi-domain radio signals. However, these existing provable\nSC approaches assume that sensors send real-valued (full-resolution)\nmeasurements to the fusion center, which is unrealistic. This work puts forth a\nquantized SC framework that generalizes the BTD and DGM-based SC to scenarios\nwhere heavily quantized sensor measurements are used. A maximum likelihood\nestimation (MLE)-based SC framework under a Gaussian quantizer is proposed.\nRecoverability of the radio map using the MLE criterion are characterized under\nrealistic conditions, e.g., imperfect radio map modeling and noisy\nmeasurements. Simulations and real-data experiments are used to showcase the\neffectiveness of the proposed approach.\n","authors":["Subash Timilsina","Sagar Shrestha","Xiao Fu"],"pdf_url":"https://arxiv.org/pdf/2303.01770v2.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2304.01075v3","updated":"2023-08-23T21:59:40Z","published":"2023-04-03T15:32:38Z","title":"Conformal Prediction Regions for Time Series using Linear\n Complementarity Programming","summary":" Conformal prediction is a statistical tool for producing prediction regions\nof machine learning models that are valid with high probability. However,\napplying conformal prediction to time series data leads to conservative\nprediction regions. In fact, to obtain prediction regions over $T$ time steps\nwith confidence $1-\\delta$, {previous works require that each individual\nprediction region is valid} with confidence $1-\\delta/T$. We propose an\noptimization-based method for reducing this conservatism to enable long horizon\nplanning and verification when using learning-enabled time series predictors.\nInstead of considering prediction errors individually at each time step, we\nconsider a parameterized prediction error over multiple time steps. By\noptimizing the parameters over an additional dataset, we find prediction\nregions that are not conservative. We show that this problem can be cast as a\nmixed integer linear complementarity program (MILCP), which we then relax into\na linear complementarity program (LCP). Additionally, we prove that the relaxed\nLP has the same optimal cost as the original MILCP. Finally, we demonstrate the\nefficacy of our method on case studies using pedestrian trajectory predictors\nand F16 fighter jet altitude predictors.\n","authors":["Matthew Cleaveland","Insup Lee","George J. Pappas","Lars Lindemann"],"pdf_url":"https://arxiv.org/pdf/2304.01075v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08847v2","updated":"2023-08-23T21:57:10Z","published":"2023-04-18T09:22:32Z","title":"BadVFL: Backdoor Attacks in Vertical Federated Learning","summary":" Federated learning (FL) enables multiple parties to collaboratively train a\nmachine learning model without sharing their data; rather, they train their own\nmodel locally and send updates to a central server for aggregation. Depending\non how the data is distributed among the participants, FL can be classified\ninto Horizontal (HFL) and Vertical (VFL). In VFL, the participants share the\nsame set of training instances but only host a different and non-overlapping\nsubset of the whole feature space. Whereas in HFL, each participant shares the\nsame set of features while the training set is split into locally owned\ntraining data subsets.\n VFL is increasingly used in applications like financial fraud detection;\nnonetheless, very little work has analyzed its security. In this paper, we\nfocus on robustness in VFL, in particular, on backdoor attacks, whereby an\nadversary attempts to manipulate the aggregate model during the training\nprocess to trigger misclassifications. Performing backdoor attacks in VFL is\nmore challenging than in HFL, as the adversary i) does not have access to the\nlabels during training and ii) cannot change the labels as she only has access\nto the feature embeddings. We present a first-of-its-kind clean-label backdoor\nattack in VFL, which consists of two phases: a label inference and a backdoor\nphase. We demonstrate the effectiveness of the attack on three different\ndatasets, investigate the factors involved in its success, and discuss\ncountermeasures to mitigate its impact.\n","authors":["Mohammad Naseri","Yufei Han","Emiliano De Cristofaro"],"pdf_url":"https://arxiv.org/pdf/2304.08847v2.pdf","comment":"Accepted for publication at the 45th IEEE Symposium on Security &\n Privacy (S&P 2024). Please cite accordingly"},{"id":"http://arxiv.org/abs/2308.12445v1","updated":"2023-08-23T21:55:20Z","published":"2023-08-23T21:55:20Z","title":"An Intentional Forgetting-Driven Self-Healing Method For Deep\n Reinforcement Learning Systems","summary":" Deep reinforcement learning (DRL) is increasingly applied in large-scale\nproductions like Netflix and Facebook. As with most data-driven systems, DRL\nsystems can exhibit undesirable behaviors due to environmental drifts, which\noften occur in constantly-changing production settings. Continual Learning (CL)\nis the inherent self-healing approach for adapting the DRL agent in response to\nthe environment's conditions shifts. However, successive shifts of considerable\nmagnitude may cause the production environment to drift from its original\nstate. Recent studies have shown that these environmental drifts tend to drive\nCL into long, or even unsuccessful, healing cycles, which arise from\ninefficiencies such as catastrophic forgetting, warm-starting failure, and slow\nconvergence. In this paper, we propose Dr. DRL, an effective self-healing\napproach for DRL systems that integrates a novel mechanism of intentional\nforgetting into vanilla CL to overcome its main issues. Dr. DRL deliberately\nerases the DRL system's minor behaviors to systematically prioritize the\nadaptation of the key problem-solving skills. Using well-established DRL\nalgorithms, Dr. DRL is compared with vanilla CL on various drifted\nenvironments. Dr. DRL is able to reduce, on average, the healing time and\nfine-tuning episodes by, respectively, 18.74% and 17.72%. Dr. DRL successfully\nhelps agents to adapt to 19.63% of drifted environments left unsolved by\nvanilla CL while maintaining and even enhancing by up to 45% the obtained\nrewards for drifted environments that are resolved by both approaches.\n","authors":["Ahmed Haj Yahmed","Rached Bouchoucha","Houssem Ben Braiek","Foutse Khomh"],"pdf_url":"https://arxiv.org/pdf/2308.12445v1.pdf","comment":"Accepted for publication in The 38th IEEE/ACM International\n Conference on Automated Software Engineering (ASE 2023)"},{"id":"http://arxiv.org/abs/2202.12429v3","updated":"2023-08-23T21:54:32Z","published":"2022-02-24T23:54:12Z","title":"BagPipe: Accelerating Deep Recommendation Model Training","summary":" Deep learning based recommendation models (DLRM) are widely used in several\nbusiness critical applications. Training such recommendation models efficiently\nis challenging because they contain billions of embedding-based parameters,\nleading to significant overheads from embedding access. By profiling existing\nsystems for DLRM training, we observe that around 75\\% of the iteration time is\nspent on embedding access and model synchronization. Our key insight in this\npaper is that embedding access has a specific structure which can be used to\naccelerate training. We observe that embedding accesses are heavily skewed,\nwith around 1\\% of embeddings representing more than 92\\% of total accesses.\nFurther, we observe that during offline training we can lookahead at future\nbatches to determine exactly which embeddings will be needed at what iteration\nin the future. Based on these insights, we develop Bagpipe, a system for\ntraining deep recommendation models that uses caching and prefetching to\noverlap remote embedding accesses with the computation. We design an Oracle\nCacher, a new component that uses a lookahead algorithm to generate optimal\ncache update decisions while providing strong consistency guarantees against\nstaleness. We also design a logically replicated, physically partitioned cache\nand show that our design can reduce synchronization overheads in a distributed\nsetting. Finally, we propose a disaggregated system architecture and show that\nour design can enable low-overhead fault tolerance. Our experiments using three\ndatasets and four models show that Bagpipe provides a speed up of up to 5.6x\ncompared to state of the art baselines, while providing the same convergence\nand reproducibility guarantees as synchronous training.\n","authors":["Saurabh Agarwal","Chengpo Yan","Ziyi Zhang","Shivaram Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2202.12429v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12443v1","updated":"2023-08-23T21:51:24Z","published":"2023-08-23T21:51:24Z","title":"TAI-GAN: Temporally and Anatomically Informed GAN for early-to-late\n frame conversion in dynamic cardiac PET motion correction","summary":" The rapid tracer kinetics of rubidium-82 ($^{82}$Rb) and high variation of\ncross-frame distribution in dynamic cardiac positron emission tomography (PET)\nraise significant challenges for inter-frame motion correction, particularly\nfor the early frames where conventional intensity-based image registration\ntechniques are not applicable. Alternatively, a promising approach utilizes\ngenerative methods to handle the tracer distribution changes to assist existing\nregistration methods. To improve frame-wise registration and parametric\nquantification, we propose a Temporally and Anatomically Informed Generative\nAdversarial Network (TAI-GAN) to transform the early frames into the late\nreference frame using an all-to-one mapping. Specifically, a feature-wise\nlinear modulation layer encodes channel-wise parameters generated from temporal\ntracer kinetics information, and rough cardiac segmentations with local shifts\nserve as the anatomical information. We validated our proposed method on a\nclinical $^{82}$Rb PET dataset and found that our TAI-GAN can produce converted\nearly frames with high image quality, comparable to the real reference frames.\nAfter TAI-GAN conversion, motion estimation accuracy and clinical myocardial\nblood flow (MBF) quantification were improved compared to using the original\nframes. Our code is published at https://github.com/gxq1998/TAI-GAN.\n","authors":["Xueqi Guo","Luyao Shi","Xiongchao Chen","Bo Zhou","Qiong Liu","Huidong Xie","Yi-Hwa Liu","Richard Palyo","Edward J. Miller","Albert J. Sinusas","Bruce Spottiswoode","Chi Liu","Nicha C. Dvornek"],"pdf_url":"https://arxiv.org/pdf/2308.12443v1.pdf","comment":"Accepted by Simulation and Synthesis in Medical Imaging (SASHIMI\n 2023, MICCAI workshop), preprint version"},{"id":"http://arxiv.org/abs/2308.12439v1","updated":"2023-08-23T21:47:06Z","published":"2023-08-23T21:47:06Z","title":"BaDExpert: Extracting Backdoor Functionality for Accurate Backdoor Input\n Detection","summary":" We present a novel defense, against backdoor attacks on Deep Neural Networks\n(DNNs), wherein adversaries covertly implant malicious behaviors (backdoors)\ninto DNNs. Our defense falls within the category of post-development defenses\nthat operate independently of how the model was generated. The proposed defense\nis built upon a novel reverse engineering approach that can directly extract\nbackdoor functionality of a given backdoored model to a backdoor expert model.\nThe approach is straightforward -- finetuning the backdoored model over a small\nset of intentionally mislabeled clean samples, such that it unlearns the normal\nfunctionality while still preserving the backdoor functionality, and thus\nresulting in a model (dubbed a backdoor expert model) that can only recognize\nbackdoor inputs. Based on the extracted backdoor expert model, we show the\nfeasibility of devising highly accurate backdoor input detectors that filter\nout the backdoor inputs during model inference. Further augmented by an\nensemble strategy with a finetuned auxiliary model, our defense, BaDExpert\n(Backdoor Input Detection with Backdoor Expert), effectively mitigates 16 SOTA\nbackdoor attacks while minimally impacting clean utility. The effectiveness of\nBaDExpert has been verified on multiple datasets (CIFAR10, GTSRB and ImageNet)\nacross various model architectures (ResNet, VGG, MobileNetV2 and Vision\nTransformer).\n","authors":["Tinghao Xie","Xiangyu Qi","Ping He","Yiming Li","Jiachen T. Wang","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2308.12439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12438v1","updated":"2023-08-23T21:44:09Z","published":"2023-08-23T21:44:09Z","title":"Deploying Deep Reinforcement Learning Systems: A Taxonomy of Challenges","summary":" Deep reinforcement learning (DRL), leveraging Deep Learning (DL) in\nreinforcement learning, has shown significant potential in achieving\nhuman-level autonomy in a wide range of domains, including robotics, computer\nvision, and computer games. This potential justifies the enthusiasm and growing\ninterest in DRL in both academia and industry. However, the community currently\nfocuses mostly on the development phase of DRL systems, with little attention\ndevoted to DRL deployment. In this paper, we propose an empirical study on\nStack Overflow (SO), the most popular Q&A forum for developers, to uncover and\nunderstand the challenges practitioners faced when deploying DRL systems.\nSpecifically, we categorized relevant SO posts by deployment platforms:\nserver/cloud, mobile/embedded system, browser, and game engine. After filtering\nand manual analysis, we examined 357 SO posts about DRL deployment,\ninvestigated the current state, and identified the challenges related to\ndeploying DRL systems. Then, we investigate the prevalence and difficulty of\nthese challenges. Results show that the general interest in DRL deployment is\ngrowing, confirming the study's relevance and importance. Results also show\nthat DRL deployment is more difficult than other DRL issues. Additionally, we\nbuilt a taxonomy of 31 unique challenges in deploying DRL to different\nplatforms. On all platforms, RL environment-related challenges are the most\npopular, and communication-related challenges are the most difficult among\npractitioners. We hope our study inspires future research and helps the\ncommunity overcome the most common and difficult challenges practitioners face\nwhen deploying DRL systems.\n","authors":["Ahmed Haj Yahmed","Altaf Allah Abbassi","Amin Nikanjam","Heng Li","Foutse Khomh"],"pdf_url":"https://arxiv.org/pdf/2308.12438v1.pdf","comment":"Accepted for publication in The International Conference on Software\n Maintenance and Evolution (ICSME 2023)"},{"id":"http://arxiv.org/abs/2306.03398v2","updated":"2023-08-23T21:30:43Z","published":"2023-06-06T04:28:12Z","title":"Minimum intrinsic dimension scaling for entropic optimal transport","summary":" Motivated by the manifold hypothesis, which states that data with a high\nextrinsic dimension may yet have a low intrinsic dimension, we develop refined\nstatistical bounds for entropic optimal transport that are sensitive to the\nintrinsic dimension of the data. Our bounds involve a robust notion of\nintrinsic dimension, measured at only a single distance scale depending on the\nregularization parameter, and show that it is only the minimum of these\nsingle-scale intrinsic dimensions which governs the rate of convergence. We\ncall this the Minimum Intrinsic Dimension scaling (MID scaling) phenomenon, and\nestablish MID scaling with no assumptions on the data distributions so long as\nthe cost is bounded and Lipschitz, and for various entropic optimal transport\nquantities beyond just values, with stronger analogs when one distribution is\nsupported on a manifold. Our results significantly advance the theoretical\nstate of the art by showing that MID scaling is a generic phenomenon, and\nprovide the first rigorous interpretation of the statistical effect of entropic\nregularization as a distance scale.\n","authors":["Austin J. Stromme"],"pdf_url":"https://arxiv.org/pdf/2306.03398v2.pdf","comment":"53 pages"},{"id":"http://arxiv.org/abs/2305.14706v2","updated":"2023-08-23T21:22:01Z","published":"2023-05-24T04:22:38Z","title":"PruMUX: Augmenting Data Multiplexing with Model Compression","summary":" As language models increase in size by the day, methods for efficient\ninference are critical to leveraging their capabilities for various\napplications. Prior work has investigated techniques like model pruning,\nknowledge distillation, and data multiplexing to increase model throughput\nwithout sacrificing accuracy. In this paper, we combine two such methods --\nstructured pruning and data multiplexing -- to compound the speedup gains\nobtained by either method. Our approach, PruMUX, obtains up to 7.5-29.5X\nthroughput improvement over BERT-base model with accuracy threshold from 80% to\n74%. We further study various combinations of parameters (such as sparsity and\nmultiplexing factor) in the two techniques to provide a comprehensive analysis\nof the tradeoff between accuracy and throughput in the resulting models. We\nthen propose Auto-PruMUX, a meta-level model that can predict the\nhigh-performance parameters for pruning and multiplexing given a desired\naccuracy loss budget, providing a practical method to leverage the combination\neffectively.\n","authors":["Yushan Su","Vishvak Murahari","Karthik Narasimhan","Kai Li"],"pdf_url":"https://arxiv.org/pdf/2305.14706v2.pdf","comment":"Published at Findings of the Association for Computational\n Linguistics (ACL 2023)"},{"id":"http://arxiv.org/abs/2308.10145v2","updated":"2023-08-23T21:21:20Z","published":"2023-08-20T03:12:10Z","title":"Wasserstein Geodesic Generator for Conditional Distributions","summary":" Generating samples given a specific label requires estimating conditional\ndistributions. We derive a tractable upper bound of the Wasserstein distance\nbetween conditional distributions to lay the theoretical groundwork to learn\nconditional distributions. Based on this result, we propose a novel conditional\ngeneration algorithm where conditional distributions are fully characterized by\na metric space defined by a statistical distance. We employ optimal transport\ntheory to propose the Wasserstein geodesic generator, a new conditional\ngenerator that learns the Wasserstein geodesic. The proposed method learns both\nconditional distributions for observed domains and optimal transport maps\nbetween them. The conditional distributions given unobserved intermediate\ndomains are on the Wasserstein geodesic between conditional distributions given\ntwo observed domain labels. Experiments on face images with light conditions as\ndomain labels demonstrate the efficacy of the proposed method.\n","authors":["Young-geun Kim","Kyungbok Lee","Youngwon Choi","Joong-Ho Won","Myunghee Cho Paik"],"pdf_url":"https://arxiv.org/pdf/2308.10145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12420v1","updated":"2023-08-23T20:42:32Z","published":"2023-08-23T20:42:32Z","title":"Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature","summary":" Distributed Ledger Technologies (DLTs) have rapidly evolved, necessitating\ncomprehensive insights into their diverse components. However, a systematic\nliterature review that emphasizes the Environmental, Sustainability, and\nGovernance (ESG) components of DLT remains lacking. To bridge this gap, we\nselected 107 seed papers to build a citation network of 63,083 references and\nrefined it to a corpus of 24,539 publications for analysis. Then, we labeled\nthe named entities in 46 papers according to twelve top-level categories\nderived from an established technology taxonomy and enhanced the taxonomy by\npinpointing DLT's ESG elements. Leveraging transformer-based language models,\nwe fine-tuned a pre-trained language model for a Named Entity Recognition (NER)\ntask using our labeled dataset. We used our fine-tuned language model to\ndistill the corpus to 505 key papers, facilitating a literature review via\nnamed entities and temporal graph analysis on DLT evolution in the context of\nESG. Our contributions are a methodology to conduct a machine learning-driven\nsystematic literature review in the DLT field, placing a special emphasis on\nESG aspects. Furthermore, we present a first-of-its-kind NER dataset, composed\nof 54,808 named entities, designed for DLT and ESG-related explorations.\n","authors":["Walter Hernandez","Kamil Tylinski","Alastair Moore","Niall Roche","Nikhil Vadgama","Horst Treiblmaier","Jiangbo Shangguan","Paolo Tasca","Jiahua Xu"],"pdf_url":"https://arxiv.org/pdf/2308.12420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09113v2","updated":"2023-08-23T20:16:27Z","published":"2023-08-17T17:44:59Z","title":"Multi-fidelity Fourier Neural Operator for Fast Modeling of Large-Scale\n Geological Carbon Storage","summary":" Deep learning-based surrogate models have been widely applied in geological\ncarbon storage (GCS) problems to accelerate the prediction of reservoir\npressure and CO2 plume migration. Large amounts of data from physics-based\nnumerical simulators are required to train a model to accurately predict the\ncomplex physical behaviors associated with this process. In practice, the\navailable training data are always limited in large-scale 3D problems due to\nthe high computational cost. Therefore, we propose to use a multi-fidelity\nFourier Neural Operator to solve large-scale GCS problems with more affordable\nmulti-fidelity training datasets. The Fourier Neural Operator has a desirable\ngrid-invariant property, which simplifies the transfer learning procedure\nbetween datasets with different discretization. We first test the model\nefficacy on a GCS reservoir model being discretized into 110k grid cells. The\nmulti-fidelity model can predict with accuracy comparable to a high-fidelity\nmodel trained with the same amount of high-fidelity data with 81% less data\ngeneration costs. We further test the generalizability of the multi-fidelity\nmodel on a same reservoir model with a finer discretization of 1 million grid\ncells. This case was made more challenging by employing high-fidelity and\nlow-fidelity datasets generated by different geostatistical models and\nreservoir simulators. We observe that the multi-fidelity FNO model can predict\npressure fields with reasonable accuracy even when the high-fidelity data are\nextremely limited.\n","authors":["Hewei Tang","Qingkai Kong","Joseph P. Morris"],"pdf_url":"https://arxiv.org/pdf/2308.09113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10592v3","updated":"2023-08-23T20:08:52Z","published":"2023-06-18T16:11:40Z","title":"Conditional expectation using compactification operators","summary":" The separate tasks of denoising, least squares expectation, and manifold\nlearning can often be posed in a common setting of finding the conditional\nexpectations arising from a product of two random variables. This paper focuses\non this more general problem and describes an operator theoretic approach to\nestimating the conditional expectation. Kernel integral operators are used as a\ncompactification tool, to set up the estimation problem as a linear inverse\nproblem in a reproducing kernel Hilbert space. This equation is shown to have\nsolutions that allow numerical approximation, thus guaranteeing the convergence\nof data-driven implementations. The overall technique is easy to implement, and\ntheir successful application to some real-world problems are also shown.\n","authors":["Suddhasattwa Das"],"pdf_url":"https://arxiv.org/pdf/2306.10592v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12393v1","updated":"2023-08-23T19:20:24Z","published":"2023-08-23T19:20:24Z","title":"Machine learning in parameter estimation of nonlinear systems","summary":" Accurately estimating parameters in complex nonlinear systems is crucial\nacross scientific and engineering fields. We present a novel approach for\nparameter estimation using a neural network with the Huber loss function. This\nmethod taps into deep learning's abilities to uncover parameters governing\nintricate behaviors in nonlinear equations. We validate our approach using\nsynthetic data and predefined functions that model system dynamics. By training\nthe neural network with noisy time series data, it fine-tunes the Huber loss\nfunction to converge to accurate parameters. We apply our method to damped\noscillators, Van der Pol oscillators, Lotka-Volterra systems, and Lorenz\nsystems under multiplicative noise. The trained neural network accurately\nestimates parameters, evident from closely matching latent dynamics. Comparing\ntrue and estimated trajectories visually reinforces our method's precision and\nrobustness. Our study underscores the Huber loss-guided neural network as a\nversatile tool for parameter estimation, effectively uncovering complex\nrelationships in nonlinear systems. The method navigates noise and uncertainty\nadeptly, showcasing its adaptability to real-world challenges.\n","authors":["Kaushal Kumar"],"pdf_url":"https://arxiv.org/pdf/2308.12393v1.pdf","comment":"23 pages, 7 figures,"},{"id":"http://arxiv.org/abs/2308.12388v1","updated":"2023-08-23T19:01:17Z","published":"2023-08-23T19:01:17Z","title":"FOSA: Full Information Maximum Likelihood (FIML) Optimized\n Self-Attention Imputation for Missing Data","summary":" In data imputation, effectively addressing missing values is pivotal,\nespecially in intricate datasets. This paper delves into the FIML Optimized\nSelf-attention (FOSA) framework, an innovative approach that amalgamates the\nstrengths of Full Information Maximum Likelihood (FIML) estimation with the\ncapabilities of self-attention neural networks. Our methodology commences with\nan initial estimation of missing values via FIML, subsequently refining these\nestimates by leveraging the self-attention mechanism. Our comprehensive\nexperiments on both simulated and real-world datasets underscore FOSA's\npronounced advantages over traditional FIML techniques, encapsulating facets of\naccuracy, computational efficiency, and adaptability to diverse data\nstructures. Intriguingly, even in scenarios where the Structural Equation Model\n(SEM) might be mis-specified, leading to suboptimal FIML estimates, the robust\narchitecture of FOSA's self-attention component adeptly rectifies and optimizes\nthe imputation outcomes. Our empirical tests reveal that FOSA consistently\ndelivers commendable predictions, even in the face of up to 40% random\nmissingness, highlighting its robustness and potential for wide-scale\napplications in data imputation.\n","authors":["Ou Deng","Qun Jin"],"pdf_url":"https://arxiv.org/pdf/2308.12388v1.pdf","comment":"The source code for the experiments is publicly available at:\n https://github.com/oudeng/FOSA/"},{"id":"http://arxiv.org/abs/2305.16556v2","updated":"2023-08-23T18:53:22Z","published":"2023-05-26T00:50:09Z","title":"LANISTR: Multimodal Learning from Structured and Unstructured Data","summary":" Multimodal large-scale pretraining has shown impressive performance for\nunstructured data including language, image, audio, and video. However, a\nprevalent real-world scenario involves the combination of structured data types\n(tabular, time-series) with unstructured data which has so far been\nunderstudied. To bridge this gap, we propose LANISTR, an attention-based\nframework to learn from LANguage, Image, and STRuctured data. The core of\nLANISTR's methodology is rooted in \\textit{masking-based} training applied\nacross both unimodal and multimodal levels. In particular, we introduce a new\nsimilarity-based multimodal masking loss that enables it to learn cross-modal\nrelations from large-scale multimodal data with missing modalities. On two\nreal-world datastes, MIMIC-IV (healthcare) and Amazon Product Review (retail),\nLANISTR demonstrates remarkable absolute improvements of 6.6\\% (AUROC) and up\nto 14\\% (accuracy) when fine-tuned on 0.1\\% and 0.01\\% of labeled data,\nrespectively, compared to the state-of-the-art alternatives. Notably, these\nimprovements are observed even in the presence of considerable missingness\nratios of 35.7\\% and 99.8\\%, in the respective datasets.\n","authors":["Sayna Ebrahimi","Sercan O. Arik","Yihe Dong","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2305.16556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04706v2","updated":"2023-08-23T18:47:13Z","published":"2023-08-09T04:57:56Z","title":"Pareto Invariant Representation Learning for Multimedia Recommendation","summary":" Multimedia recommendation involves personalized ranking tasks, where\nmultimedia content is usually represented using a generic encoder. However,\nthese generic representations introduce spurious correlations that fail to\nreveal users' true preferences. Existing works attempt to alleviate this\nproblem by learning invariant representations, but overlook the balance between\nindependent and identically distributed (IID) and out-of-distribution (OOD)\ngeneralization. In this paper, we propose a framework called Pareto Invariant\nRepresentation Learning (PaInvRL) to mitigate the impact of spurious\ncorrelations from an IID-OOD multi-objective optimization perspective, by\nlearning invariant representations (intrinsic factors that attract user\nattention) and variant representations (other factors) simultaneously.\nSpecifically, PaInvRL includes three iteratively executed modules: (i)\nheterogeneous identification module, which identifies the heterogeneous\nenvironments to reflect distributional shifts for user-item interactions; (ii)\ninvariant mask generation module, which learns invariant masks based on the\nPareto-optimal solutions that minimize the adaptive weighted Invariant Risk\nMinimization (IRM) and Empirical Risk (ERM) losses; (iii) convert module, which\ngenerates both variant representations and item-invariant representations for\ntraining a multi-modal recommendation model that mitigates spurious\ncorrelations and balances the generalization performance within and cross the\nenvironmental distributions. We compare the proposed PaInvRL with\nstate-of-the-art recommendation models on three public multimedia\nrecommendation datasets (Movielens, Tiktok, and Kwai), and the experimental\nresults validate the effectiveness of PaInvRL for both within- and\ncross-environmental learning.\n","authors":["Shanshan Huang","Haoxuan Li","Qingsong Li","Chunyuan Zheng","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04706v2.pdf","comment":"ACM MM 2023 full paper"},{"id":"http://arxiv.org/abs/2205.04701v3","updated":"2023-08-23T18:42:36Z","published":"2022-05-10T07:04:53Z","title":"StableDR: Stabilized Doubly Robust Learning for Recommendation on Data\n Missing Not at Random","summary":" In recommender systems, users always choose the favorite items to rate, which\nleads to data missing not at random and poses a great challenge for unbiased\nevaluation and learning of prediction models. Currently, the doubly robust (DR)\nmethods have been widely studied and demonstrate superior performance. However,\nin this paper, we show that DR methods are unstable and have unbounded bias,\nvariance, and generalization bounds to extremely small propensities. Moreover,\nthe fact that DR relies more on extrapolation will lead to suboptimal\nperformance. To address the above limitations while retaining double\nrobustness, we propose a stabilized doubly robust (StableDR) learning approach\nwith a weaker reliance on extrapolation. Theoretical analysis shows that\nStableDR has bounded bias, variance, and generalization error bound\nsimultaneously under inaccurate imputed errors and arbitrarily small\npropensities. In addition, we propose a novel learning approach for StableDR\nthat updates the imputation, propensity, and prediction models cyclically,\nachieving more stable and accurate predictions. Extensive experiments show that\nour approaches significantly outperform the existing methods.\n","authors":["Haoxuan Li","Chunyuan Zheng","Peng Wu"],"pdf_url":"https://arxiv.org/pdf/2205.04701v3.pdf","comment":"ICLR 23"},{"id":"http://arxiv.org/abs/2308.12371v1","updated":"2023-08-23T18:22:03Z","published":"2023-08-23T18:22:03Z","title":"Open-set Face Recognition with Neural Ensemble, Maximal Entropy Loss and\n Feature Augmentation","summary":" Open-set face recognition refers to a scenario in which biometric systems\nhave incomplete knowledge of all existing subjects. Therefore, they are\nexpected to prevent face samples of unregistered subjects from being identified\nas previously enrolled identities. This watchlist context adds an arduous\nrequirement that calls for the dismissal of irrelevant faces by focusing mainly\non subjects of interest. As a response, this work introduces a novel method\nthat associates an ensemble of compact neural networks with a margin-based cost\nfunction that explores additional samples. Supplementary negative samples can\nbe obtained from external databases or synthetically built at the\nrepresentation level in training time with a new mix-up feature augmentation\napproach. Deep neural networks pre-trained on large face datasets serve as the\npreliminary feature extraction module. We carry out experiments on well-known\nLFW and IJB-C datasets where results show that the approach is able to boost\nclosed and open-set identification rates.\n","authors":["Rafael Henrique Vareto","Manuel Günther","William Robson Schwartz"],"pdf_url":"https://arxiv.org/pdf/2308.12371v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.12141v1","updated":"2023-08-23T13:56:38Z","published":"2023-08-23T13:56:38Z","title":"Aparecium: Revealing Secrets from Physical Photographs","summary":" Watermarking is a crucial tool for safeguarding copyrights and can serve as a\nmore aesthetically pleasing alternative to QR codes. In recent years,\nwatermarking methods based on deep learning have proved superior robustness\nagainst complex physical distortions than traditional watermarking methods.\nHowever, they have certain limitations that render them less effective in\npractice. For instance, current solutions necessitate physical photographs to\nbe rectangular for accurate localization, cannot handle physical bending or\nfolding, and require the hidden area to be completely captured at a close\ndistance and small angle. To overcome these challenges, we propose a novel deep\nwatermarking framework dubbed \\textit{Aparecium}. Specifically, we preprocess\nsecrets (i.e., watermarks) into a pattern and then embed it into the cover\nimage, which is symmetrical to the final decoding-then-extracting process. To\ncapture the watermarked region from complex physical scenarios, a locator is\nalso introduced. Besides, we adopt a three-stage training strategy for training\nconvergence. Extensive experiments demonstrate that \\textit{Aparecium} is not\nonly robust against different digital distortions, but also can resist various\nphysical distortions, such as screen-shooting and printing-shooting, even in\nsevere cases including different shapes, curvature, folding, incompleteness,\nlong distances, and big angles while maintaining high visual quality.\nFurthermore, some ablation studies are also conducted to verify our design.\n","authors":["Zhe Lei","Jie Zhang","Jingtao Li","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02051v2","updated":"2023-08-23T12:45:27Z","published":"2023-04-04T18:03:04Z","title":"Multimodal Garment Designer: Human-Centric Latent Diffusion Models for\n Fashion Image Editing","summary":" Fashion illustration is used by designers to communicate their vision and to\nbring the design idea from conceptualization to realization, showing how\nclothes interact with the human body. In this context, computer vision can thus\nbe used to improve the fashion design process. Differently from previous works\nthat mainly focused on the virtual try-on of garments, we propose the task of\nmultimodal-conditioned fashion image editing, guiding the generation of\nhuman-centric fashion images by following multimodal prompts, such as text,\nhuman body poses, and garment sketches. We tackle this problem by proposing a\nnew architecture based on latent diffusion models, an approach that has not\nbeen used before in the fashion domain. Given the lack of existing datasets\nsuitable for the task, we also extend two existing fashion datasets, namely\nDress Code and VITON-HD, with multimodal annotations collected in a\nsemi-automatic manner. Experimental results on these new datasets demonstrate\nthe effectiveness of our proposal, both in terms of realism and coherence with\nthe given multimodal inputs. Source code and collected multimodal annotations\nare publicly available at:\nhttps://github.com/aimagelab/multimodal-garment-designer.\n","authors":["Alberto Baldrati","Davide Morelli","Giuseppe Cartella","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2304.02051v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12045v1","updated":"2023-08-23T10:25:37Z","published":"2023-08-23T10:25:37Z","title":"CgT-GAN: CLIP-guided Text GAN for Image Captioning","summary":" The large-scale visual-language pre-trained model, Contrastive Language-Image\nPre-training (CLIP), has significantly improved image captioning for scenarios\nwithout human-annotated image-caption pairs. Recent advanced CLIP-based image\ncaptioning without human annotations follows a text-only training paradigm,\ni.e., reconstructing text from shared embedding space. Nevertheless, these\napproaches are limited by the training/inference gap or huge storage\nrequirements for text embeddings. Given that it is trivial to obtain images in\nthe real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates\nimages into the training process to enable the model to \"see\" real visual\nmodality. Particularly, we use adversarial training to teach CgT-GAN to mimic\nthe phrases of an external text corpus and CLIP-based reward to provide\nsemantic guidance. The caption generator is jointly rewarded based on the\ncaption naturalness to human language calculated from the GAN's discriminator\nand the semantic guidance reward computed by the CLIP-based reward module. In\naddition to the cosine similarity as the semantic guidance reward (i.e.,\nCLIP-cos), we further introduce a novel semantic guidance reward called\nCLIP-agg, which aligns the generated caption with a weighted text embedding by\nattentively aggregating the entire corpus. Experimental results on three\nsubtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms\nstate-of-the-art methods significantly across all metrics. Code is available at\nhttps://github.com/Lihr747/CgtGAN.\n","authors":["Jiarui Yu","Haoran Li","Yanbin Hao","Bin Zhu","Tong Xu","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2308.12045v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.11971v1","updated":"2023-08-23T07:36:30Z","published":"2023-08-23T07:36:30Z","title":"EVE: Efficient Vision-Language Pre-training with Masked Prediction and\n Modality-Aware MoE","summary":" Building scalable vision-language models to learn from diverse, multimodal\ndata remains an open challenge. In this paper, we introduce an Efficient\nVision-languagE foundation model, namely EVE, which is one unified multimodal\nTransformer pre-trained solely by one unified pre-training task. Specifically,\nEVE encodes both vision and language within a shared Transformer network\nintegrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which\ncapture modality-specific information by selectively switching to different\nexperts. To unify pre-training tasks of vision and language, EVE performs\nmasked signal modeling on image-text pairs to reconstruct masked signals, i.e.,\nimage pixels and text tokens, given visible signals. This simple yet effective\npre-training objective accelerates training by 3.5x compared to the model\npre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing\nto the combination of the unified architecture and pre-training task, EVE is\neasy to scale up, enabling better downstream performance with fewer resources\nand faster training speed. Despite its simplicity, EVE achieves\nstate-of-the-art performance on various vision-language downstream tasks,\nincluding visual question answering, visual reasoning, and image-text\nretrieval.\n","authors":["Junyi Chen","Longteng Guo","Jia Sun","Shuai Shao","Zehuan Yuan","Liang Lin","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07056v4","updated":"2023-08-23T06:39:08Z","published":"2023-08-14T10:31:29Z","title":"VoxBlink: X-Large Speaker Verification Dataset on Camera","summary":" In this paper, we contribute a novel and extensive dataset for speaker\nverification, which contains noisy 38k identities/1.45M utterances (VoxBlink)\nand relatively cleaned 18k identities/1.02M (VoxBlink-Clean) utterances for\ntraining. Firstly, we accumulate a 60K+ users' list with their avatars and\ndownload their short videos on YouTube. We then established an automatic and\nscalable pipeline to extract relevant speech and video segments from these\nvideos. To our knowledge, the VoxBlink dataset is one of the largest speaker\nrecognition datasets available. Secondly, we conduct a series of experiments\nbased on different backbones trained on a mix of the VoxCeleb2 and the\nVoxBlink-Clean. Our findings highlight a notable performance improvement,\nranging from 13% to 30%, across different backbone architectures upon\nintegrating our dataset for training. The dataset will be made publicly\navailable shortly.\n","authors":["Yuke Lin","Xiaoyi Qin","Ming Cheng","Ning Jiang","Guoqing Zhao","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2308.07056v4.pdf","comment":"submit to ICASSP2024"},{"id":"http://arxiv.org/abs/2211.06924v3","updated":"2023-08-23T04:02:28Z","published":"2022-11-13T15:11:03Z","title":"A Tale of Two Graphs: Freezing and Denoising Graph Structures for\n Multimodal Recommendation","summary":" Multimodal recommender systems utilizing multimodal features (e.g., images\nand textual descriptions) typically show better recommendation accuracy than\ngeneral recommendation models based solely on user-item interactions.\nGenerally, prior work fuses multimodal features into item ID embeddings to\nenrich item representations, thus failing to capture the latent semantic\nitem-item structures. In this context, LATTICE proposes to learn the latent\nstructure between items explicitly and achieves state-of-the-art performance\nfor multimodal recommendations. However, we argue the latent graph structure\nlearning of LATTICE is both inefficient and unnecessary. Experimentally, we\ndemonstrate that freezing its item-item structure before training can also\nachieve competitive performance. Based on this finding, we propose a simple yet\neffective model, dubbed as FREEDOM, that FREEzes the item-item graph and\nDenOises the user-item interaction graph simultaneously for Multimodal\nrecommendation. Theoretically, we examine the design of FREEDOM through a graph\nspectral perspective and demonstrate that it possesses a tighter upper bound on\nthe graph spectrum. In denoising the user-item interaction graph, we devise a\ndegree-sensitive edge pruning method, which rejects possibly noisy edges with a\nhigh probability when sampling the graph. We evaluate the proposed model on\nthree real-world datasets and show that FREEDOM can significantly outperform\ncurrent strongest baselines. Compared with LATTICE, FREEDOM achieves an average\nimprovement of 19.07% in recommendation accuracy while reducing its memory cost\nup to 6$\\times$ on large graphs. The source code is available at:\nhttps://github.com/enoche/FREEDOM.\n","authors":["Xin Zhou","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2211.06924v3.pdf","comment":"Accepted to ACM Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2308.12383v1","updated":"2023-08-23T18:53:00Z","published":"2023-08-23T18:53:00Z","title":"With a Little Help from your own Past: Prototypical Memory Networks for\n Image Captioning","summary":" Image captioning, like many tasks involving vision and language, currently\nrelies on Transformer-based architectures for extracting the semantics in an\nimage and translating it into linguistically coherent descriptions. Although\nsuccessful, the attention operator only considers a weighted summation of\nprojections of the current input sample, therefore ignoring the relevant\nsemantic information which can come from the joint observation of other\nsamples. In this paper, we devise a network which can perform attention over\nactivations obtained while processing other training samples, through a\nprototypical memory model. Our memory models the distribution of past keys and\nvalues through the definition of prototype vectors which are both\ndiscriminative and compact. Experimentally, we assess the performance of the\nproposed model on the COCO dataset, in comparison with carefully designed\nbaselines and state-of-the-art approaches, and by investigating the role of\neach of the proposed components. We demonstrate that our proposal can increase\nthe performance of an encoder-decoder Transformer by 3.7 CIDEr points both when\ntraining in cross-entropy only and when fine-tuning with self-critical sequence\ntraining. Source code and trained models are available at:\nhttps://github.com/aimagelab/PMA-Net.\n","authors":["Manuele Barraco","Sara Sarto","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2308.12383v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12370v1","updated":"2023-08-23T18:20:59Z","published":"2023-08-23T18:20:59Z","title":"AdVerb: Visually Guided Audio Dereverberation","summary":" We present AdVerb, a novel audio-visual dereverberation framework that uses\nvisual cues in addition to the reverberant sound to estimate clean audio.\nAlthough audio-only dereverberation is a well-studied problem, our approach\nincorporates the complementary visual modality to perform audio\ndereverberation. Given an image of the environment where the reverberated sound\nsignal has been recorded, AdVerb employs a novel geometry-aware cross-modal\ntransformer architecture that captures scene geometry and audio-visual\ncross-modal relationship to generate a complex ideal ratio mask, which, when\napplied to the reverberant audio predicts the clean sound. The effectiveness of\nour method is demonstrated through extensive quantitative and qualitative\nevaluations. Our approach significantly outperforms traditional audio-only and\naudio-visual baselines on three downstream tasks: speech enhancement, speech\nrecognition, and speaker verification, with relative improvements in the range\nof 18% - 82% on the LibriSpeech test-clean set. We also achieve highly\nsatisfactory RT60 error scores on the AVSpeech dataset.\n","authors":["Sanjoy Chowdhury","Sreyan Ghosh","Subhrajyoti Dasgupta","Anton Ratnarajah","Utkarsh Tyagi","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2308.12370v1.pdf","comment":"Accepted at ICCV 2023. For project page, see\n https://gamma.umd.edu/researchdirections/speech/adverb"},{"id":"http://arxiv.org/abs/2010.10658v2","updated":"2023-08-23T13:53:08Z","published":"2020-10-20T22:51:53Z","title":"Display object alignment may influence location recall in unexpected\n ways","summary":" There is a presumption in human-computer interaction that laying out menus\nand most other material in neat rows and columns helps users get work done. The\nrule has been so implicit in the field of design as to allow for no debate.\nHowever, the idea that perfect collinearity benefits creates an advantage for\nboth either search and or recall has rarely been tested. Drawing from separate\nbranches of cognitive literature, we tested a minimal brainstorming interface\nwith either aligned or eccentrically arranged layouts on 96 college students.\nIncidental exact recall of recently worked locations improved in the eccentric\ncondition. And in both conditions there were frequent near-miss recall errors\nto neighboring aligned objects and groups of objects. Further analysis found\nonly marginal performance advantages specifically for females with the\neccentric design. However, NASA-TLX subjective measures showed that in\neccentric, females reported higher performance, less effort, and yet also\nhigher frustration; while males reported lower performance with about the same\neffort, and lower frustration.\n","authors":["Peter Zelchenko","Xiaohan Fu","Xiangqian Li","Alex Ivanov","Zhenyu Gu"],"pdf_url":"https://arxiv.org/pdf/2010.10658v2.pdf","comment":"superseded by arXiv:2308.12201"}]},"2023-08-24T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.12966v1","updated":"2023-08-24T17:59:17Z","published":"2023-08-24T17:59:17Z","title":"Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities","summary":" We introduce the Qwen-VL series, a set of large-scale vision-language models\ndesigned to perceive and understand both text and images. Comprising Qwen-VL\nand Qwen-VL-Chat, these models exhibit remarkable performance in tasks like\nimage captioning, question answering, visual localization, and flexible\ninteraction. The evaluation covers a wide range of tasks including zero-shot\ncaptioning, visual or document visual question answering, and grounding. We\ndemonstrate the Qwen-VL outperforms existing Large Vision Language Models\n(LVLMs). We present their architecture, training, capabilities, and\nperformance, highlighting their contributions to advancing multimodal\nartificial intelligence. Code, demo and models are available at\nhttps://github.com/QwenLM/Qwen-VL.\n","authors":["Jinze Bai","Shuai Bai","Shusheng Yang","Shijie Wang","Sinan Tan","Peng Wang","Junyang Lin","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12966v1.pdf","comment":"Code, demo and models are available at\n https://github.com/QwenLM/Qwen-VL"},{"id":"http://arxiv.org/abs/2308.11764v2","updated":"2023-08-24T17:57:00Z","published":"2023-08-22T20:12:49Z","title":"Halo: Estimation and Reduction of Hallucinations in Open-Source Weak\n Large Language Models","summary":" Large Language Models (LLMs) have revolutionized Natural Language Processing\n(NLP). Although convenient for research and practical applications, open-source\nLLMs with fewer parameters often suffer from severe hallucinations compared to\ntheir larger counterparts. This paper focuses on measuring and reducing\nhallucinations in BLOOM 7B, a representative of such weaker open-source LLMs\nthat are publicly available for research and commercial applications. We\nintroduce HaloCheck, a lightweight BlackBox knowledge-free framework designed\nto quantify the severity of hallucinations in LLMs. Additionally, we explore\ntechniques like knowledge injection and teacher-student approaches to alleviate\nhallucinations in low-parameter LLMs. Our experiments effectively demonstrate\nthe reduction of hallucinations in challenging domains for these LLMs.\n","authors":["Mohamed Elaraby","Mengyin Lu","Jacob Dunn","Xueying Zhang","Yu Wang","Shizhu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11764v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12950v1","updated":"2023-08-24T17:39:13Z","published":"2023-08-24T17:39:13Z","title":"Code Llama: Open Foundation Models for Code","summary":" We release Code Llama, a family of large language models for code based on\nLlama 2 providing state-of-the-art performance among open models, infilling\ncapabilities, support for large input contexts, and zero-shot instruction\nfollowing ability for programming tasks. We provide multiple flavors to cover a\nwide range of applications: foundation models (Code Llama), Python\nspecializations (Code Llama - Python), and instruction-following models (Code\nLlama - Instruct) with 7B, 13B and 34B parameters each. All models are trained\non sequences of 16k tokens and show improvements on inputs with up to 100k\ntokens. 7B and 13B Code Llama and Code Llama - Instruct variants support\ninfilling based on surrounding content. Code Llama reaches state-of-the-art\nperformance among open models on several code benchmarks, with scores of up to\n53% and 55% on HumanEval and MBPP, respectively. Notably, Code Llama - Python\n7B outperforms Llama 2 70B on HumanEval and MBPP, and all our models outperform\nevery other publicly available model on MultiPL-E. We release Code Llama under\na permissive license that allows for both research and commercial use.\n","authors":["Baptiste Rozière","Jonas Gehring","Fabian Gloeckle","Sten Sootla","Itai Gat","Xiaoqing Ellen Tan","Yossi Adi","Jingyu Liu","Tal Remez","Jérémy Rapin","Artyom Kozhevnikov","Ivan Evtimov","Joanna Bitton","Manish Bhatt","Cristian Canton Ferrer","Aaron Grattafiori","Wenhan Xiong","Alexandre Défossez","Jade Copet","Faisal Azhar","Hugo Touvron","Louis Martin","Nicolas Usunier","Thomas Scialom","Gabriel Synnaeve"],"pdf_url":"https://arxiv.org/pdf/2308.12950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12898v1","updated":"2023-08-24T16:17:40Z","published":"2023-08-24T16:17:40Z","title":"Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language\n Pretraining?","summary":" The multimedia community has shown a significant interest in perceiving and\nrepresenting the physical world with multimodal pretrained neural network\nmodels, and among them, the visual-language pertaining (VLP) is, currently, the\nmost captivating topic. However, there have been few endeavors dedicated to the\nexploration of 1) whether essential linguistic knowledge (e.g., semantics and\nsyntax) can be extracted during VLP, and 2) how such linguistic knowledge\nimpact or enhance the multimodal alignment. In response, here we aim to\nelucidate the impact of comprehensive linguistic knowledge, including semantic\nexpression and syntactic structure, on multimodal alignment. Specifically, we\ndesign and release the SNARE, the first large-scale multimodal alignment\nprobing benchmark, to detect the vital linguistic components, e.g., lexical,\nsemantic, and syntax knowledge, containing four tasks: Semantic structure,\nNegation logic, Attribute ownership, and Relationship composition. Based on our\nproposed probing benchmarks, our holistic analyses of five advanced VLP models\nillustrate that the VLP model: i) shows insensitivity towards complex syntax\nstructures and relies on content words for sentence comprehension; ii)\ndemonstrates limited comprehension of combinations between sentences and\nnegations; iii) faces challenges in determining the presence of actions or\nspatial relationships within visual information and struggles with verifying\nthe correctness of triple combinations. We make our benchmark and code\navailable at \\url{https://github.com/WangFei-2019/SNARE/}.\n","authors":["Fei Wang","Liang Ding","Jun Rao","Ye Liu","Li Shen","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2308.12898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12896v1","updated":"2023-08-24T16:16:47Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.}\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v1.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2308.12890v1","updated":"2023-08-24T16:09:13Z","published":"2023-08-24T16:09:13Z","title":"Large Language Models Vote: Prompting for Rare Disease Identification","summary":" The emergence of generative Large Language Models (LLMs) emphasizes the need\nfor accurate and efficient prompting approaches. LLMs are often applied in\nFew-Shot Learning (FSL) contexts, where tasks are executed with minimal\ntraining data. FSL has become popular in many Artificial Intelligence (AI)\nsubdomains, including AI for health. Rare diseases, affecting a small fraction\nof the population, inherently require FSL techniques due to limited data\navailability, though manual data collection and annotation is costly and\ntime-consuming. In this paper, we propose Models-Vote Prompting (MVP), a\nflexible prompting approach for improving the performance of LLM queries in FSL\nsettings. MVP works by prompting numerous LLMs to perform the same tasks and\nthen conducting a majority vote on the resulting outputs. This method achieves\nimproved results to any one model in the ensemble on one-shot rare disease\nidentification and classification tasks. We also release a novel rare disease\ndataset for FSL, available to those who agreed to the MIMIC-IV Data Use\nAgreement (DUA). Furthermore, in using MVP, each model is prompted multiple\ntimes, substantially increasing the time needed for manual annotation, and to\naddress this, we assess the feasibility of using JSON for automating generative\nLLM evaluation.\n","authors":["David Oniani","Jordan Hilsman","Hang Dong","Fengyi Gao","Shiven Verma","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12888v1","updated":"2023-08-24T16:06:36Z","published":"2023-08-24T16:06:36Z","title":"Inducing Causal Structure for Abstractive Text Summarization","summary":" The mainstream of data-driven abstractive summarization models tends to\nexplore the correlations rather than the causal relationships. Among such\ncorrelations, there can be spurious ones which suffer from the language prior\nlearned from the training corpus and therefore undermine the overall\neffectiveness of the learned model. To tackle this issue, we introduce a\nStructural Causal Model (SCM) to induce the underlying causal structure of the\nsummarization data. We assume several latent causal factors and non-causal\nfactors, representing the content and style of the document and summary.\nTheoretically, we prove that the latent factors in our SCM can be identified by\nfitting the observed training data under certain conditions. On the basis of\nthis, we propose a Causality Inspired Sequence-to-Sequence model (CI-Seq2Seq)\nto learn the causal representations that can mimic the causal factors, guiding\nus to pursue causal information for summary generation. The key idea is to\nreformulate the Variational Auto-encoder (VAE) to fit the joint distribution of\nthe document and summary variables from the training corpus. Experimental\nresults on two widely used text summarization datasets demonstrate the\nadvantages of our approach.\n","authors":["Lu Chen","Ruqing Zhang","Wei Huang","Wei Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.12888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12842v1","updated":"2023-08-24T15:06:04Z","published":"2023-08-24T15:06:04Z","title":"Text Similarity from Image Contents using Statistical and Semantic\n Analysis Techniques","summary":" Plagiarism detection is one of the most researched areas among the Natural\nLanguage Processing(NLP) community. A good plagiarism detection covers all the\nNLP methods including semantics, named entities, paraphrases etc. and produces\ndetailed plagiarism reports. Detection of Cross Lingual Plagiarism requires\ndeep knowledge of various advanced methods and algorithms to perform effective\ntext similarity checking. Nowadays the plagiarists are also advancing\nthemselves from hiding the identity from being catch in such offense. The\nplagiarists are bypassed from being detected with techniques like paraphrasing,\nsynonym replacement, mismatching citations, translating one language to\nanother. Image Content Plagiarism Detection (ICPD) has gained importance,\nutilizing advanced image content processing to identify instances of plagiarism\nto ensure the integrity of image content. The issue of plagiarism extends\nbeyond textual content, as images such as figures, graphs, and tables also have\nthe potential to be plagiarized. However, image content plagiarism detection\nremains an unaddressed challenge. Therefore, there is a critical need to\ndevelop methods and systems for detecting plagiarism in image content. In this\npaper, the system has been implemented to detect plagiarism form contents of\nImages such as Figures, Graphs, Tables etc. Along with statistical algorithms\nsuch as Jaccard and Cosine, introducing semantic algorithms such as LSA, BERT,\nWordNet outperformed in detecting efficient and accurate plagiarism.\n","authors":["Sagar Kulkarni","Sharvari Govilkar","Dhiraj Amin"],"pdf_url":"https://arxiv.org/pdf/2308.12842v1.pdf","comment":"NLPTT2023 publication, 10 Pages"},{"id":"http://arxiv.org/abs/2308.12833v1","updated":"2023-08-24T14:45:50Z","published":"2023-08-24T14:45:50Z","title":"Use of LLMs for Illicit Purposes: Threats, Prevention Measures, and\n Vulnerabilities","summary":" Spurred by the recent rapid increase in the development and distribution of\nlarge language models (LLMs) across industry and academia, much recent work has\ndrawn attention to safety- and security-related threats and vulnerabilities of\nLLMs, including in the context of potentially criminal activities.\nSpecifically, it has been shown that LLMs can be misused for fraud,\nimpersonation, and the generation of malware; while other authors have\nconsidered the more general problem of AI alignment. It is important that\ndevelopers and practitioners alike are aware of security-related problems with\nsuch models. In this paper, we provide an overview of existing - predominantly\nscientific - efforts on identifying and mitigating threats and vulnerabilities\narising from LLMs. We present a taxonomy describing the relationship between\nthreats caused by the generative capabilities of LLMs, prevention measures\nintended to address such threats, and vulnerabilities arising from imperfect\nprevention measures. With our work, we hope to raise awareness of the\nlimitations of LLMs in light of such security concerns, among both experienced\ndevelopers and novel users of such technologies.\n","authors":["Maximilian Mozes","Xuanli He","Bennett Kleinberg","Lewis D. Griffin"],"pdf_url":"https://arxiv.org/pdf/2308.12833v1.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2306.04504v3","updated":"2023-08-24T13:39:17Z","published":"2023-06-07T15:11:26Z","title":"Evaluation of ChatGPT on Biomedical Tasks: A Zero-Shot Comparison with\n Fine-Tuned Generative Transformers","summary":" ChatGPT is a large language model developed by OpenAI. Despite its impressive\nperformance across various tasks, no prior work has investigated its capability\nin the biomedical domain yet. To this end, this paper aims to evaluate the\nperformance of ChatGPT on various benchmark biomedical tasks, such as relation\nextraction, document classification, question answering, and summarization. To\nthe best of our knowledge, this is the first work that conducts an extensive\nevaluation of ChatGPT in the biomedical domain. Interestingly, we find based on\nour evaluation that in biomedical datasets that have smaller training sets,\nzero-shot ChatGPT even outperforms the state-of-the-art fine-tuned generative\ntransformer models, such as BioGPT and BioBART. This suggests that ChatGPT's\npre-training on large text corpora makes it quite specialized even in the\nbiomedical domain. Our findings demonstrate that ChatGPT has the potential to\nbe a valuable tool for various tasks in the biomedical domain that lack large\nannotated data.\n","authors":["Israt Jahan","Md Tahmid Rahman Laskar","Chun Peng","Jimmy Huang"],"pdf_url":"https://arxiv.org/pdf/2306.04504v3.pdf","comment":"Accepted by BioNLP@ACL 2023"},{"id":"http://arxiv.org/abs/2308.12770v1","updated":"2023-08-24T13:17:35Z","published":"2023-08-24T13:17:35Z","title":"WavMark: Watermarking for Audio Generation","summary":" Recent breakthroughs in zero-shot voice synthesis have enabled imitating a\nspeaker's voice using just a few seconds of recording while maintaining a high\nlevel of realism. Alongside its potential benefits, this powerful technology\nintroduces notable risks, including voice fraud and speaker impersonation.\nUnlike the conventional approach of solely relying on passive methods for\ndetecting synthetic data, watermarking presents a proactive and robust defence\nmechanism against these looming risks. This paper introduces an innovative\naudio watermarking framework that encodes up to 32 bits of watermark within a\nmere 1-second audio snippet. The watermark is imperceptible to human senses and\nexhibits strong resilience against various attacks. It can serve as an\neffective identifier for synthesized voices and holds potential for broader\napplications in audio copyright protection. Moreover, this framework boasts\nhigh flexibility, allowing for the combination of multiple watermark segments\nto achieve heightened robustness and expanded capacity. Utilizing 10 to\n20-second audio as the host, our approach demonstrates an average Bit Error\nRate (BER) of 0.48\\% across ten common attacks, a remarkable reduction of over\n2800\\% in BER compared to the state-of-the-art watermarking tool. See\nhttps://aka.ms/wavmark for demos of our work.\n","authors":["Guangyu Chen","Yu Wu","Shujie Liu","Tao Liu","Xiaoyong Du","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2308.12770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12734v1","updated":"2023-08-24T12:26:15Z","published":"2023-08-24T12:26:15Z","title":"Real-time Detection of AI-Generated Speech for DeepFake Voice Conversion","summary":" There are growing implications surrounding generative AI in the speech domain\nthat enable voice cloning and real-time voice conversion from one individual to\nanother. This technology poses a significant ethical threat and could lead to\nbreaches of privacy and misrepresentation, thus there is an urgent need for\nreal-time detection of AI-generated speech for DeepFake Voice Conversion. To\naddress the above emerging issues, the DEEP-VOICE dataset is generated in this\nstudy, comprised of real human speech from eight well-known figures and their\nspeech converted to one another using Retrieval-based Voice Conversion.\nPresenting as a binary classification problem of whether the speech is real or\nAI-generated, statistical analysis of temporal audio features through t-testing\nreveals that there are significantly different distributions. Hyperparameter\noptimisation is implemented for machine learning models to identify the source\nof speech. Following the training of 208 individual machine learning models\nover 10-fold cross validation, it is found that the Extreme Gradient Boosting\nmodel can achieve an average classification accuracy of 99.3% and can classify\nspeech in real-time, at around 0.004 milliseconds given one second of speech.\nAll data generated for this study is released publicly for future research on\nAI speech detection.\n","authors":["Jordan J. Bird","Ahmad Lotfi"],"pdf_url":"https://arxiv.org/pdf/2308.12734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12711v1","updated":"2023-08-24T11:07:47Z","published":"2023-08-24T11:07:47Z","title":"Harnessing the Power of David against Goliath: Exploring Instruction\n Data Generation without Using Closed-Source Models","summary":" Instruction tuning is instrumental in enabling Large Language Models~(LLMs)\nto follow user instructions to complete various open-domain tasks. The success\nof instruction tuning depends on the availability of high-quality instruction\ndata. Owing to the exorbitant cost and substandard quality of human annotation,\nrecent works have been deeply engaged in the exploration of the utilization of\npowerful closed-source models to generate instruction data automatically.\nHowever, these methods carry potential risks arising from the usage\nrequirements of powerful closed-source models, which strictly forbid the\nutilization of their outputs to develop machine learning models. To deal with\nthis problem, in this work, we explore alternative approaches to generate\nhigh-quality instruction data that do not rely on closed-source models. Our\nexploration includes an investigation of various existing instruction\ngeneration methods, culminating in the integration of the most efficient\nvariant with two novel strategies to enhance the quality further. Evaluation\nresults from two benchmarks and the GPT-4 model demonstrate the effectiveness\nof our generated instruction data, which can outperform Alpaca, a method\nreliant on closed-source models. We hope that more progress can be achieved in\ngenerating high-quality instruction data without using closed-source models.\n","authors":["Yue Wang","Xinrui Wang","Juntao Li","Jinxiong Chang","Qishen Zhang","Zhongyi Liu","Guannan Zhang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.12711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12674v1","updated":"2023-08-24T09:32:29Z","published":"2023-08-24T09:32:29Z","title":"Improving Translation Faithfulness of Large Language Models via\n Augmenting Instructions","summary":" Large Language Models (LLMs) present strong general capabilities, and a\ncurrent compelling challenge is stimulating their specialized capabilities,\nsuch as machine translation, through low-cost instruction tuning. The standard\ninstruction-following data is sequentially organized as the concatenation of an\ninstruction, an input, and a response. As the attention mechanism of LLMs has\nlimitations on local focus, LLMs tend to focus more on the words or sentences\nnearby at each position. This leads to a high risk of instruction forgetting\nduring decoding. To alleviate the above issues, We propose SWIE\n(Segment-Weighted Instruction Embedding) and an instruction-following dataset\nOVERMISS. SWIE improves the model instruction understanding by adding a global\ninstruction representation on the following input and response representations.\nOVERMISS improves model faithfulness by comparing over-translation and\nmiss-translation results with the correct translation. We apply our methods to\ntwo main-stream open-source LLMs, BLOOM and LLaMA. The experimental results\ndemonstrate significant improvements in translation performance with SWIE based\non BLOOMZ-3b, particularly in zero-shot and long text translations due to\nreduced instruction forgetting risk. Additionally, OVERMISS outperforms the\nbaseline in translation performance (e.g. an increase in BLEU scores from 0.69\nto 3.12 and an average improvement of 0.48 percentage comet scores for\nLLaMA-7b) with further enhancements seen in models combining OVERMISS and SWIE\n(e.g. the BLUE scores increase up to 0.56 from English to German across three\ndifferent backbones), and both exhibit improvements in the faithfulness metric\nbased on word alignment.\n","authors":["Yijie Chen","Yijin Liu","Fandong Meng","Yufeng Chen","Jinan Xu","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12674v1.pdf","comment":"Our code and datasets are released in Github:\n https://github.com/pppa2019/swie_overmiss_llm4mt"},{"id":"http://arxiv.org/abs/2308.12648v1","updated":"2023-08-24T08:46:30Z","published":"2023-08-24T08:46:30Z","title":"From Chatter to Matter: Addressing Critical Steps of Emotion Recognition\n Learning in Task-oriented Dialogue","summary":" Emotion recognition in conversations (ERC) is a crucial task for building\nhuman-like conversational agents. While substantial efforts have been devoted\nto ERC for chit-chat dialogues, the task-oriented counterpart is largely left\nunattended. Directly applying chit-chat ERC models to task-oriented dialogues\n(ToDs) results in suboptimal performance as these models overlook key features\nsuch as the correlation between emotions and task completion in ToDs. In this\npaper, we propose a framework that turns a chit-chat ERC model into a\ntask-oriented one, addressing three critical aspects: data, features and\nobjective. First, we devise two ways of augmenting rare emotions to improve ERC\nperformance. Second, we use dialogue states as auxiliary features to\nincorporate key information from the goal of the user. Lastly, we leverage a\nmulti-aspect emotion definition in ToDs to devise a multi-task learning\nobjective and a novel emotion-distance weighted loss function. Our framework\nyields significant improvements for a range of chit-chat ERC models on EmoWOZ,\na large-scale dataset for user emotion in ToDs. We further investigate the\ngeneralisability of the best resulting model to predict user satisfaction in\ndifferent ToD datasets. A comparison with supervised baselines shows a strong\nzero-shot capability, highlighting the potential usage of our framework in\nwider scenarios.\n","authors":["Shutong Feng","Nurul Lubis","Benjamin Ruppik","Christian Geishauser","Michael Heck","Hsien-chin Lin","Carel van Niekerk","Renato Vukovic","Milica Gašić"],"pdf_url":"https://arxiv.org/pdf/2308.12648v1.pdf","comment":"Accepted by SIGDIAL 2023"},{"id":"http://arxiv.org/abs/2308.12643v1","updated":"2023-08-24T08:36:28Z","published":"2023-08-24T08:36:28Z","title":"Probabilistic Method of Measuring Linguistic Productivity","summary":" In this paper I propose a new way of measuring linguistic productivity that\nobjectively assesses the ability of an affix to be used to coin new complex\nwords and, unlike other popular measures, is not directly dependent upon token\nfrequency. Specifically, I suggest that linguistic productivity may be viewed\nas the probability of an affix to combine with a random base. The advantages of\nthis approach include the following. First, token frequency does not dominate\nthe productivity measure but naturally influences the sampling of bases.\nSecond, we are not just counting attested word types with an affix but rather\nsimulating the construction of these types and then checking whether they are\nattested in the corpus. Third, a corpus-based approach and randomised design\nassure that true neologisms and words coined long ago have equal chances to be\nselected. The proposed algorithm is evaluated both on English and Russian data.\nThe obtained results provide some valuable insights into the relation of\nlinguistic productivity to the number of types and tokens. It looks like\nburgeoning linguistic productivity manifests itself in an increasing number of\ntypes. However, this process unfolds in two stages: first comes the increase in\nhigh-frequency items, and only then follows the increase in low-frequency\nitems.\n","authors":["Sergei Monakhov"],"pdf_url":"https://arxiv.org/pdf/2308.12643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12635v1","updated":"2023-08-24T08:19:51Z","published":"2023-08-24T08:19:51Z","title":"Advancing Hungarian Text Processing with HuSpaCy: Efficient and Accurate\n NLP Pipelines","summary":" This paper presents a set of industrial-grade text processing models for\nHungarian that achieve near state-of-the-art performance while balancing\nresource efficiency and accuracy. Models have been implemented in the spaCy\nframework, extending the HuSpaCy toolkit with several improvements to its\narchitecture. Compared to existing NLP tools for Hungarian, all of our\npipelines feature all basic text processing steps including tokenization,\nsentence-boundary detection, part-of-speech tagging, morphological feature\ntagging, lemmatization, dependency parsing and named entity recognition with\nhigh accuracy and throughput. We thoroughly evaluated the proposed\nenhancements, compared the pipelines with state-of-the-art tools and\ndemonstrated the competitive performance of the new models in all text\npreprocessing steps. All experiments are reproducible and the pipelines are\nfreely available under a permissive license.\n","authors":["György Orosz","Gergő Szabó","Péter Berkecz","Zsolt Szántó","Richárd Farkas"],"pdf_url":"https://arxiv.org/pdf/2308.12635v1.pdf","comment":"Submitted to TSD 2023 Conference"},{"id":"http://arxiv.org/abs/2201.05337v5","updated":"2023-08-24T08:16:57Z","published":"2022-01-14T08:32:20Z","title":"A Survey of Controllable Text Generation using Transformer-based\n Pre-trained Language Models","summary":" Controllable Text Generation (CTG) is emerging area in the field of natural\nlanguage generation (NLG). It is regarded as crucial for the development of\nadvanced text generation technologies that better meet the specific constraints\nin practical applications. In recent years, methods using large-scale\npre-trained language models (PLMs), in particular the widely used\ntransformer-based PLMs, have become a new paradigm of NLG, allowing generation\nof more diverse and fluent text. However, due to the limited level of\ninterpretability of deep neural networks, the controllability of these methods\nneed to be guaranteed. To this end, controllable text generation using\ntransformer-based PLMs has become a rapidly growing yet challenging new\nresearch hotspot. A diverse range of approaches have emerged in the recent 3-4\nyears, targeting different CTG tasks that require different types of controlled\nconstraints. In this paper, we present a systematic critical review on the\ncommon tasks, main approaches, and evaluation methods in this area. Finally, we\ndiscuss the challenges that the field is facing, and put forward various\npromising future directions. To the best of our knowledge, this is the first\nsurvey paper to summarize the state-of-the-art CTG techniques from the\nperspective of Transformer-based PLMs. We hope it can help researchers and\npractitioners in the related fields to quickly track the academic and\ntechnological frontier, providing them with a landscape of the area and a\nroadmap for future research.\n","authors":["Hanqing Zhang","Haolin Song","Shaoyu Li","Ming Zhou","Dawei Song"],"pdf_url":"https://arxiv.org/pdf/2201.05337v5.pdf","comment":"Accpeted by ACM Computing Surveys Journal"},{"id":"http://arxiv.org/abs/2210.08471v5","updated":"2023-08-24T07:13:27Z","published":"2022-10-16T07:17:27Z","title":"Improving Semantic Matching through Dependency-Enhanced Pre-trained\n Model with Adaptive Fusion","summary":" Transformer-based pre-trained models like BERT have achieved great progress\non Semantic Sentence Matching. Meanwhile, dependency prior knowledge has also\nshown general benefits in multiple NLP tasks. However, how to efficiently\nintegrate dependency prior structure into pre-trained models to better model\ncomplex semantic matching relations is still unsettled. In this paper, we\npropose the \\textbf{D}ependency-Enhanced \\textbf{A}daptive \\textbf{F}usion\n\\textbf{A}ttention (\\textbf{DAFA}), which explicitly introduces dependency\nstructure into pre-trained models and adaptively fuses it with semantic\ninformation. Specifically, \\textbf{\\emph{(i)}} DAFA first proposes a\nstructure-sensitive paradigm to construct a dependency matrix for calibrating\nattention weights. It adopts an adaptive fusion module to integrate the\nobtained dependency information and the original semantic signals. Moreover,\nDAFA reconstructs the attention calculation flow and provides better\ninterpretability. By applying it on BERT, our method achieves state-of-the-art\nor competitive performance on 10 public datasets, demonstrating the benefits of\nadaptively fusing dependency structure in semantic matching task.\n","authors":["Jian Song","Di Liang","Rumei Li","Yuntao Li","Sirui Wang","Minlong Peng","Wei Wu","Yongxin Yu"],"pdf_url":"https://arxiv.org/pdf/2210.08471v5.pdf","comment":"Accepted by Findings of EMNLP 2022"},{"id":"http://arxiv.org/abs/2308.12604v1","updated":"2023-08-24T07:10:31Z","published":"2023-08-24T07:10:31Z","title":"PromptMRG: Diagnosis-Driven Prompts for Medical Report Generation","summary":" Automatic medical report generation (MRG) is of great research value as it\nhas the potential to relieve radiologists from the heavy burden of report\nwriting. Despite recent advancements, accurate MRG remains challenging due to\nthe need for precise clinical understanding and the identification of clinical\nfindings. Moreover, the imbalanced distribution of diseases makes the challenge\neven more pronounced, as rare diseases are underrepresented in training data,\nmaking their diagnostic performance unreliable. To address these challenges, we\npropose diagnosis-driven prompts for medical report generation (PromptMRG), a\nnovel framework that aims to improve the diagnostic accuracy of MRG with the\nguidance of diagnosis-aware prompts. Specifically, PromptMRG is based on\nencoder-decoder architecture with an extra disease classification branch. When\ngenerating reports, the diagnostic results from the classification branch are\nconverted into token prompts to explicitly guide the generation process. To\nfurther improve the diagnostic accuracy, we design cross-modal feature\nenhancement, which retrieves similar reports from the database to assist the\ndiagnosis of a query image by leveraging the knowledge from a pre-trained CLIP.\nMoreover, the disease imbalanced issue is addressed by applying an adaptive\nlogit-adjusted loss to the classification branch based on the individual\nlearning status of each disease, which overcomes the barrier of text decoder's\ninability to manipulate disease distributions. Experiments on two MRG\nbenchmarks show the effectiveness of the proposed method, where it obtains\nstate-of-the-art clinical efficacy performance on both datasets.\n","authors":["Haibo Jin","Haoxuan Che","Yi Lin","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2308.12604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04528v3","updated":"2023-08-24T07:09:25Z","published":"2023-06-07T15:37:00Z","title":"PromptBench: Towards Evaluating the Robustness of Large Language Models\n on Adversarial Prompts","summary":" The increasing reliance on Large Language Models (LLMs) across academia and\nindustry necessitates a comprehensive understanding of their robustness to\nprompts. In response to this vital need, we introduce PromptBench, a robustness\nbenchmark designed to measure LLMs' resilience to adversarial prompts. This\nstudy uses a plethora of adversarial textual attacks targeting prompts across\nmultiple levels: character, word, sentence, and semantic. These prompts are\nthen employed in diverse tasks, such as sentiment analysis, natural language\ninference, reading comprehension, machine translation, and math\nproblem-solving. Our study generates 4,032 adversarial prompts, meticulously\nevaluated over 8 tasks and 13 datasets, with 567,084 test samples in total. Our\nfindings demonstrate that contemporary LLMs are vulnerable to adversarial\nprompts. Furthermore, we present comprehensive analysis to understand the\nmystery behind prompt robustness and its transferability. We then offer\ninsightful robustness analysis and pragmatic recommendations for prompt\ncomposition, beneficial to both researchers and everyday users. We make our\ncode, prompts, and methodologies to generate adversarial prompts publicly\naccessible, thereby enabling and encouraging collaborative exploration in this\npivotal field: https://github.com/microsoft/promptbench.\n","authors":["Kaijie Zhu","Jindong Wang","Jiaheng Zhou","Zichen Wang","Hao Chen","Yidong Wang","Linyi Yang","Wei Ye","Neil Zhenqiang Gong","Yue Zhang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2306.04528v3.pdf","comment":"Technical report; updated with new experiments and related work; 27\n pages; code is at: https://github.com/microsoft/promptbench"},{"id":"http://arxiv.org/abs/2308.12578v1","updated":"2023-08-24T05:35:58Z","published":"2023-08-24T05:35:58Z","title":"Mind vs. Mouth: On Measuring Re-judge Inconsistency of Social Bias in\n Large Language Models","summary":" Recent researches indicate that Pre-trained Large Language Models (LLMs)\npossess cognitive constructs similar to those observed in humans, prompting\nresearchers to investigate the cognitive aspects of LLMs. This paper focuses on\nexplicit and implicit social bias, a distinctive two-level cognitive construct\nin psychology. It posits that individuals' explicit social bias, which is their\nconscious expression of bias in the statements, may differ from their implicit\nsocial bias, which represents their unconscious bias. We propose a two-stage\napproach and discover a parallel phenomenon in LLMs known as \"re-judge\ninconsistency\" in social bias. In the initial stage, the LLM is tasked with\nautomatically completing statements, potentially incorporating implicit social\nbias. However, in the subsequent stage, the same LLM re-judges the biased\nstatement generated by itself but contradicts it. We propose that this re-judge\ninconsistency can be similar to the inconsistency between human's unaware\nimplicit social bias and their aware explicit social bias. Experimental\ninvestigations on ChatGPT and GPT-4 concerning common gender biases examined in\npsychology corroborate the highly stable nature of the re-judge inconsistency.\nThis finding may suggest that diverse cognitive constructs emerge as LLMs'\ncapabilities strengthen. Consequently, leveraging psychological theories can\nprovide enhanced insights into the underlying mechanisms governing the\nexpressions of explicit and implicit constructs in LLMs.\n","authors":["Yachao Zhao","Bo Wang","Dongming Zhao","Kun Huang","Yan Wang","Ruifang He","Yuexian Hou"],"pdf_url":"https://arxiv.org/pdf/2308.12578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12568v1","updated":"2023-08-24T05:15:43Z","published":"2023-08-24T05:15:43Z","title":"A Small and Fast BERT for Chinese Medical Punctuation Restoration","summary":" In clinical dictation, utterances after automatic speech recognition (ASR)\nwithout explicit punctuation marks may lead to the misunderstanding of dictated\nreports. To give a precise and understandable clinical report with ASR,\nautomatic punctuation restoration is required. Considering a practical\nscenario, we propose a fast and light pre-trained model for Chinese medical\npunctuation restoration based on 'pretraining and fine-tuning' paradigm. In\nthis work, we distill pre-trained models by incorporating supervised\ncontrastive learning and a novel auxiliary pre-training task (Punctuation Mark\nPrediction) to make it well-suited for punctuation restoration. Our experiments\non various distilled models reveal that our model can achieve 95% performance\nwhile 10% model size relative to state-of-the-art Chinese RoBERTa.\n","authors":["Tongtao Ling","Chen Liao","Zhipeng Yu","Lei Chen","Shilei Huang","Yi Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12568v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.06152v2","updated":"2023-08-24T04:44:39Z","published":"2023-05-06T03:57:05Z","title":"Structure-CLIP: Towards Scene Graph Knowledge to Enhance Multi-modal\n Structured Representations","summary":" Large-scale vision-language pre-training has achieved significant performance\nin multi-modal understanding and generation tasks. However, existing methods\noften perform poorly on image-text matching tasks that require structured\nrepresentations, i.e., representations of objects, attributes, and relations.\nPrevious models cannot make a distinction between ``An astronaut rides a horse\"\nand ``A horse rides an astronaut\". This is because they fail to fully leverage\nstructured knowledge when learning representations in multi-modal scenarios. In\nthis paper, we present an end-to-end framework Structure-CLIP, which integrates\nScene Graph Knowledge (SGK) to enhance multi-modal structured representations.\nFirstly, we use scene graphs to guide the construction of semantic negative\nexamples, which results in an increased emphasis on learning structured\nrepresentations. Moreover, a Knowledge-Enhance Encoder (KEE) is proposed to\nleverage SGK as input to further enhance structured representations. To verify\nthe effectiveness of the proposed framework, we pre-train our model with the\naforementioned approaches and conduct experiments on downstream tasks.\nExperimental results demonstrate that Structure-CLIP achieves state-of-the-art\n(SOTA) performance on VG-Attribution and VG-Relation datasets, with 12.5% and\n4.1% ahead of the multi-modal SOTA model respectively. Meanwhile, the results\non MSCOCO indicate that Structure-CLIP significantly enhances the structured\nrepresentations while maintaining the ability of general representations. Our\ncode will be available soon.\n","authors":["Yufeng Huang","Jiji Tang","Zhuo Chen","Rongsheng Zhang","Xinfeng Zhang","Weijie Chen","Zeng Zhao","Zhou Zhao","Tangjie Lv","Zhipeng Hu","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.06152v2.pdf","comment":"Version 2.0. Improve grammar and experiments"},{"id":"http://arxiv.org/abs/2308.07134v3","updated":"2023-08-24T03:54:45Z","published":"2023-08-14T13:41:09Z","title":"Natural Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models, such as ChatGPT,\nhas revolutionized various research fields in artificial intelligence.\nTransformers-based large language models (LLMs) have gradually replaced CNNs\nand RNNs to unify fields of computer vision and natural language processing.\nCompared with the data that exists relatively independently such as images,\nvideos or texts, graph is a type of data that contains rich structural and\nrelational information. Meanwhile, natural language, as one of the most\nexpressive mediums, excels in describing complex structures. However, existing\nwork on incorporating graph learning problems into the generative language\nmodeling framework remains very limited. As the importance of large language\nmodels continues to grow, it becomes essential to explore whether LLMs can also\nreplace GNNs as the foundation model for graphs. In this paper, we propose\nInstructGLM (Instruction-finetuned Graph Language Model), systematically design\nhighly scalable prompts based on natural language instructions, and use natural\nlanguage to describe the geometric structure and node features of the graph for\ninstruction tuning an LLM to perform learning and inference on graphs in a\ngenerative manner. Our method exceeds all competitive GNN baselines on\nogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of\nour method and sheds light on generative large language models as the\nfoundation model for graph machine learning.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v3.pdf","comment":"21 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.12539v1","updated":"2023-08-24T03:53:55Z","published":"2023-08-24T03:53:55Z","title":"CALM : A Multi-task Benchmark for Comprehensive Assessment of Language\n Model Bias","summary":" As language models (LMs) become increasingly powerful, it is important to\nquantify and compare them for sociodemographic bias with potential for harm.\nPrior bias measurement datasets are sensitive to perturbations in their\nmanually designed templates, therefore unreliable. To achieve reliability, we\nintroduce the Comprehensive Assessment of Language Model bias (CALM), a\nbenchmark dataset to quantify bias in LMs across three tasks. We integrate 16\nexisting datasets across different domains, such as Wikipedia and news\narticles, to filter 224 templates from which we construct a dataset of 78,400\nexamples. We compare the diversity of CALM with prior datasets on metrics such\nas average semantic similarity, and variation in template length, and test the\nsensitivity to small perturbations. We show that our dataset is more diverse\nand reliable than previous datasets, thus better capture the breadth of\nlinguistic variation required to reliably evaluate model bias. We evaluate 20\nlarge language models including six prominent families of LMs such as Llama-2.\nIn two LM series, OPT and Bloom, we found that larger parameter models are more\nbiased than lower parameter models. We found the T0 series of models to be the\nleast biased. Furthermore, we noticed a tradeoff between gender and racial bias\nwith increasing model size in some model series. The code is available at\nhttps://github.com/vipulgupta1011/CALM.\n","authors":["Vipul Gupta","Pranav Narayanan Venkit","Hugo Laurençon","Shomir Wilson","Rebecca J. Passonneau"],"pdf_url":"https://arxiv.org/pdf/2308.12539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12531v1","updated":"2023-08-24T03:40:54Z","published":"2023-08-24T03:40:54Z","title":"CARE: Co-Attention Network for Joint Entity and Relation Extraction","summary":" Joint entity and relation extraction is the fundamental task of information\nextraction, consisting of two subtasks: named entity recognition and relation\nextraction. Most existing joint extraction methods suffer from issues of\nfeature confusion or inadequate interaction between two subtasks. In this work,\nwe propose a Co-Attention network for joint entity and Relation Extraction\n(CARE). Our approach involves learning separate representations for each\nsubtask, aiming to avoid feature overlap. At the core of our approach is the\nco-attention module that captures two-way interaction between two subtasks,\nallowing the model to leverage entity information for relation prediction and\nvice versa, thus promoting mutual enhancement. Extensive experiments on three\njoint entity-relation extraction benchmark datasets (NYT, WebNLG and SciERC)\nshow that our proposed model achieves superior performance, surpassing existing\nbaseline models.\n","authors":["Wenjun Kong","Yamei Xia"],"pdf_url":"https://arxiv.org/pdf/2308.12531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12519v1","updated":"2023-08-24T03:11:45Z","published":"2023-08-24T03:11:45Z","title":"Large Language Model as Autonomous Decision Maker","summary":" While large language models (LLMs) exhibit impressive language understanding\nand in-context learning abilities, their decision-making ability still heavily\nrelies on the guidance of task-specific expert knowledge when solving\nreal-world tasks. To unleash the potential of LLMs as autonomous decision\nmakers, this paper presents an approach JuDec to endow LLMs with the\nself-judgment ability, enabling LLMs to achieve autonomous judgment and\nexploration for decision making. Specifically, in JuDec, Elo-based\nSelf-Judgment Mechanism is designed to assign Elo scores to decision steps to\njudge their values and utilities via pairwise comparisons between two solutions\nand then guide the decision-searching process toward the optimal solution\naccordingly. Experimental results on the ToolBench dataset demonstrate JuDec's\nsuperiority over baselines, achieving over 10% improvement in Pass Rate on\ndiverse tasks. It offers higher-quality solutions and reduces costs (ChatGPT\nAPI calls), highlighting its effectiveness and efficiency.\n","authors":["Yining Ye","Xin Cong","Yujia Qin","Yankai Lin","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12519v1.pdf","comment":"Work in progess"},{"id":"http://arxiv.org/abs/2308.12490v1","updated":"2023-08-24T01:24:09Z","published":"2023-08-24T01:24:09Z","title":"MultiPA: a multi-task speech pronunciation assessment system for a\n closed and open response scenario","summary":" The design of automatic speech pronunciation assessment can be categorized\ninto closed and open response scenarios, each with strengths and limitations. A\nsystem with the ability to function in both scenarios can cater to diverse\nlearning needs and provide a more precise and holistic assessment of\npronunciation skills. In this study, we propose a Multi-task Pronunciation\nAssessment model called MultiPA. MultiPA provides an alternative to Kaldi-based\nsystems in that it has simpler format requirements and better compatibility\nwith other neural network models. Compared with previous open response systems,\nMultiPA provides a wider range of evaluations, encompassing assessments at both\nthe sentence and word-level. Our experimental results show that MultiPA\nachieves comparable performance when working in closed response scenarios and\nmaintains more robust performance when directly used for open responses.\n","authors":["Yu-Wen Chen","Zhou Yu","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.12490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17810v2","updated":"2023-08-24T01:22:36Z","published":"2023-06-30T17:16:04Z","title":"A Massive Scale Semantic Similarity Dataset of Historical English","summary":" A diversity of tasks use language models trained on semantic similarity data.\nWhile there are a variety of datasets that capture semantic similarity, they\nare either constructed from modern web data or are relatively small datasets\ncreated in the past decade by human annotators. This study utilizes a novel\nsource, newly digitized articles from off-copyright, local U.S. newspapers, to\nassemble a massive-scale semantic similarity dataset spanning 70 years from\n1920 to 1989 and containing nearly 400M positive semantic similarity pairs.\nHistorically, around half of articles in U.S. local newspapers came from\nnewswires like the Associated Press. While local papers reproduced articles\nfrom the newswire, they wrote their own headlines, which form abstractive\nsummaries of the associated articles. We associate articles and their headlines\nby exploiting document layouts and language understanding. We then use deep\nneural methods to detect which articles are from the same underlying source, in\nthe presence of substantial noise and abridgement. The headlines of reproduced\narticles form positive semantic similarity pairs. The resulting publicly\navailable HEADLINES dataset is significantly larger than most existing semantic\nsimilarity datasets and covers a much longer span of time. It will facilitate\nthe application of contrastively trained semantic similarity models to a\nvariety of tasks, including the study of semantic change across space and time.\n","authors":["Emily Silcock","Melissa Dell"],"pdf_url":"https://arxiv.org/pdf/2306.17810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12488v1","updated":"2023-08-24T01:17:16Z","published":"2023-08-24T01:17:16Z","title":"GPTEval: A Survey on Assessments of ChatGPT and GPT-4","summary":" The emergence of ChatGPT has generated much speculation in the press about\nits potential to disrupt social and economic systems. Its astonishing language\nability has aroused strong curiosity among scholars about its performance in\ndifferent domains. There have been many studies evaluating the ability of\nChatGPT and GPT-4 in different tasks and disciplines. However, a comprehensive\nreview summarizing the collective assessment findings is lacking. The objective\nof this survey is to thoroughly analyze prior assessments of ChatGPT and GPT-4,\nfocusing on its language and reasoning abilities, scientific knowledge, and\nethical considerations. Furthermore, an examination of the existing evaluation\nmethods is conducted, offering several recommendations for future research in\nevaluating large language models.\n","authors":["Rui Mao","Guanyi Chen","Xulang Zhang","Frank Guerin","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2308.12488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12477v1","updated":"2023-08-24T00:24:42Z","published":"2023-08-24T00:24:42Z","title":"American Stories: A Large-Scale Structured Text Dataset of Historical\n U.S. Newspapers","summary":" Existing full text datasets of U.S. public domain newspapers do not recognize\nthe often complex layouts of newspaper scans, and as a result the digitized\ncontent scrambles texts from articles, headlines, captions, advertisements, and\nother layout regions. OCR quality can also be low. This study develops a novel,\ndeep learning pipeline for extracting full article texts from newspaper images\nand applies it to the nearly 20 million scans in Library of Congress's public\ndomain Chronicling America collection. The pipeline includes layout detection,\nlegibility classification, custom OCR, and association of article texts\nspanning multiple bounding boxes. To achieve high scalability, it is built with\nefficient architectures designed for mobile phones. The resulting American\nStories dataset provides high quality data that could be used for pre-training\na large language model to achieve better understanding of historical English\nand historical world knowledge. The dataset could also be added to the external\ndatabase of a retrieval-augmented language model to make historical information\n- ranging from interpretations of political events to minutiae about the lives\nof people's ancestors - more widely accessible. Furthermore, structured article\ntexts facilitate using transformer-based methods for popular social science\napplications like topic classification, detection of reproduced content, and\nnews story clustering. Finally, American Stories provides a massive silver\nquality dataset for innovating multimodal layout analysis models and other\nmultimodal applications.\n","authors":["Melissa Dell","Jacob Carlson","Tom Bryan","Emily Silcock","Abhishek Arora","Zejiang Shen","Luca D'Amico-Wong","Quan Le","Pablo Querubin","Leander Heldring"],"pdf_url":"https://arxiv.org/pdf/2308.12477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13116v1","updated":"2023-08-24T23:38:44Z","published":"2023-08-24T23:38:44Z","title":"Sentence Embedding Models for Ancient Greek Using Multilingual Knowledge\n Distillation","summary":" Contextual language models have been trained on Classical languages,\nincluding Ancient Greek and Latin, for tasks such as lemmatization,\nmorphological tagging, part of speech tagging, authorship attribution, and\ndetection of scribal errors. However, high-quality sentence embedding models\nfor these historical languages are significantly more difficult to achieve due\nto the lack of training data. In this work, we use a multilingual knowledge\ndistillation approach to train BERT models to produce sentence embeddings for\nAncient Greek text. The state-of-the-art sentence embedding approaches for\nhigh-resource languages use massive datasets, but our distillation approach\nallows our Ancient Greek models to inherit the properties of these models while\nusing a relatively small amount of translated sentence data. We build a\nparallel sentence dataset using a sentence-embedding alignment method to align\nAncient Greek documents with English translations, and use this dataset to\ntrain our models. We evaluate our models on translation search, semantic\nsimilarity, and semantic retrieval tasks and investigate translation bias. We\nmake our training and evaluation datasets freely available at\nhttps://github.com/kevinkrahn/ancient-greek-datasets .\n","authors":["Kevin Krahn","Derrick Tate","Andrew C. Lamicela"],"pdf_url":"https://arxiv.org/pdf/2308.13116v1.pdf","comment":"Paper accepted for publication at the First Workshop on Ancient\n Language Processing (ALP) 2023; 10 pages, 3 figures, 9 tables"},{"id":"http://arxiv.org/abs/2305.07011v3","updated":"2023-08-24T23:21:27Z","published":"2023-05-11T17:53:29Z","title":"Region-Aware Pretraining for Open-Vocabulary Object Detection with\n Vision Transformers","summary":" We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a\ncontrastive image-text pretraining recipe to bridge the gap between image-level\npretraining and open-vocabulary object detection. At the pretraining phase, we\npropose to randomly crop and resize regions of positional embeddings instead of\nusing the whole image positional embeddings. This better matches the use of\npositional embeddings at region-level in the detection finetuning phase. In\naddition, we replace the common softmax cross entropy loss in contrastive\nlearning with focal loss to better learn the informative yet difficult\nexamples. Finally, we leverage recent advances in novel object proposals to\nimprove open-vocabulary detection finetuning. We evaluate our full model on the\nLVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer.\nRO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best\nexisting approach by +7.8 points in addition to competitive zero-shot transfer\ndetection. Surprisingly, RO-ViT improves the image-level representation as well\nand achieves the state of the art on 9 out of 12 metrics on COCO and Flickr\nimage-text retrieval benchmarks, outperforming competitive approaches with\nlarger models.\n","authors":["Dahun Kim","Anelia Angelova","Weicheng Kuo"],"pdf_url":"https://arxiv.org/pdf/2305.07011v3.pdf","comment":"CVPR 2023 Highlight (https://github.com/mcahny/rovit); adds LAION-2B\n result"},{"id":"http://arxiv.org/abs/2308.13089v1","updated":"2023-08-24T21:19:48Z","published":"2023-08-24T21:19:48Z","title":"Towards a Holistic Approach: Understanding Sociodemographic Biases in\n NLP Models using an Interdisciplinary Lens","summary":" The rapid growth in the usage and applications of Natural Language Processing\n(NLP) in various sociotechnical solutions has highlighted the need for a\ncomprehensive understanding of bias and its impact on society. While research\non bias in NLP has expanded, several challenges persist that require attention.\nThese include the limited focus on sociodemographic biases beyond race and\ngender, the narrow scope of analysis predominantly centered on models, and the\ntechnocentric implementation approaches. This paper addresses these challenges\nand advocates for a more interdisciplinary approach to understanding bias in\nNLP. The work is structured into three facets, each exploring a specific aspect\nof bias in NLP.\n","authors":["Pranav Narayanan Venkit"],"pdf_url":"https://arxiv.org/pdf/2308.13089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13081v1","updated":"2023-08-24T20:57:07Z","published":"2023-08-24T20:57:07Z","title":"Formal specification terminology for demographic agent-based models of\n fixed-step single-clocked simulations","summary":" This document presents adequate formal terminology for the mathematical\nspecification of a subset of Agent Based Models (ABMs) in the field of\nDemography. The simulation of the targeted ABMs follows a fixed-step\nsingle-clocked pattern. The proposed terminology further improves the model\nunderstanding and can act as a stand-alone methodology for the specification\nand optionally the documentation of a significant set of (demographic) ABMs.\nNevertheless, it is imaginable the this terminology probably with further\nextensions can be merged with the largely-informal widely-used model\ndocumentation and communication O.D.D. protocol [Grimm and et al., 2020,\nAmouroux et al., 2010] to reduce many sources of ambiguity, hindering model\nreplications by other modelers. A published demographic model documentation,\nlargely simplified version of the Lone Parent Model [Gostoli and Silverman,\n2020] is separately published in [Elsheikh, 2023b] as illustration for the\nformal terminology. The model was implemented in the Julia language [Elsheikh,\n2023a] based on the Agents.jl julia package [Datseris et al., 2022].\n","authors":["Atiyah Elsheikh"],"pdf_url":"https://arxiv.org/pdf/2308.13081v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2307.16548"},{"id":"http://arxiv.org/abs/2308.11490v2","updated":"2023-08-24T20:52:01Z","published":"2023-08-22T15:10:45Z","title":"Can Authorship Representation Learning Capture Stylistic Features?","summary":" Automatically disentangling an author's style from the content of their\nwriting is a longstanding and possibly insurmountable problem in computational\nlinguistics. At the same time, the availability of large text corpora furnished\nwith author labels has recently enabled learning authorship representations in\na purely data-driven manner for authorship attribution, a task that ostensibly\ndepends to a greater extent on encoding writing style than encoding content.\nHowever, success on this surrogate task does not ensure that such\nrepresentations capture writing style since authorship could also be correlated\nwith other latent variables, such as topic. In an effort to better understand\nthe nature of the information these representations convey, and specifically to\nvalidate the hypothesis that they chiefly encode writing style, we\nsystematically probe these representations through a series of targeted\nexperiments. The results of these experiments suggest that representations\nlearned for the surrogate authorship prediction task are indeed sensitive to\nwriting style. As a consequence, authorship representations may be expected to\nbe robust to certain kinds of data shift, such as topic drift over time.\nAdditionally, our findings may open the door to downstream applications that\nrequire stylistic representations, such as style transfer.\n","authors":["Andrew Wang","Cristina Aggazzotti","Rebecca Kotula","Rafael Rivera Soto","Marcus Bishop","Nicholas Andrews"],"pdf_url":"https://arxiv.org/pdf/2308.11490v2.pdf","comment":"appearing at TACL 2023"},{"id":"http://arxiv.org/abs/2306.10067v2","updated":"2023-08-24T20:24:13Z","published":"2023-06-15T15:26:20Z","title":"Domain-specific ChatBots for Science using Embeddings","summary":" Large language models (LLMs) have emerged as powerful machine-learning\nsystems capable of handling a myriad of tasks. Tuned versions of these systems\nhave been turned into chatbots that can respond to user queries on a vast\ndiversity of topics, providing informative and creative replies. However, their\napplication to physical science research remains limited owing to their\nincomplete knowledge in these areas, contrasted with the needs of rigor and\nsourcing in science domains. Here, we demonstrate how existing methods and\nsoftware tools can be easily combined to yield a domain-specific chatbot. The\nsystem ingests scientific documents in existing formats, and uses text\nembedding lookup to provide the LLM with domain-specific contextual information\nwhen composing its reply. We similarly demonstrate that existing image\nembedding methods can be used for search and retrieval across publication\nfigures. These results confirm that LLMs are already suitable for use by\nphysical scientists in accelerating their research efforts.\n","authors":["Kevin G. Yager"],"pdf_url":"https://arxiv.org/pdf/2306.10067v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.13056v1","updated":"2023-08-24T19:49:30Z","published":"2023-08-24T19:49:30Z","title":"Lexical Diversity in Kinship Across Languages and Dialects","summary":" Languages are known to describe the world in diverse ways. Across lexicons,\ndiversity is pervasive, appearing through phenomena such as lexical gaps and\nuntranslatability. However, in computational resources, such as multilingual\nlexical databases, diversity is hardly ever represented. In this paper, we\nintroduce a method to enrich computational lexicons with content relating to\nlinguistic diversity. The method is verified through two large-scale case\nstudies on kinship terminology, a domain known to be diverse across languages\nand cultures: one case study deals with seven Arabic dialects, while the other\none with three Indonesian languages. Our results, made available as browseable\nand downloadable computational resources, extend prior linguistics research on\nkinship terminology, and provide insight into the extent of diversity even\nwithin linguistically and culturally close communities.\n","authors":["Hadi Khalilia","Gábor Bella","Abed Alhakim Freihat","Shandy Darma","Fausto Giunchiglia"],"pdf_url":"https://arxiv.org/pdf/2308.13056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13032v1","updated":"2023-08-24T18:58:10Z","published":"2023-08-24T18:58:10Z","title":"Financial News Analytics Using Fine-Tuned Llama 2 GPT Model","summary":" The paper considers the possibility to fine-tune Llama 2 Large Language Model\n(LLM) for the multitask analysis of financial news. For fine-tuning, the\nPEFT/LoRA based approach was used. In the study, the model was fine-tuned for\nthe following tasks: analysing a text from financial market perspectives,\nhighlighting main points of a text, summarizing a text and extracting named\nentities with appropriate sentiments. The obtained results show that the\nfine-tuned Llama 2 model can perform a multitask financial news analysis with a\nspecified structure of response, part of response can be a structured text and\nanother part of data can have JSON format for further processing. Extracted\nsentiments for named entities can be considered as predictive features in\nsupervised machine learning models with quantitative target variables.\n","authors":["Bohdan M. Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2308.13032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16894v3","updated":"2023-08-24T18:45:43Z","published":"2023-03-29T17:59:10Z","title":"ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with\n GPT and Prototype Guidance","summary":" Understanding 3D scenes from multi-view inputs has been proven to alleviate\nthe view discrepancy issue in 3D visual grounding. However, existing methods\nnormally neglect the view cues embedded in the text modality and fail to weigh\nthe relative importance of different views. In this paper, we propose\nViewRefer, a multi-view framework for 3D visual grounding exploring how to\ngrasp the view knowledge from both text and 3D modalities. For the text branch,\nViewRefer leverages the diverse linguistic knowledge of large-scale language\nmodels, e.g., GPT, to expand a single grounding text to multiple\ngeometry-consistent descriptions. Meanwhile, in the 3D modality, a transformer\nfusion module with inter-view attention is introduced to boost the interaction\nof objects across views. On top of that, we further present a set of learnable\nmulti-view prototypes, which memorize scene-agnostic knowledge for different\nviews, and enhance the framework from two perspectives: a view-guided attention\nmodule for more robust text features, and a view-guided scoring strategy during\nthe final prediction. With our designed paradigm, ViewRefer achieves superior\nperformance on three benchmarks and surpasses the second-best by +2.8%, +1.5%,\nand +1.35% on Sr3D, Nr3D, and ScanRefer.\n","authors":["Zoey Guo","Yiwen Tang","Ray Zhang","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2303.16894v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2202.08806v2","updated":"2023-08-24T17:46:12Z","published":"2022-02-17T18:19:53Z","title":"Grammar-Based Grounded Lexicon Learning","summary":" We present Grammar-Based Grounded Lexicon Learning (G2L2), a lexicalist\napproach toward learning a compositional and grounded meaning representation of\nlanguage from grounded data, such as paired images and texts. At the core of\nG2L2 is a collection of lexicon entries, which map each word to a tuple of a\nsyntactic type and a neuro-symbolic semantic program. For example, the word\nshiny has a syntactic type of adjective; its neuro-symbolic semantic program\nhas the symbolic form {\\lambda}x. filter(x, SHINY), where the concept SHINY is\nassociated with a neural network embedding, which will be used to classify\nshiny objects. Given an input sentence, G2L2 first looks up the lexicon entries\nassociated with each token. It then derives the meaning of the sentence as an\nexecutable neuro-symbolic program by composing lexical meanings based on\nsyntax. The recovered meaning programs can be executed on grounded inputs. To\nfacilitate learning in an exponentially-growing compositional space, we\nintroduce a joint parsing and expected execution algorithm, which does local\nmarginalization over derivations to reduce the training time. We evaluate G2L2\non two domains: visual reasoning and language-driven navigation. Results show\nthat G2L2 can generalize from small amounts of data to novel compositions of\nwords.\n","authors":["Jiayuan Mao","Haoyue Shi","Jiajun Wu","Roger P. Levy","Joshua B. Tenenbaum"],"pdf_url":"https://arxiv.org/pdf/2202.08806v2.pdf","comment":"Minor typo fixes. NeurIPS 2021. Project page:\n https://g2l2.csail.mit.edu/"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.12969v1","updated":"2023-08-24T17:59:51Z","published":"2023-08-24T17:59:51Z","title":"ROAM: Robust and Object-aware Motion Generation using Neural Pose\n Descriptors","summary":" Existing automatic approaches for 3D virtual character motion synthesis\nsupporting scene interactions do not generalise well to new objects outside\ntraining distributions, even when trained on extensive motion capture datasets\nwith diverse objects and annotated interactions. This paper addresses this\nlimitation and shows that robustness and generalisation to novel scene objects\nin 3D object-aware character synthesis can be achieved by training a motion\nmodel with as few as one reference object. We leverage an implicit feature\nrepresentation trained on object-only datasets, which encodes an\nSE(3)-equivariant descriptor field around the object. Given an unseen object\nand a reference pose-object pair, we optimise for the object-aware pose that is\nclosest in the feature space to the reference pose. Finally, we use l-NSM,\ni.e., our motion generation model that is trained to seamlessly transition from\nlocomotion to object interaction with the proposed bidirectional pose blending\nscheme. Through comprehensive numerical comparisons to state-of-the-art methods\nand in a user study, we demonstrate substantial improvements in 3D virtual\ncharacter motion and interaction quality and robustness to scenarios with\nunseen objects. Our project page is available at\nhttps://vcai.mpi-inf.mpg.de/projects/ROAM/.\n","authors":["Wanyue Zhang","Rishabh Dabral","Thomas Leimkühler","Vladislav Golyanik","Marc Habermann","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2308.12969v1.pdf","comment":"12 pages, 10 figures; project page:\n https://vcai.mpi-inf.mpg.de/projects/ROAM/"},{"id":"http://arxiv.org/abs/2308.12967v1","updated":"2023-08-24T17:59:50Z","published":"2023-08-24T17:59:50Z","title":"NeO 360: Neural Fields for Sparse View Synthesis of Outdoor Scenes","summary":" Recent implicit neural representations have shown great results for novel\nview synthesis. However, existing methods require expensive per-scene\noptimization from many views hence limiting their application to real-world\nunbounded urban settings where the objects of interest or backgrounds are\nobserved from very few views. To mitigate this challenge, we introduce a new\napproach called NeO 360, Neural fields for sparse view synthesis of outdoor\nscenes. NeO 360 is a generalizable method that reconstructs 360{\\deg} scenes\nfrom a single or a few posed RGB images. The essence of our approach is in\ncapturing the distribution of complex real-world outdoor 3D scenes and using a\nhybrid image-conditional triplanar representation that can be queried from any\nworld point. Our representation combines the best of both voxel-based and\nbird's-eye-view (BEV) representations and is more effective and expressive than\neach. NeO 360's representation allows us to learn from a large collection of\nunbounded 3D scenes while offering generalizability to new views and novel\nscenes from as few as a single image during inference. We demonstrate our\napproach on the proposed challenging 360{\\deg} unbounded dataset, called NeRDS\n360, and show that NeO 360 outperforms state-of-the-art generalizable methods\nfor novel view synthesis while also offering editing and composition\ncapabilities. Project page:\nhttps://zubair-irshad.github.io/projects/neo360.html\n","authors":["Muhammad Zubair Irshad","Sergey Zakharov","Katherine Liu","Vitor Guizilini","Thomas Kollar","Adrien Gaidon","Zsolt Kira","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2308.12967v1.pdf","comment":"Accepted to International Conference on Computer Vision (ICCV), 2023.\n Project page: https://zubair-irshad.github.io/projects/neo360.html"},{"id":"http://arxiv.org/abs/2308.12968v1","updated":"2023-08-24T17:59:50Z","published":"2023-08-24T17:59:50Z","title":"Scenimefy: Learning to Craft Anime Scene via Semi-Supervised\n Image-to-Image Translation","summary":" Automatic high-quality rendering of anime scenes from complex real-world\nimages is of significant practical value. The challenges of this task lie in\nthe complexity of the scenes, the unique features of anime style, and the lack\nof high-quality datasets to bridge the domain gap. Despite promising attempts,\nprevious efforts are still incompetent in achieving satisfactory results with\nconsistent semantic preservation, evident stylization, and fine details. In\nthis study, we propose Scenimefy, a novel semi-supervised image-to-image\ntranslation framework that addresses these challenges. Our approach guides the\nlearning with structure-consistent pseudo paired data, simplifying the pure\nunsupervised setting. The pseudo data are derived uniquely from a\nsemantic-constrained StyleGAN leveraging rich model priors like CLIP. We\nfurther apply segmentation-guided data selection to obtain high-quality pseudo\nsupervision. A patch-wise contrastive style loss is introduced to improve\nstylization and fine details. Besides, we contribute a high-resolution anime\nscene dataset to facilitate future research. Our extensive experiments\ndemonstrate the superiority of our method over state-of-the-art baselines in\nterms of both perceptual quality and quantitative performance.\n","authors":["Yuxin Jiang","Liming Jiang","Shuai Yang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2308.12968v1.pdf","comment":"ICCV 2023. The first two authors contributed equally. Code:\n https://github.com/Yuxinn-J/Scenimefy Project page:\n https://yuxinn-j.github.io/projects/Scenimefy.html"},{"id":"http://arxiv.org/abs/2308.12966v1","updated":"2023-08-24T17:59:17Z","published":"2023-08-24T17:59:17Z","title":"Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities","summary":" We introduce the Qwen-VL series, a set of large-scale vision-language models\ndesigned to perceive and understand both text and images. Comprising Qwen-VL\nand Qwen-VL-Chat, these models exhibit remarkable performance in tasks like\nimage captioning, question answering, visual localization, and flexible\ninteraction. The evaluation covers a wide range of tasks including zero-shot\ncaptioning, visual or document visual question answering, and grounding. We\ndemonstrate the Qwen-VL outperforms existing Large Vision Language Models\n(LVLMs). We present their architecture, training, capabilities, and\nperformance, highlighting their contributions to advancing multimodal\nartificial intelligence. Code, demo and models are available at\nhttps://github.com/QwenLM/Qwen-VL.\n","authors":["Jinze Bai","Shuai Bai","Shusheng Yang","Shijie Wang","Sinan Tan","Peng Wang","Junyang Lin","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12966v1.pdf","comment":"Code, demo and models are available at\n https://github.com/QwenLM/Qwen-VL"},{"id":"http://arxiv.org/abs/2308.12965v1","updated":"2023-08-24T17:59:04Z","published":"2023-08-24T17:59:04Z","title":"POCO: 3D Pose and Shape Estimation with Confidence","summary":" The regression of 3D Human Pose and Shape (HPS) from an image is becoming\nincreasingly accurate. This makes the results useful for downstream tasks like\nhuman action recognition or 3D graphics. Yet, no regressor is perfect, and\naccuracy can be affected by ambiguous image evidence or by poses and appearance\nthat are unseen during training. Most current HPS regressors, however, do not\nreport the confidence of their outputs, meaning that downstream tasks cannot\ndifferentiate accurate estimates from inaccurate ones. To address this, we\ndevelop POCO, a novel framework for training HPS regressors to estimate not\nonly a 3D human body, but also their confidence, in a single feed-forward pass.\nSpecifically, POCO estimates both the 3D body pose and a per-sample variance.\nThe key idea is to introduce a Dual Conditioning Strategy (DCS) for regressing\nuncertainty that is highly correlated to pose reconstruction quality. The POCO\nframework can be applied to any HPS regressor and here we evaluate it by\nmodifying HMR, PARE, and CLIFF. In all cases, training the network to reason\nabout uncertainty helps it learn to more accurately estimate 3D pose. While\nthis was not our goal, the improvement is modest but consistent. Our main\nmotivation is to provide uncertainty estimates for downstream tasks; we\ndemonstrate this in two ways: (1) We use the confidence estimates to bootstrap\nHPS training. Given unlabelled image data, we take the confident estimates of a\nPOCO-trained regressor as pseudo ground truth. Retraining with this\nautomatically-curated data improves accuracy. (2) We exploit uncertainty in\nvideo pose estimation by automatically identifying uncertain frames (e.g. due\nto occlusion) and inpainting these from confident frames. Code and models will\nbe available for research at https://poco.is.tue.mpg.de.\n","authors":["Sai Kumar Dwivedi","Cordelia Schmid","Hongwei Yi","Michael J. Black","Dimitrios Tzionas"],"pdf_url":"https://arxiv.org/pdf/2308.12965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12964v1","updated":"2023-08-24T17:59:01Z","published":"2023-08-24T17:59:01Z","title":"Dense Text-to-Image Generation with Attention Modulation","summary":" Existing text-to-image diffusion models struggle to synthesize realistic\nimages given dense captions, where each text prompt provides a detailed\ndescription for a specific image region. To address this, we propose\nDenseDiffusion, a training-free method that adapts a pre-trained text-to-image\nmodel to handle such dense captions while offering control over the scene\nlayout. We first analyze the relationship between generated images' layouts and\nthe pre-trained model's intermediate attention maps. Next, we develop an\nattention modulation method that guides objects to appear in specific regions\naccording to layout guidance. Without requiring additional fine-tuning or\ndatasets, we improve image generation performance given dense captions\nregarding both automatic and human evaluation scores. In addition, we achieve\nsimilar-quality visual results with models specifically trained with layout\nconditions.\n","authors":["Yunji Kim","Jiyoung Lee","Jin-Hwa Kim","Jung-Woo Ha","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.12964v1.pdf","comment":"Accepted by ICCV2023. Code and data are available at\n https://github.com/naver-ai/DenseDiffusion"},{"id":"http://arxiv.org/abs/2308.12963v1","updated":"2023-08-24T17:58:30Z","published":"2023-08-24T17:58:30Z","title":"MapPrior: Bird's-Eye View Map Layout Estimation with Generative Models","summary":" Despite tremendous advancements in bird's-eye view (BEV) perception, existing\nmodels fall short in generating realistic and coherent semantic map layouts,\nand they fail to account for uncertainties arising from partial sensor\ninformation (such as occlusion or limited coverage). In this work, we introduce\nMapPrior, a novel BEV perception framework that combines a traditional\ndiscriminative BEV perception model with a learned generative model for\nsemantic map layouts. Our MapPrior delivers predictions with better accuracy,\nrealism, and uncertainty awareness. We evaluate our model on the large-scale\nnuScenes benchmark. At the time of submission, MapPrior outperforms the\nstrongest competing method, with significantly improved MMD and ECE scores in\ncamera- and LiDAR-based BEV perception.\n","authors":["Xiyue Zhu","Vlas Zyrianov","Zhijian Liu","Shenlong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12962v1","updated":"2023-08-24T17:58:04Z","published":"2023-08-24T17:58:04Z","title":"Motion-Guided Masking for Spatiotemporal Representation Learning","summary":" Several recent works have directly extended the image masked autoencoder\n(MAE) with random masking into video domain, achieving promising results.\nHowever, unlike images, both spatial and temporal information are important for\nvideo understanding. This suggests that the random masking strategy that is\ninherited from the image MAE is less effective for video MAE. This motivates\nthe design of a novel masking algorithm that can more efficiently make use of\nvideo saliency. Specifically, we propose a motion-guided masking algorithm\n(MGM) which leverages motion vectors to guide the position of each mask over\ntime. Crucially, these motion-based correspondences can be directly obtained\nfrom information stored in the compressed format of the video, which makes our\nmethod efficient and scalable. On two challenging large-scale video benchmarks\n(Kinetics-400 and Something-Something V2), we equip video MAE with our MGM and\nachieve up to +$1.3\\%$ improvement compared to previous state-of-the-art\nmethods. Additionally, our MGM achieves equivalent performance to previous\nvideo MAE using up to $66\\%$ fewer training epochs. Lastly, we show that MGM\ngeneralizes better to downstream transfer learning and domain adaptation tasks\non the UCF101, HMDB51, and Diving48 datasets, achieving up to +$4.9\\%$\nimprovement compared to baseline methods.\n","authors":["David Fan","Jue Wang","Shuai Liao","Yi Zhu","Vimal Bhat","Hector Santos-Villalobos","Rohith MV","Xinyu Li"],"pdf_url":"https://arxiv.org/pdf/2308.12962v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12961v1","updated":"2023-08-24T17:58:03Z","published":"2023-08-24T17:58:03Z","title":"Less is More: Towards Efficient Few-shot 3D Semantic Segmentation via\n Training-free Networks","summary":" To reduce the reliance on large-scale datasets, recent works in 3D\nsegmentation resort to few-shot learning. Current 3D few-shot semantic\nsegmentation methods first pre-train the models on `seen' classes, and then\nevaluate their generalization performance on `unseen' classes. However, the\nprior pre-training stage not only introduces excessive time overhead, but also\nincurs a significant domain gap on `unseen' classes. To tackle these issues, we\npropose an efficient Training-free Few-shot 3D Segmentation netwrok, TFS3D, and\na further training-based variant, TFS3D-T. Without any learnable parameters,\nTFS3D extracts dense representations by trigonometric positional encodings, and\nachieves comparable performance to previous training-based methods. Due to the\nelimination of pre-training, TFS3D can alleviate the domain gap issue and save\na substantial amount of time. Building upon TFS3D, TFS3D-T only requires to\ntrain a lightweight query-support transferring attention (QUEST), which\nenhances the interaction between the few-shot query and support data.\nExperiments demonstrate TFS3D-T improves previous state-of-the-art methods by\n+6.93% and +17.96% mIoU respectively on S3DIS and ScanNet, while reducing the\ntraining time by -90%, indicating superior effectiveness and efficiency.\n","authors":["Xiangyang Zhu","Renrui Zhang","Bowei He","Ziyu Guo","Jiaming Liu","Hao Dong","Peng Gao"],"pdf_url":"https://arxiv.org/pdf/2308.12961v1.pdf","comment":"Code is available at https://github.com/yangyangyang127/TFS3D"},{"id":"http://arxiv.org/abs/2308.12960v1","updated":"2023-08-24T17:56:46Z","published":"2023-08-24T17:56:46Z","title":"Towards Realistic Zero-Shot Classification via Self Structural Semantic\n Alignment","summary":" Large-scale pre-trained Vision Language Models (VLMs) have proven effective\nfor zero-shot classification. Despite the success, most traditional VLMs-based\nmethods are restricted by the assumption of partial source supervision or ideal\nvocabularies, which rarely satisfy the open-world scenario. In this paper, we\naim at a more challenging setting, Realistic Zero-Shot Classification, which\nassumes no annotation but instead a broad vocabulary. To address this\nchallenge, we propose the Self Structural Semantic Alignment (S^3A) framework,\nwhich extracts the structural semantic information from unlabeled data while\nsimultaneously self-learning. Our S^3A framework adopts a unique\nCluster-Vote-Prompt-Realign (CVPR) algorithm, which iteratively groups\nunlabeled data to derive structural semantics for pseudo-supervision. Our CVPR\nprocess includes iterative clustering on images, voting within each cluster to\nidentify initial class candidates from the vocabulary, generating\ndiscriminative prompts with large language models to discern confusing\ncandidates, and realigning images and the vocabulary as structural semantic\nalignment. Finally, we propose to self-learn the CLIP image encoder with both\nindividual and structural semantic alignment through a teacher-student learning\nstrategy. Our comprehensive experiments across various generic and fine-grained\nbenchmarks demonstrate that the S^3A method offers substantial improvements\nover existing VLMs-based approaches, achieving a more than 15% accuracy\nimprovement over CLIP on average. Our codes, models, and prompts are publicly\nreleased at https://github.com/sheng-eatamath/S3A.\n","authors":["Sheng Zhang","Muzammal Naseer","Guangyi Chen","Zhiqiang Shen","Salman Khan","Kun Zhang","Fahad Khan"],"pdf_url":"https://arxiv.org/pdf/2308.12960v1.pdf","comment":"submission at 24 Aug"},{"id":"http://arxiv.org/abs/2308.12956v1","updated":"2023-08-24T17:50:21Z","published":"2023-08-24T17:50:21Z","title":"DLIP: Distilling Language-Image Pre-training","summary":" Vision-Language Pre-training (VLP) shows remarkable progress with the\nassistance of extremely heavy parameters, which challenges deployment in real\napplications. Knowledge distillation is well recognized as the essential\nprocedure in model compression. However, existing knowledge distillation\ntechniques lack an in-depth investigation and analysis of VLP, and practical\nguidelines for VLP-oriented distillation are still not yet explored. In this\npaper, we present DLIP, a simple yet efficient Distilling Language-Image\nPre-training framework, through which we investigate how to distill a light VLP\nmodel. Specifically, we dissect the model distillation from multiple\ndimensions, such as the architecture characteristics of different modules and\nthe information transfer of different modalities. We conduct comprehensive\nexperiments and provide insights on distilling a light but performant VLP\nmodel. Experimental results reveal that DLIP can achieve a state-of-the-art\naccuracy/efficiency trade-off across diverse cross-modal tasks, e.g.,\nimage-text retrieval, image captioning and visual question answering. For\nexample, DLIP compresses BLIP by 1.9x, from 213M to 108M parameters, while\nachieving comparable or better performance. Furthermore, DLIP succeeds in\nretaining more than 95% of the performance with 22.4% parameters and 24.8%\nFLOPs compared to the teacher model and accelerates inference speed by 2.7x.\n","authors":["Huafeng Kuang","Jie Wu","Xiawu Zheng","Ming Li","Xuefeng Xiao","Rui Wang","Min Zheng","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.12956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.14944v4","updated":"2023-08-24T17:46:42Z","published":"2022-03-28T17:53:06Z","title":"Differentiable Microscopy Designs an All Optical Phase Retrieval\n Microscope","summary":" Since the late 16th century, scientists have continuously innovated and\ndeveloped new microscope types for various applications. Creating a new\narchitecture from the ground up requires substantial scientific expertise and\ncreativity, often spanning years or even decades. In this study, we propose an\nalternative approach called \"Differentiable Microscopy,\" which introduces a\ntop-down design paradigm for optical microscopes. Using all-optical phase\nretrieval as an illustrative example, we demonstrate the effectiveness of\ndata-driven microscopy design through $\\partial\\mu$. Furthermore, we conduct\ncomprehensive comparisons with competing methods, showcasing the consistent\nsuperiority of our learned designs across multiple datasets, including\nbiological samples. To substantiate our ideas, we experimentally validate the\nfunctionality of one of the learned designs, providing a proof of concept. The\nproposed differentiable microscopy framework supplements the creative process\nof designing new optical systems and would perhaps lead to unconventional but\nbetter optical designs.\n","authors":["Kithmini Herath","Udith Haputhanthri","Ramith Hettiarachchi","Hasindu Kariyawasam","Raja N. Ahmad","Azeem Ahmad","Balpreet S. Ahluwalia","Chamira U. S. Edussooriya","Dushan N. Wadduwage"],"pdf_url":"https://arxiv.org/pdf/2203.14944v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12949v1","updated":"2023-08-24T17:38:14Z","published":"2023-08-24T17:38:14Z","title":"Label Budget Allocation in Multi-Task Learning","summary":" The cost of labeling data often limits the performance of machine learning\nsystems. In multi-task learning, related tasks provide information to each\nother and improve overall performance, but the label cost can vary among tasks.\nHow should the label budget (i.e. the amount of money spent on labeling) be\nallocated among different tasks to achieve optimal multi-task performance? We\nare the first to propose and formally define the label budget allocation\nproblem in multi-task learning and to empirically show that different budget\nallocation strategies make a big difference to its performance. We propose a\nTask-Adaptive Budget Allocation algorithm to robustly generate the optimal\nbudget allocation adaptive to different multi-task learning settings.\nSpecifically, we estimate and then maximize the extent of new information\nobtained from the allocated budget as a proxy for multi-task learning\nperformance. Experiments on PASCAL VOC and Taskonomy demonstrate the efficacy\nof our approach over other widely used heuristic labeling strategies.\n","authors":["Ximeng Sun","Kihyuk Sohn","Kate Saenko","Clayton Mellina","Xiao Bian"],"pdf_url":"https://arxiv.org/pdf/2308.12949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.01249v3","updated":"2023-08-24T17:30:57Z","published":"2022-10-03T22:04:00Z","title":"LOPR: Latent Occupancy PRediction using Generative Models","summary":" Environment prediction frameworks are integral for autonomous vehicles,\nenabling safe navigation in dynamic environments. LiDAR generated occupancy\ngrid maps (L-OGMs) offer a robust bird's eye-view scene representation that\nfacilitates joint scene predictions without relying on manual labeling unlike\ncommonly used trajectory prediction frameworks. Prior approaches have optimized\ndeterministic L-OGM prediction architectures directly in grid cell space. While\nthese methods have achieved some degree of success in prediction, they\noccasionally grapple with unrealistic and incorrect predictions. We claim that\nthe quality and realism of the forecasted occupancy grids can be enhanced with\nthe use of generative models. We propose a framework that decouples occupancy\nprediction into: representation learning and stochastic prediction within the\nlearned latent space. Our approach allows for conditioning the model on other\navailable sensor modalities such as RGB-cameras and high definition maps. We\ndemonstrate that our approach achieves state-of-the-art performance and is\nreadily transferable between different robotic platforms on the real-world\nNuScenes, Waymo Open, and a custom dataset we collected on an experimental\nvehicle platform.\n","authors":["Bernard Lange","Masha Itkina","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2210.01249v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00143v3","updated":"2023-08-24T17:29:24Z","published":"2022-11-30T22:28:24Z","title":"FIESTA: Autoencoders for accurate fiber segmentation in tractography","summary":" White matter bundle segmentation is a cornerstone of modern tractography to\nstudy the brain's structural connectivity in domains such as neurological\ndisorders, neurosurgery, and aging. In this study, we present FIESTA (FIbEr\nSegmentation in Tractography using Autoencoders), a reliable and robust, fully\nautomated, and easily semi-automatically calibrated pipeline based on deep\nautoencoders that can dissect and fully populate white matter bundles. This\npipeline is built upon previous works that demonstrated how autoencoders can be\nused successfully for streamline filtering, bundle segmentation, and streamline\ngeneration in tractography. Our proposed method improves bundle segmentation\ncoverage by recovering hard-to-track bundles with generative sampling through\nthe latent space seeding of the subject bundle and the atlas bundle. A latent\nspace of streamlines is learned using autoencoder-based modeling combined with\ncontrastive learning. Using an atlas of bundles in standard space (MNI), our\nproposed method segments new tractograms using the autoencoder latent distance\nbetween each tractogram streamline and its closest neighbor bundle in the atlas\nof bundles. Intra-subject bundle reliability is improved by recovering\nhard-to-track streamlines, using the autoencoder to generate new streamlines\nthat increase the spatial coverage of each bundle while remaining anatomically\ncorrect. Results show that our method is more reliable than state-of-the-art\nautomated virtual dissection methods such as RecoBundles, RecoBundlesX,\nTractSeg, White Matter Analysis and XTRACT. Our framework allows for the\ntransition from one anatomical bundle definition to another with marginal\ncalibration efforts. Overall, these results show that our framework improves\nthe practicality and usability of current state-of-the-art bundle segmentation\nframework.\n","authors":["Félix Dumais","Jon Haitz Legarreta","Carl Lemaire","Philippe Poulin","François Rheault","Laurent Petit","Muhamed Barakovic","Stefano Magon","Maxime Descoteaux","Pierre-Marc Jodoin"],"pdf_url":"https://arxiv.org/pdf/2212.00143v3.pdf","comment":"36 pages, 13 figures, accepted in NeuroImage"},{"id":"http://arxiv.org/abs/2308.12938v1","updated":"2023-08-24T17:25:36Z","published":"2023-08-24T17:25:36Z","title":"Perspective-aware Convolution for Monocular 3D Object Detection","summary":" Monocular 3D object detection is a crucial and challenging task for\nautonomous driving vehicle, while it uses only a single camera image to infer\n3D objects in the scene. To address the difficulty of predicting depth using\nonly pictorial clue, we propose a novel perspective-aware convolutional layer\nthat captures long-range dependencies in images. By enforcing convolutional\nkernels to extract features along the depth axis of every image pixel, we\nincorporates perspective information into network architecture. We integrate\nour perspective-aware convolutional layer into a 3D object detector and\ndemonstrate improved performance on the KITTI3D dataset, achieving a 23.9\\%\naverage precision in the easy benchmark. These results underscore the\nimportance of modeling scene clues for accurate depth inference and highlight\nthe benefits of incorporating scene structure in network design. Our\nperspective-aware convolutional layer has the potential to enhance object\ndetection accuracy by providing more precise and context-aware feature\nextraction.\n","authors":["Jia-Quan Yu","Soo-Chang Pei"],"pdf_url":"https://arxiv.org/pdf/2308.12938v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12937v1","updated":"2023-08-24T17:25:09Z","published":"2023-08-24T17:25:09Z","title":"Panoptic-Depth Color Map for Combination of Depth and Image Segmentation","summary":" Image segmentation and depth estimation are crucial tasks in computer vision,\nespecially in autonomous driving scenarios. Although these tasks are typically\naddressed separately, we propose an innovative approach to combine them in our\nnovel deep learning network, Panoptic-DepthLab. By incorporating an additional\ndepth estimation branch into the segmentation network, it can predict the depth\nof each instance segment. Evaluating on Cityscape dataset, we demonstrate the\neffectiveness of our method in achieving high-quality segmentation results with\ndepth and visualize it with a color map. Our proposed method demonstrates a new\npossibility of combining different tasks and networks to generate a more\ncomprehensive image recognition result to facilitate the safety of autonomous\ndriving vehicles.\n","authors":["Jia-Quan Yu","Soo-Chang Pei"],"pdf_url":"https://arxiv.org/pdf/2308.12937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11067v3","updated":"2023-08-24T17:17:18Z","published":"2023-07-20T17:46:21Z","title":"CNOS: A Strong Baseline for CAD-based Novel Object Segmentation","summary":" We propose a simple three-stage approach to segment unseen objects in RGB\nimages using their CAD models. Leveraging recent powerful foundation models,\nDINOv2 and Segment Anything, we create descriptors and generate proposals,\nincluding binary masks for a given input RGB image. By matching proposals with\nreference descriptors created from CAD models, we achieve precise object ID\nassignment along with modal masks. We experimentally demonstrate that our\nmethod achieves state-of-the-art results in CAD-based novel object\nsegmentation, surpassing existing approaches on the seven core datasets of the\nBOP challenge by 19.8% AP using the same BOP evaluation protocol. Our source\ncode is available at https://github.com/nv-nguyen/cnos.\n","authors":["Van Nguyen Nguyen","Tomas Hodan","Georgy Ponimatkin","Thibault Groueix","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2307.11067v3.pdf","comment":"ICCV 2023, R6D Workshop"},{"id":"http://arxiv.org/abs/2307.06948v2","updated":"2023-08-24T16:56:59Z","published":"2023-07-13T17:59:35Z","title":"Self-regulating Prompts: Foundational Model Adaptation without\n Forgetting","summary":" Prompt learning has emerged as an efficient alternative for fine-tuning\nfoundational models, such as CLIP, for various downstream tasks. Conventionally\ntrained using the task-specific objective, i.e., cross-entropy loss, prompts\ntend to overfit downstream data distributions and find it challenging to\ncapture task-agnostic general features from the frozen CLIP. This leads to the\nloss of the model's original generalization capability. To address this issue,\nour work introduces a self-regularization framework for prompting called\nPromptSRC (Prompting with Self-regulating Constraints). PromptSRC guides the\nprompts to optimize for both task-specific and task-agnostic general\nrepresentations using a three-pronged approach by: (a) regulating prompted\nrepresentations via mutual agreement maximization with the frozen model, (b)\nregulating with self-ensemble of prompts over the training trajectory to encode\ntheir complementary strengths, and (c) regulating with textual diversity to\nmitigate sample diversity imbalance with the visual branch. To the best of our\nknowledge, this is the first regularization framework for prompt learning that\navoids overfitting by jointly attending to pre-trained model features, the\ntraining trajectory during prompting, and the textual diversity. PromptSRC\nexplicitly steers the prompts to learn a representation space that maximizes\nperformance on downstream tasks without compromising CLIP generalization. We\nperform extensive experiments on 4 benchmarks where PromptSRC overall performs\nfavorably well compared to the existing methods. Our code and pre-trained\nmodels are publicly available at: https://github.com/muzairkhattak/PromptSRC.\n","authors":["Muhammad Uzair Khattak","Syed Talal Wasim","Muzammal Naseer","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2307.06948v2.pdf","comment":"Accepted to ICCV-2023. Camera-Ready version. Project page:\n https://muzairkhattak.github.io/PromptSRC/"},{"id":"http://arxiv.org/abs/2307.06947v3","updated":"2023-08-24T16:48:28Z","published":"2023-07-13T17:59:33Z","title":"Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action\n Recognition","summary":" Recent video recognition models utilize Transformer models for long-range\nspatio-temporal context modeling. Video transformer designs are based on\nself-attention that can model global context at a high computational cost. In\ncomparison, convolutional designs for videos offer an efficient alternative but\nlack long-range dependency modeling. Towards achieving the best of both\ndesigns, this work proposes Video-FocalNet, an effective and efficient\narchitecture for video recognition that models both local and global contexts.\nVideo-FocalNet is based on a spatio-temporal focal modulation architecture that\nreverses the interaction and aggregation steps of self-attention for better\nefficiency. Further, the aggregation step and the interaction step are both\nimplemented using efficient convolution and element-wise multiplication\noperations that are computationally less expensive than their self-attention\ncounterparts on video representations. We extensively explore the design space\nof focal modulation-based spatio-temporal context modeling and demonstrate our\nparallel spatial and temporal encoding design to be the optimal choice.\nVideo-FocalNets perform favorably well against the state-of-the-art\ntransformer-based models for video recognition on five large-scale datasets\n(Kinetics-400, Kinetics-600, SS-v2, Diving-48, and ActivityNet-1.3) at a lower\ncomputational cost. Our code/models are released at\nhttps://github.com/TalalWasim/Video-FocalNets.\n","authors":["Syed Talal Wasim","Muhammad Uzair Khattak","Muzammal Naseer","Salman Khan","Mubarak Shah","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2307.06947v3.pdf","comment":"Accepted to ICCV-2023. Camera-Ready version. Project page:\n https://TalalWasim.github.io/Video-FocalNets/"},{"id":"http://arxiv.org/abs/2308.12919v1","updated":"2023-08-24T16:47:17Z","published":"2023-08-24T16:47:17Z","title":"Towards Realistic Unsupervised Fine-tuning with CLIP","summary":" The emergence of vision-language models (VLMs), such as CLIP, has spurred a\nsignificant research effort towards their application for downstream supervised\nlearning tasks. Although some previous studies have explored the unsupervised\nfine-tuning of CLIP, they often rely on prior knowledge in the form of class\nnames associated with ground truth labels. In this paper, we delve into a\nrealistic unsupervised fine-tuning scenario by assuming that the unlabeled data\nmight contain out-of-distribution samples from unknown classes. Furthermore, we\nemphasize the importance of simultaneously enhancing out-of-distribution\ndetection capabilities alongside the recognition of instances associated with\npredefined class labels.\n To tackle this problem, we present a simple, efficient, and effective\nfine-tuning approach called Universal Entropy Optimization (UEO). UEO leverages\nsample-level confidence to approximately minimize the conditional entropy of\nconfident instances and maximize the marginal entropy of less confident\ninstances. Apart from optimizing the textual prompts, UEO also incorporates\noptimization of channel-wise affine transformations within the visual branch of\nCLIP. Through extensive experiments conducted across 15 domains and 4 different\ntypes of prior knowledge, we demonstrate that UEO surpasses baseline methods in\nterms of both generalization and out-of-distribution detection.\n","authors":["Jian Liang","Lijun Sheng","Zhengbo Wang","Ran He","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2308.12919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12914v1","updated":"2023-08-24T16:40:47Z","published":"2023-08-24T16:40:47Z","title":"Robot Pose Nowcasting: Forecast the Future to Improve the Present","summary":" In recent years, the effective and safe collaboration between humans and\nmachines has gained significant importance, particularly in the Industry 4.0\nscenario. A critical prerequisite for realizing this collaborative paradigm is\nprecisely understanding the robot's 3D pose within its environment. Therefore,\nin this paper, we introduce a novel vision-based system leveraging depth data\nto accurately establish the 3D locations of robotic joints. Specifically, we\nprove the ability of the proposed system to enhance its current pose estimation\naccuracy by jointly learning to forecast future poses. Indeed, we introduce the\nconcept of Pose Nowcasting, denoting the capability of a system to exploit the\nlearned knowledge of the future to improve the estimation of the present. The\nexperimental evaluation is conducted on two different datasets, providing\nstate-of-the-art and real-time performance and confirming the validity of the\nproposed method on both the robotic and human scenarios.\n","authors":["Alessandro Simoni","Francesco Marchetti","Guido Borghi","Federico Becattini","Lorenzo Seidenari","Roberto Vezzani","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2308.12914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06496v3","updated":"2023-08-24T16:39:42Z","published":"2023-01-16T16:10:52Z","title":"Efficient data transport over multimode light-pipes with Megapixel\n images using differentiable ray tracing and Machine-learning","summary":" Retrieving images transmitted through multi-mode fibers is of growing\ninterest, thanks to their ability to confine and transport light efficiently in\na compact system. Here, we demonstrate machine-learning-based decoding of\nlarge-scale digital images (pages), maximizing page capacity for optical\nstorage applications. Using a millimeter-sized square cross-section waveguide,\nwe image an 8-bit spatial light modulator, presenting data as a matrix of\nsymbols. Normally, decoders will incur a prohibitive O(n^2) computational\nscaling to decode n symbols in spatially scrambled data. However, by combining\na digital twin of the setup with a U-Net, we can retrieve up to 66 kB using\nefficient convolutional operations only. We compare trainable ray-tracing-based\nwith eigenmode-based twins and show the former to be superior thanks to its\nability to overcome the simulation-to-experiment gap by adjusting to optical\nimperfections. We train the pipeline end-to-end using a differentiable\nmutual-information estimator based on the von-Mises distribution, generally\napplicable to phase-coding channels.\n","authors":["Joowon Lim","Jannes Gladrow","Douglas Kelly","Greg O'Shea","Govert Verkes","Ioan Stefanovici","Sebastian Nowozin","Benn Thomsen"],"pdf_url":"https://arxiv.org/pdf/2301.06496v3.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12910v1","updated":"2023-08-24T16:35:35Z","published":"2023-08-24T16:35:35Z","title":"SCoRD: Subject-Conditional Relation Detection with Text-Augmented Data","summary":" We propose Subject-Conditional Relation Detection SCoRD, where conditioned on\nan input subject, the goal is to predict all its relations to other objects in\na scene along with their locations. Based on the Open Images dataset, we\npropose a challenging OIv6-SCoRD benchmark such that the training and testing\nsplits have a distribution shift in terms of the occurrence statistics of\n$\\langle$subject, relation, object$\\rangle$ triplets. To solve this problem, we\npropose an auto-regressive model that given a subject, it predicts its\nrelations, objects, and object locations by casting this output as a sequence\nof tokens. First, we show that previous scene-graph prediction methods fail to\nproduce as exhaustive an enumeration of relation-object pairs when conditioned\non a subject on this benchmark. Particularly, we obtain a recall@3 of 83.8% for\nour relation-object predictions compared to the 49.75% obtained by a recent\nscene graph detector. Then, we show improved generalization on both\nrelation-object and object-box predictions by leveraging during training\nrelation-object pairs obtained automatically from textual captions and for\nwhich no object-box annotations are available. Particularly, for\n$\\langle$subject, relation, object$\\rangle$ triplets for which no object\nlocations are available during training, we are able to obtain a recall@3 of\n42.59% for relation-object pairs and 32.27% for their box locations.\n","authors":["Ziyan Yang","Kushal Kafle","Zhe Lin","Scott Cohen","Zhihong Ding","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2308.12910v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00939v6","updated":"2023-08-24T16:26:54Z","published":"2022-10-03T13:50:58Z","title":"Improving Sample Quality of Diffusion Models Using Self-Attention\n Guidance","summary":" Denoising diffusion models (DDMs) have attracted attention for their\nexceptional generation quality and diversity. This success is largely\nattributed to the use of class- or text-conditional diffusion guidance methods,\nsuch as classifier and classifier-free guidance. In this paper, we present a\nmore comprehensive perspective that goes beyond the traditional guidance\nmethods. From this generalized perspective, we introduce novel condition- and\ntraining-free strategies to enhance the quality of generated images. As a\nsimple solution, blur guidance improves the suitability of intermediate samples\nfor their fine-scale information and structures, enabling diffusion models to\ngenerate higher quality samples with a moderate guidance scale. Improving upon\nthis, Self-Attention Guidance (SAG) uses the intermediate self-attention maps\nof diffusion models to enhance their stability and efficacy. Specifically, SAG\nadversarially blurs only the regions that diffusion models attend to at each\niteration and guides them accordingly. Our experimental results show that our\nSAG improves the performance of various diffusion models, including ADM, IDDPM,\nStable Diffusion, and DiT. Moreover, combining SAG with conventional guidance\nmethods leads to further improvement.\n","authors":["Susung Hong","Gyuseong Lee","Wooseok Jang","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2210.00939v6.pdf","comment":"Accepted to ICCV 2023. Project Page:\n https://ku-cvlab.github.io/Self-Attention-Guidance"},{"id":"http://arxiv.org/abs/2308.12902v1","updated":"2023-08-24T16:22:05Z","published":"2023-08-24T16:22:05Z","title":"CDAN: Convolutional Dense Attention-guided Network for Low-light Image\n Enhancement","summary":" Low-light images, characterized by inadequate illumination, pose challenges\nof diminished clarity, muted colors, and reduced details. Low-light image\nenhancement, an essential task in computer vision, aims to rectify these issues\nby improving brightness, contrast, and overall perceptual quality, thereby\nfacilitating accurate analysis and interpretation. This paper introduces the\nConvolutional Dense Attention-guided Network (CDAN), a novel solution for\nenhancing low-light images. CDAN integrates an autoencoder-based architecture\nwith convolutional and dense blocks, complemented by an attention mechanism and\nskip connections. This architecture ensures efficient information propagation\nand feature learning. Furthermore, a dedicated post-processing phase refines\ncolor balance and contrast. Our approach demonstrates notable progress compared\nto state-of-the-art results in low-light image enhancement, showcasing its\nrobustness across a wide range of challenging scenarios. Our model performs\nremarkably on benchmark datasets, effectively mitigating under-exposure and\nproficiently restoring textures and colors in diverse low-light scenarios. This\nachievement underscores CDAN's potential for diverse computer vision tasks,\nnotably enabling robust object detection and recognition in challenging\nlow-light conditions.\n","authors":["Hossein Shakibania","Sina Raoufi","Hassan Khotanlou"],"pdf_url":"https://arxiv.org/pdf/2308.12902v1.pdf","comment":"18 pages, 13 figures"},{"id":"http://arxiv.org/abs/2303.13796v3","updated":"2023-08-24T16:18:35Z","published":"2023-03-24T04:22:41Z","title":"Zolly: Zoom Focal Length Correctly for Perspective-Distorted Human Mesh\n Reconstruction","summary":" As it is hard to calibrate single-view RGB images in the wild, existing 3D\nhuman mesh reconstruction (3DHMR) methods either use a constant large focal\nlength or estimate one based on the background environment context, which can\nnot tackle the problem of the torso, limb, hand or face distortion caused by\nperspective camera projection when the camera is close to the human body. The\nnaive focal length assumptions can harm this task with the incorrectly\nformulated projection matrices. To solve this, we propose Zolly, the first\n3DHMR method focusing on perspective-distorted images. Our approach begins with\nanalysing the reason for perspective distortion, which we find is mainly caused\nby the relative location of the human body to the camera center. We propose a\nnew camera model and a novel 2D representation, termed distortion image, which\ndescribes the 2D dense distortion scale of the human body. We then estimate the\ndistance from distortion scale features rather than environment context\nfeatures. Afterwards, we integrate the distortion feature with image features\nto reconstruct the body mesh. To formulate the correct projection matrix and\nlocate the human body position, we simultaneously use perspective and\nweak-perspective projection loss. Since existing datasets could not handle this\ntask, we propose the first synthetic dataset PDHuman and extend two real-world\ndatasets tailored for this task, all containing perspective-distorted human\nimages. Extensive experiments show that Zolly outperforms existing\nstate-of-the-art methods on both perspective-distorted datasets and the\nstandard benchmark (3DPW).\n","authors":["Wenjia Wang","Yongtao Ge","Haiyi Mei","Zhongang Cai","Qingping Sun","Yanjun Wang","Chunhua Shen","Lei Yang","Taku Komura"],"pdf_url":"https://arxiv.org/pdf/2303.13796v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12898v1","updated":"2023-08-24T16:17:40Z","published":"2023-08-24T16:17:40Z","title":"Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language\n Pretraining?","summary":" The multimedia community has shown a significant interest in perceiving and\nrepresenting the physical world with multimodal pretrained neural network\nmodels, and among them, the visual-language pertaining (VLP) is, currently, the\nmost captivating topic. However, there have been few endeavors dedicated to the\nexploration of 1) whether essential linguistic knowledge (e.g., semantics and\nsyntax) can be extracted during VLP, and 2) how such linguistic knowledge\nimpact or enhance the multimodal alignment. In response, here we aim to\nelucidate the impact of comprehensive linguistic knowledge, including semantic\nexpression and syntactic structure, on multimodal alignment. Specifically, we\ndesign and release the SNARE, the first large-scale multimodal alignment\nprobing benchmark, to detect the vital linguistic components, e.g., lexical,\nsemantic, and syntax knowledge, containing four tasks: Semantic structure,\nNegation logic, Attribute ownership, and Relationship composition. Based on our\nproposed probing benchmarks, our holistic analyses of five advanced VLP models\nillustrate that the VLP model: i) shows insensitivity towards complex syntax\nstructures and relies on content words for sentence comprehension; ii)\ndemonstrates limited comprehension of combinations between sentences and\nnegations; iii) faces challenges in determining the presence of actions or\nspatial relationships within visual information and struggles with verifying\nthe correctness of triple combinations. We make our benchmark and code\navailable at \\url{https://github.com/WangFei-2019/SNARE/}.\n","authors":["Fei Wang","Liang Ding","Jun Rao","Ye Liu","Li Shen","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2308.12898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.13592v6","updated":"2023-08-24T16:17:21Z","published":"2021-12-27T10:00:16Z","title":"Multimodal Image Synthesis and Editing: The Generative AI Era","summary":" As information exists in various modalities in real world, effective\ninteraction and fusion among multimodal information plays a key role for the\ncreation and perception of multimodal data in computer vision and deep learning\nresearch. With superb power in modeling the interaction among multimodal\ninformation, multimodal image synthesis and editing has become a hot research\ntopic in recent years. Instead of providing explicit guidance for network\ntraining, multimodal guidance offers intuitive and flexible means for image\nsynthesis and editing. On the other hand, this field is also facing several\nchallenges in alignment of multimodal features, synthesis of high-resolution\nimages, faithful evaluation metrics, etc. In this survey, we comprehensively\ncontextualize the advance of the recent multimodal image synthesis and editing\nand formulate taxonomies according to data modalities and model types. We start\nwith an introduction to different guidance modalities in image synthesis and\nediting, and then describe multimodal image synthesis and editing approaches\nextensively according to their model types. After that, we describe benchmark\ndatasets and evaluation metrics as well as corresponding experimental results.\nFinally, we provide insights about the current research challenges and possible\ndirections for future research. A project associated with this survey is\navailable at https://github.com/fnzhan/Generative-AI.\n","authors":["Fangneng Zhan","Yingchen Yu","Rongliang Wu","Jiahui Zhang","Shijian Lu","Lingjie Liu","Adam Kortylewski","Christian Theobalt","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2112.13592v6.pdf","comment":"TPAMI 2023"},{"id":"http://arxiv.org/abs/2308.12896v1","updated":"2023-08-24T16:16:47Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.}\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v1.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2308.12894v1","updated":"2023-08-24T16:16:10Z","published":"2023-08-24T16:16:10Z","title":"Boosting Semantic Segmentation from the Perspective of Explicit Class\n Embeddings","summary":" Semantic segmentation is a computer vision task that associates a label with\neach pixel in an image. Modern approaches tend to introduce class embeddings\ninto semantic segmentation for deeply utilizing category semantics, and regard\nsupervised class masks as final predictions. In this paper, we explore the\nmechanism of class embeddings and have an insight that more explicit and\nmeaningful class embeddings can be generated based on class masks purposely.\nFollowing this observation, we propose ECENet, a new segmentation paradigm, in\nwhich class embeddings are obtained and enhanced explicitly during interacting\nwith multi-stage image features. Based on this, we revisit the traditional\ndecoding process and explore inverted information flow between segmentation\nmasks and class embeddings. Furthermore, to ensure the discriminability and\ninformativity of features from backbone, we propose a Feature Reconstruction\nmodule, which combines intrinsic and diverse branches together to ensure the\nconcurrence of diversity and redundancy in features. Experiments show that our\nECENet outperforms its counterparts on the ADE20K dataset with much less\ncomputational cost and achieves new state-of-the-art results on PASCAL-Context\ndataset. The code will be released at https://gitee.com/mindspore/models and\nhttps://github.com/Carol-lyh/ECENet.\n","authors":["Yuhe Liu","Chuanjian Liu","Kai Han","Quan Tang","Zengchang Qin"],"pdf_url":"https://arxiv.org/pdf/2308.12894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12880v1","updated":"2023-08-24T16:00:01Z","published":"2023-08-24T16:00:01Z","title":"Multi-stage feature decorrelation constraints for improving CNN\n classification performance","summary":" For the convolutional neural network (CNN) used for pattern classification,\nthe training loss function is usually applied to the final output of the\nnetwork, except for some regularization constraints on the network parameters.\nHowever, with the increasing of the number of network layers, the influence of\nthe loss function on the network front layers gradually decreases, and the\nnetwork parameters tend to fall into local optimization. At the same time, it\nis found that the trained network has significant information redundancy at all\nstages of features, which reduces the effectiveness of feature mapping at all\nstages and is not conducive to the change of the subsequent parameters of the\nnetwork in the direction of optimality. Therefore, it is possible to obtain a\nmore optimized solution of the network and further improve the classification\naccuracy of the network by designing a loss function for restraining the front\nstage features and eliminating the information redundancy of the front stage\nfeatures .For CNN, this article proposes a multi-stage feature decorrelation\nloss (MFD Loss), which refines effective features and eliminates information\nredundancy by constraining the correlation of features at all stages.\nConsidering that there are many layers in CNN, through experimental comparison\nand analysis, MFD Loss acts on multiple front layers of CNN, constrains the\noutput features of each layer and each channel, and performs supervision\ntraining jointly with classification loss function during network training.\nCompared with the single Softmax Loss supervised learning, the experiments on\nseveral commonly used datasets on several typical CNNs prove that the\nclassification performance of Softmax Loss+MFD Loss is significantly better.\nMeanwhile, the comparison experiments before and after the combination of MFD\nLoss and some other typical loss functions verify its good universality.\n","authors":["Qiuyu Zhu","Xuewen Zu","Chengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12870v1","updated":"2023-08-24T15:47:21Z","published":"2023-08-24T15:47:21Z","title":"VNI-Net: Vector Neurons-based Rotation-Invariant Descriptor for LiDAR\n Place Recognition","summary":" LiDAR-based place recognition plays a crucial role in Simultaneous\nLocalization and Mapping (SLAM) and LiDAR localization.\n Despite the emergence of various deep learning-based and hand-crafting-based\nmethods, rotation-induced place recognition failure remains a critical\nchallenge.\n Existing studies address this limitation through specific training strategies\nor network structures.\n However, the former does not produce satisfactory results, while the latter\nfocuses mainly on the reduced problem of SO(2) rotation invariance. Methods\ntargeting SO(3) rotation invariance suffer from limitations in discrimination\ncapability.\n In this paper, we propose a new method that employs Vector Neurons Network\n(VNN) to achieve SO(3) rotation invariance.\n We first extract rotation-equivariant features from neighboring points and\nmap low-dimensional features to a high-dimensional space through VNN.\n Afterwards, we calculate the Euclidean and Cosine distance in the\nrotation-equivariant feature space as rotation-invariant feature descriptors.\n Finally, we aggregate the features using GeM pooling to obtain global\ndescriptors.\n To address the significant information loss when formulating\nrotation-invariant descriptors, we propose computing distances between features\nat different layers within the Euclidean space neighborhood.\n This greatly improves the discriminability of the point cloud descriptors\nwhile ensuring computational efficiency.\n Experimental results on public datasets show that our approach significantly\noutperforms other baseline methods implementing rotation invariance, while\nachieving comparable results with current state-of-the-art place recognition\nmethods that do not consider rotation issues.\n","authors":["Gengxuan Tian","Junqiao Zhao","Yingfeng Cai","Fenglin Zhang","Wenjie Mu","Chen Ye"],"pdf_url":"https://arxiv.org/pdf/2308.12870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08741v2","updated":"2023-08-24T15:43:17Z","published":"2023-08-17T02:33:16Z","title":"MIPS-Fusion: Multi-Implicit-Submaps for Scalable and Robust Online\n Neural RGB-D Reconstruction","summary":" We introduce MIPS-Fusion, a robust and scalable online RGB-D reconstruction\nmethod based on a novel neural implicit representation --\nmulti-implicit-submap. Different from existing neural RGB-D reconstruction\nmethods lacking either flexibility with a single neural map or scalability due\nto extra storage of feature grids, we propose a pure neural representation\ntackling both difficulties with a divide-and-conquer design. In our method,\nneural submaps are incrementally allocated alongside the scanning trajectory\nand efficiently learned with local neural bundle adjustments. The submaps can\nbe refined individually in a back-end optimization and optimized jointly to\nrealize submap-level loop closure. Meanwhile, we propose a hybrid tracking\napproach combining randomized and gradient-based pose optimizations. For the\nfirst time, randomized optimization is made possible in neural tracking with\nseveral key designs to the learning process, enabling efficient and robust\ntracking even under fast camera motions. The extensive evaluation demonstrates\nthat our method attains higher reconstruction quality than the state of the\narts for large-scale scenes and under fast camera motions.\n","authors":["Yijie Tang","Jiazhao Zhang","Zhinan Yu","He Wang","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2308.08741v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12866v1","updated":"2023-08-24T15:43:14Z","published":"2023-08-24T15:43:14Z","title":"ToonTalker: Cross-Domain Face Reenactment","summary":" We target cross-domain face reenactment in this paper, i.e., driving a\ncartoon image with the video of a real person and vice versa. Recently, many\nworks have focused on one-shot talking face generation to drive a portrait with\na real video, i.e., within-domain reenactment. Straightforwardly applying those\nmethods to cross-domain animation will cause inaccurate expression transfer,\nblur effects, and even apparent artifacts due to the domain shift between\ncartoon and real faces. Only a few works attempt to settle cross-domain face\nreenactment. The most related work AnimeCeleb requires constructing a dataset\nwith pose vector and cartoon image pairs by animating 3D characters, which\nmakes it inapplicable anymore if no paired data is available. In this paper, we\npropose a novel method for cross-domain reenactment without paired data.\nSpecifically, we propose a transformer-based framework to align the motions\nfrom different domains into a common latent space where motion transfer is\nconducted via latent code addition. Two domain-specific motion encoders and two\nlearnable motion base memories are used to capture domain properties. A source\nquery transformer and a driving one are exploited to project domain-specific\nmotion to the canonical space. The edited motion is projected back to the\ndomain of the source with a transformer. Moreover, since no paired data is\nprovided, we propose a novel cross-domain training scheme using data from two\ndomains with the designed analogy constraint. Besides, we contribute a cartoon\ndataset in Disney style. Extensive evaluations demonstrate the superiority of\nour method over competing methods.\n","authors":["Yuan Gong","Yong Zhang","Xiaodong Cun","Fei Yin","Yanbo Fan","Xuan Wang","Baoyuan Wu","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2308.12866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12863v1","updated":"2023-08-24T15:34:31Z","published":"2023-08-24T15:34:31Z","title":"SkipcrossNets: Adaptive Skip-cross Fusion for Road Detection","summary":" Multi-modal fusion is increasingly being used for autonomous driving tasks,\nas images from different modalities provide unique information for feature\nextraction. However, the existing two-stream networks are only fused at a\nspecific network layer, which requires a lot of manual attempts to set up. As\nthe CNN goes deeper, the two modal features become more and more advanced and\nabstract, and the fusion occurs at the feature level with a large gap, which\ncan easily hurt the performance. In this study, we propose a novel fusion\narchitecture called skip-cross networks (SkipcrossNets), which combines\nadaptively LiDAR point clouds and camera images without being bound to a\ncertain fusion epoch. Specifically, skip-cross connects each layer to each\nlayer in a feed-forward manner, and for each layer, the feature maps of all\nprevious layers are used as input and its own feature maps are used as input to\nall subsequent layers for the other modality, enhancing feature propagation and\nmulti-modal features fusion. This strategy facilitates selection of the most\nsimilar feature layers from two data pipelines, providing a complementary\neffect for sparse point cloud features during fusion processes. The network is\nalso divided into several blocks to reduce the complexity of feature fusion and\nthe number of model parameters. The advantages of skip-cross fusion were\ndemonstrated through application to the KITTI and A2D2 datasets, achieving a\nMaxF score of 96.85% on KITTI and an F1 score of 84.84% on A2D2. The model\nparameters required only 2.33 MB of memory at a speed of 68.24 FPS, which could\nbe viable for mobile terminals and embedded devices.\n","authors":["Xinyu Zhang","Yan Gong","Zhiwei Li","Xin Gao","Dafeng Jin","Jun Li","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12861v1","updated":"2023-08-24T15:32:27Z","published":"2023-08-24T15:32:27Z","title":"Learned Local Attention Maps for Synthesising Vessel Segmentations","summary":" Magnetic resonance angiography (MRA) is an imaging modality for visualising\nblood vessels. It is useful for several diagnostic applications and for\nassessing the risk of adverse events such as haemorrhagic stroke (resulting\nfrom the rupture of aneurysms in blood vessels). However, MRAs are not acquired\nroutinely, hence, an approach to synthesise blood vessel segmentations from\nmore routinely acquired MR contrasts such as T1 and T2, would be useful. We\npresent an encoder-decoder model for synthesising segmentations of the main\ncerebral arteries in the circle of Willis (CoW) from only T2 MRI. We propose a\ntwo-phase multi-objective learning approach, which captures both global and\nlocal features. It uses learned local attention maps generated by dilating the\nsegmentation labels, which forces the network to only extract information from\nthe T2 MRI relevant to synthesising the CoW. Our synthetic vessel segmentations\ngenerated from only T2 MRI achieved a mean Dice score of $0.79 \\pm 0.03$ in\ntesting, compared to state-of-the-art segmentation networks such as transformer\nU-Net ($0.71 \\pm 0.04$) and nnU-net($0.68 \\pm 0.05$), while using only a\nfraction of the parameters. The main qualitative difference between our\nsynthetic vessel segmentations and the comparative models was in the sharper\nresolution of the CoW vessel segments, especially in the posterior circulation.\n","authors":["Yash Deo","Rodrigo Bonazzola","Haoran Dou","Yan Xia","Tianyou Wei","Nishant Ravikumar","Alejandro F. Frangi","Toni Lassila"],"pdf_url":"https://arxiv.org/pdf/2308.12861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05153v4","updated":"2023-08-24T15:15:44Z","published":"2022-12-10T00:18:05Z","title":"Algorithmic progress in computer vision","summary":" We investigate algorithmic progress in image classification on ImageNet,\nperhaps the most well-known test bed for computer vision. We estimate a model,\ninformed by work on neural scaling laws, and infer a decomposition of progress\ninto the scaling of compute, data, and algorithms. Using Shapley values to\nattribute performance improvements, we find that algorithmic improvements have\nbeen roughly as important as the scaling of compute for progress computer\nvision. Our estimates indicate that algorithmic innovations mostly take the\nform of compute-augmenting algorithmic advances (which enable researchers to\nget better performance from less compute), not data-augmenting algorithmic\nadvances. We find that compute-augmenting algorithmic advances are made at a\npace more than twice as fast as the rate usually associated with Moore's law.\nIn particular, we estimate that compute-augmenting innovations halve compute\nrequirements every nine months (95\\% confidence interval: 4 to 25 months).\n","authors":["Ege Erdil","Tamay Besiroglu"],"pdf_url":"https://arxiv.org/pdf/2212.05153v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12845v1","updated":"2023-08-24T15:10:28Z","published":"2023-08-24T15:10:28Z","title":"Implicit Obstacle Map-driven Indoor Navigation Model for Robust Obstacle\n Avoidance","summary":" Robust obstacle avoidance is one of the critical steps for successful\ngoal-driven indoor navigation tasks.Due to the obstacle missing in the visual\nimage and the possible missed detection issue, visual image-based obstacle\navoidance techniques still suffer from unsatisfactory robustness. To mitigate\nit, in this paper, we propose a novel implicit obstacle map-driven indoor\nnavigation framework for robust obstacle avoidance, where an implicit obstacle\nmap is learned based on the historical trial-and-error experience rather than\nthe visual image. In order to further improve the navigation efficiency, a\nnon-local target memory aggregation module is designed to leverage a non-local\nnetwork to model the intrinsic relationship between the target semantic and the\ntarget orientation clues during the navigation process so as to mine the most\ntarget-correlated object clues for the navigation decision. Extensive\nexperimental results on AI2-Thor and RoboTHOR benchmarks verify the excellent\nobstacle avoidance and navigation efficiency of our proposed method. The core\nsource code is available at https://github.com/xwaiyy123/object-navigation.\n","authors":["Wei Xie","Haobo Jiang","Shuo Gu","Jin Xie"],"pdf_url":"https://arxiv.org/pdf/2308.12845v1.pdf","comment":"9 pages, 7 figures, 43 references. This paper has been accepted for\n ACM MM 2023"},{"id":"http://arxiv.org/abs/2303.13111v3","updated":"2023-08-24T15:03:57Z","published":"2023-03-23T08:59:09Z","title":"Boosting Convolution with Efficient MLP-Permutation for Volumetric\n Medical Image Segmentation","summary":" Recently, the advent of vision Transformer (ViT) has brought substantial\nadvancements in 3D dataset benchmarks, particularly in 3D volumetric medical\nimage segmentation (Vol-MedSeg). Concurrently, multi-layer perceptron (MLP)\nnetwork has regained popularity among researchers due to their comparable\nresults to ViT, albeit with the exclusion of the resource-intensive\nself-attention module. In this work, we propose a novel permutable hybrid\nnetwork for Vol-MedSeg, named PHNet, which capitalizes on the strengths of both\nconvolution neural networks (CNNs) and MLP. PHNet addresses the intrinsic\nisotropy problem of 3D volumetric data by employing a combination of 2D and 3D\nCNNs to extract local features. Besides, we propose an efficient multi-layer\npermute perceptron (MLPP) module that captures long-range dependence while\npreserving positional information. This is achieved through an axis\ndecomposition operation that permutes the input tensor along different axes,\nthereby enabling the separate encoding of the positional information.\nFurthermore, MLPP tackles the resolution sensitivity issue of MLP in Vol-MedSeg\nwith a token segmentation operation, which divides the feature into smaller\ntokens and processes them individually. Extensive experimental results validate\nthat PHNet outperforms the state-of-the-art methods with lower computational\ncosts on the widely-used yet challenging COVID-19-20 and Synapse benchmarks.\nThe ablation study also demonstrates the effectiveness of PHNet in harnessing\nthe strengths of both CNNs and MLP.\n","authors":["Yi Lin","Xiao Fang","Dong Zhang","Kwang-Ting Cheng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2303.13111v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12840v1","updated":"2023-08-24T14:55:38Z","published":"2023-08-24T14:55:38Z","title":"FaceTouch: Detecting hand-to-face touch with supervised contrastive\n learning to assist in tracing infectious disease","summary":" Through our respiratory system, many viruses and diseases frequently spread\nand pass from one person to another. Covid-19 served as an example of how\ncrucial it is to track down and cut back on contacts to stop its spread. There\nis a clear gap in finding automatic methods that can detect hand-to-face\ncontact in complex urban scenes or indoors. In this paper, we introduce a\ncomputer vision framework, called FaceTouch, based on deep learning. It\ncomprises deep sub-models to detect humans and analyse their actions. FaceTouch\nseeks to detect hand-to-face touches in the wild, such as through video chats,\nbus footage, or CCTV feeds. Despite partial occlusion of faces, the introduced\nsystem learns to detect face touches from the RGB representation of a given\nscene by utilising the representation of the body gestures such as arm\nmovement. This has been demonstrated to be useful in complex urban scenarios\nbeyond simply identifying hand movement and its closeness to faces. Relying on\nSupervised Contrastive Learning, the introduced model is trained on our\ncollected dataset, given the absence of other benchmark datasets. The framework\nshows a strong validation in unseen datasets which opens the door for potential\ndeployment.\n","authors":["Mohamed R. Ibrahim","Terry Lyons"],"pdf_url":"https://arxiv.org/pdf/2308.12840v1.pdf","comment":"Set to be published in the PLoS ONE Journal"},{"id":"http://arxiv.org/abs/2308.12831v1","updated":"2023-08-24T14:45:03Z","published":"2023-08-24T14:45:03Z","title":"EFormer: Enhanced Transformer towards Semantic-Contour Features of\n Foreground for Portraits Matting","summary":" The portrait matting task aims to extract an alpha matte with complete\nsemantics and finely-detailed contours. In comparison to CNN-based approaches,\ntransformers with self-attention allow a larger receptive field, enabling it to\nbetter capture long-range dependencies and low-frequency semantic information\nof a portrait. However, the recent research shows that self-attention mechanism\nstruggle with modeling high-frequency information and capturing fine contour\ndetails, which can lead to bias while predicting the portrait's contours. To\naddress the problem, we propose EFormer to enhance the model's attention\ntowards semantic and contour features. Especially the latter, which is\nsurrounded by a large amount of high-frequency details. We build a semantic and\ncontour detector (SCD) to accurately capture the distribution of semantic and\ncontour features. And we further design contour-edge extraction branch and\nsemantic extraction branch for refining contour features and complete semantic\ninformation. Finally, we fuse the two kinds of features and leverage the\nsegmentation head to generate the predicted portrait matte. Remarkably, EFormer\nis an end-to-end trimap-free method and boasts a simple structure. Experiments\nconducted on VideoMatte240K-JPEGSD and AIM datasets demonstrate that EFormer\noutperforms previous portrait matte methods.\n","authors":["Zitao Wang","Qiguang Miao","Yue Xi"],"pdf_url":"https://arxiv.org/pdf/2308.12831v1.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2211.09945v6","updated":"2023-08-24T14:35:54Z","published":"2022-11-17T23:42:10Z","title":"VeriCompress: A Tool to Streamline the Synthesis of Verified Robust\n Compressed Neural Networks from Scratch","summary":" AI's widespread integration has led to neural networks (NNs) deployment on\nedge and similar limited-resource platforms for safety-critical scenarios. Yet,\nNN's fragility raises concerns about reliable inference. Moreover, constrained\nplatforms demand compact networks. This study introduces VeriCompress, a tool\nthat automates the search and training of compressed models with robustness\nguarantees. These models are well-suited for safety-critical applications and\nadhere to predefined architecture and size limitations, making them deployable\non resource-restricted platforms. The method trains models 2-3 times faster\nthan the state-of-the-art approaches, surpassing relevant baseline approaches\nby average accuracy and robustness gains of 15.1 and 9.8 percentage points,\nrespectively. When deployed on a resource-restricted generic platform, these\nmodels require 5-8 times less memory and 2-4 times less inference time than\nmodels used in verified robustness literature. Our comprehensive evaluation\nacross various model architectures and datasets, including MNIST, CIFAR, SVHN,\nand a relevant pedestrian detection dataset, showcases VeriCompress's capacity\nto identify compressed verified robust models with reduced computation overhead\ncompared to current standards. This underscores its potential as a valuable\ntool for end users, such as developers of safety-critical applications on edge\nor Internet of Things platforms, empowering them to create suitable models for\nsafety-critical, resource-constrained platforms in their respective domains.\n","authors":["Sawinder Kaur","Yi Xiao","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2211.09945v6.pdf","comment":"9 pages, 5 tables, 1 figure"},{"id":"http://arxiv.org/abs/2308.12789v1","updated":"2023-08-24T13:44:55Z","published":"2023-08-24T13:44:55Z","title":"Robotic Scene Segmentation with Memory Network for Runtime Surgical\n Context Inference","summary":" Surgical context inference has recently garnered significant attention in\nrobot-assisted surgery as it can facilitate workflow analysis, skill\nassessment, and error detection. However, runtime context inference is\nchallenging since it requires timely and accurate detection of the interactions\namong the tools and objects in the surgical scene based on the segmentation of\nvideo data. On the other hand, existing state-of-the-art video segmentation\nmethods are often biased against infrequent classes and fail to provide\ntemporal consistency for segmented masks. This can negatively impact the\ncontext inference and accurate detection of critical states. In this study, we\npropose a solution to these challenges using a Space Time Correspondence\nNetwork (STCN). STCN is a memory network that performs binary segmentation and\nminimizes the effects of class imbalance. The use of a memory bank in STCN\nallows for the utilization of past image and segmentation information, thereby\nensuring consistency of the masks. Our experiments using the publicly available\nJIGSAWS dataset demonstrate that STCN achieves superior segmentation\nperformance for objects that are difficult to segment, such as needle and\nthread, and improves context inference compared to the state-of-the-art. We\nalso demonstrate that segmentation and context inference can be performed at\nruntime without compromising performance.\n","authors":["Zongyu Li","Ian Reyes","Homa Alemzadeh"],"pdf_url":"https://arxiv.org/pdf/2308.12789v1.pdf","comment":"accepted at The IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS) 2023"},{"id":"http://arxiv.org/abs/2308.12779v1","updated":"2023-08-24T13:31:51Z","published":"2023-08-24T13:31:51Z","title":"On Offline Evaluation of 3D Object Detection for Autonomous Driving","summary":" Prior work in 3D object detection evaluates models using offline metrics like\naverage precision since closed-loop online evaluation on the downstream driving\ntask is costly. However, it is unclear how indicative offline results are of\ndriving performance. In this work, we perform the first empirical evaluation\nmeasuring how predictive different detection metrics are of driving performance\nwhen detectors are integrated into a full self-driving stack. We conduct\nextensive experiments on urban driving in the CARLA simulator using 16 object\ndetection models. We find that the nuScenes Detection Score has a higher\ncorrelation to driving performance than the widely used average precision\nmetric. In addition, our results call for caution on the exclusive reliance on\nthe emerging class of `planner-centric' metrics.\n","authors":["Tim Schreier","Katrin Renz","Andreas Geiger","Kashyap Chitta"],"pdf_url":"https://arxiv.org/pdf/2308.12779v1.pdf","comment":"Appears in: IEEE International Conference on Computer Vision\n (ICCV'23) Workshops"},{"id":"http://arxiv.org/abs/2308.12774v1","updated":"2023-08-24T13:26:18Z","published":"2023-08-24T13:26:18Z","title":"LISTER: Neighbor Decoding for Length-Insensitive Scene Text Recognition","summary":" The diversity in length constitutes a significant characteristic of text. Due\nto the long-tail distribution of text lengths, most existing methods for scene\ntext recognition (STR) only work well on short or seen-length text, lacking the\ncapability of recognizing longer text or performing length extrapolation. This\nis a crucial issue, since the lengths of the text to be recognized are usually\nnot given in advance in real-world applications, but it has not been adequately\ninvestigated in previous works. Therefore, we propose in this paper a method\ncalled Length-Insensitive Scene TExt Recognizer (LISTER), which remedies the\nlimitation regarding the robustness to various text lengths. Specifically, a\nNeighbor Decoder is proposed to obtain accurate character attention maps with\nthe assistance of a novel neighbor matrix regardless of the text lengths.\nBesides, a Feature Enhancement Module is devised to model the long-range\ndependency with low computation cost, which is able to perform iterations with\nthe neighbor decoder to enhance the feature map progressively. To the best of\nour knowledge, we are the first to achieve effective length-insensitive scene\ntext recognition. Extensive experiments demonstrate that the proposed LISTER\nalgorithm exhibits obvious superiority on long text recognition and the ability\nfor length extrapolation, while comparing favourably with the previous\nstate-of-the-art methods on standard benchmarks for STR (mainly short text).\n","authors":["Changxu Cheng","Peng Wang","Cheng Da","Qi Zheng","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2308.12774v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2303.09790v3","updated":"2023-08-24T13:11:00Z","published":"2023-03-17T06:18:16Z","title":"Reliable Multimodality Eye Disease Screening via Mixture of Student's t\n Distributions","summary":" Multimodality eye disease screening is crucial in ophthalmology as it\nintegrates information from diverse sources to complement their respective\nperformances. However, the existing methods are weak in assessing the\nreliability of each unimodality, and directly fusing an unreliable modality may\ncause screening errors. To address this issue, we introduce a novel\nmultimodality evidential fusion pipeline for eye disease screening, EyeMoSt,\nwhich provides a measure of confidence for unimodality and elegantly integrates\nthe multimodality information from a multi-distribution fusion perspective.\nSpecifically, our model estimates both local uncertainty for unimodality and\nglobal uncertainty for the fusion modality to produce reliable classification\nresults. More importantly, the proposed mixture of Student's $t$ distributions\nadaptively integrates different modalities to endow the model with heavy-tailed\nproperties, increasing robustness and reliability. Our experimental findings on\nboth public and in-house datasets show that our model is more reliable than\ncurrent methods. Additionally, EyeMost has the potential ability to serve as a\ndata quality discriminator, enabling reliable decision-making for multimodality\neye disease screening.\n","authors":["Ke Zou","Tian Lin","Xuedong Yuan","Haoyu Chen","Xiaojing Shen","Meng Wang","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2303.09790v3.pdf","comment":"MICCAI 2023 (Early accept):11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.12761v1","updated":"2023-08-24T13:08:02Z","published":"2023-08-24T13:08:02Z","title":"IP-UNet: Intensity Projection UNet Architecture for 3D Medical Volume\n Segmentation","summary":" CNNs have been widely applied for medical image analysis. However, limited\nmemory capacity is one of the most common drawbacks of processing\nhigh-resolution 3D volumetric data. 3D volumes are usually cropped or downsized\nfirst before processing, which can result in a loss of resolution, increase\nclass imbalance, and affect the performance of the segmentation algorithms. In\nthis paper, we propose an end-to-end deep learning approach called IP-UNet.\nIP-UNet is a UNet-based model that performs multi-class segmentation on\nIntensity Projection (IP) of 3D volumetric data instead of the memory-consuming\n3D volumes. IP-UNet uses limited memory capability for training without losing\nthe original 3D image resolution. We compare the performance of three models in\nterms of segmentation accuracy and computational cost: 1) Slice-by-slice 2D\nsegmentation of the CT scan images using a conventional 2D UNet model. 2)\nIP-UNet that operates on data obtained by merging the extracted Maximum\nIntensity Projection (MIP), Closest Vessel Projection (CVP), and Average\nIntensity Projection (AvgIP) representations of the source 3D volumes, then\napplying the UNet model on the output IP images. 3) 3D-UNet model directly\nreads the 3D volumes constructed from a series of CT scan images and outputs\nthe 3D volume of the predicted segmentation. We test the performance of these\nmethods on 3D volumetric images for automatic breast calcification detection.\nExperimental results show that IP-Unet can achieve similar segmentation\naccuracy with 3D-Unet but with much better performance. It reduces the training\ntime by 70\\% and memory consumption by 92\\%.\n","authors":["Nyothiri Aung","Tahar Kechadi","Liming Chen","Sahraoui Dhelim"],"pdf_url":"https://arxiv.org/pdf/2308.12761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11909v2","updated":"2023-08-24T13:05:46Z","published":"2023-08-23T04:29:40Z","title":"Edge-aware Hard Clustering Graph Pooling for Brain Imaging Data","summary":" Graph Convolutional Networks (GCNs) can capture non-Euclidean spatial\ndependence between different brain regions, and the graph pooling operator in\nGCNs is key to enhancing the representation learning capability and acquiring\nabnormal brain maps. However, the majority of existing research designs graph\npooling operators only from the perspective of nodes while disregarding the\noriginal edge features, in a way that not only confines graph pooling\napplication scenarios, but also diminishes its ability to capture critical\nsubstructures. In this study, a clustering graph pooling method that first\nsupports multidimensional edge features, called Edge-aware hard clustering\ngraph pooling (EHCPool), is developed. EHCPool proposes the first\n'Edge-to-node' score evaluation criterion based on edge features to assess node\nfeature significance. To more effectively capture the critical subgraphs, a\nnovel Iteration n-top strategy is further designed to adaptively learn sparse\nhard clustering assignments for graphs. Subsequently, an innovative N-E\nAggregation strategy is presented to aggregate node and edge feature\ninformation in each independent subgraph. The proposed model was evaluated on\nmulti-site brain imaging public datasets and yielded state-of-the-art\nperformance. We believe this method is the first deep learning tool with the\npotential to probe different types of abnormal functional brain networks from\ndata-driven perspective.\n","authors":["Cheng Zhu","Jiayi Zhu","Lijuan Zhang","Xi Wu","Shuqi Yang","Ping Liang","Honghan Chen","Ying Tan"],"pdf_url":"https://arxiv.org/pdf/2308.11909v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12757v1","updated":"2023-08-24T13:03:42Z","published":"2023-08-24T13:03:42Z","title":"PartSeg: Few-shot Part Segmentation via Part-aware Prompt Learning","summary":" In this work, we address the task of few-shot part segmentation, which aims\nto segment the different parts of an unseen object using very few labeled\nexamples. It is found that leveraging the textual space of a powerful\npre-trained image-language model (such as CLIP) can be beneficial in learning\nvisual features. Therefore, we develop a novel method termed PartSeg for\nfew-shot part segmentation based on multimodal learning. Specifically, we\ndesign a part-aware prompt learning method to generate part-specific prompts\nthat enable the CLIP model to better understand the concept of ``part'' and\nfully utilize its textual space. Furthermore, since the concept of the same\npart under different object categories is general, we establish relationships\nbetween these parts during the prompt learning process. We conduct extensive\nexperiments on the PartImageNet and Pascal$\\_$Part datasets, and the\nexperimental results demonstrated that our proposed method achieves\nstate-of-the-art performance.\n","authors":["Mengya Han","Heliang Zheng","Chaoyue Wang","Yong Luo","Han Hu","Jing Zhang","Yonggang Wen"],"pdf_url":"https://arxiv.org/pdf/2308.12757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12738v1","updated":"2023-08-24T12:32:46Z","published":"2023-08-24T12:32:46Z","title":"Learning Heavily-Degraded Prior for Underwater Object Detection","summary":" Underwater object detection suffers from low detection performance because\nthe distance and wavelength dependent imaging process yield evident image\nquality degradations such as haze-like effects, low visibility, and color\ndistortions. Therefore, we commit to resolving the issue of underwater object\ndetection with compounded environmental degradations. Typical approaches\nattempt to develop sophisticated deep architecture to generate high-quality\nimages or features. However, these methods are only work for limited ranges\nbecause imaging factors are either unstable, too sensitive, or compounded.\nUnlike these approaches catering for high-quality images or features, this\npaper seeks transferable prior knowledge from detector-friendly images. The\nprior guides detectors removing degradations that interfere with detection. It\nis based on statistical observations that, the heavily degraded regions of\ndetector-friendly (DFUI) and underwater images have evident feature\ndistribution gaps while the lightly degraded regions of them overlap each\nother. Therefore, we propose a residual feature transference module (RFTM) to\nlearn a mapping between deep representations of the heavily degraded patches of\nDFUI- and underwater- images, and make the mapping as a heavily degraded prior\n(HDP) for underwater detection. Since the statistical properties are\nindependent to image content, HDP can be learned without the supervision of\nsemantic labels and plugged into popular CNNbased feature extraction networks\nto improve their performance on underwater object detection. Without bells and\nwhistles, evaluations on URPC2020 and UODD show that our methods outperform\nCNN-based detectors by a large margin. Our method with higher speeds and less\nparameters still performs better than transformer-based detectors. Our code and\nDFUI dataset can be found in\nhttps://github.com/xiaoDetection/Learning-Heavily-Degraed-Prior.\n","authors":["Chenping Fu","Xin Fan","Jiewen Xiao","Wanqi Yuan","Risheng Liu","Zhongxuan Luo"],"pdf_url":"https://arxiv.org/pdf/2308.12738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12737v1","updated":"2023-08-24T12:27:03Z","published":"2023-08-24T12:27:03Z","title":"Asymmetric Co-Training with Explainable Cell Graph Ensembling for\n Histopathological Image Classification","summary":" Convolutional neural networks excel in histopathological image\nclassification, yet their pixel-level focus hampers explainability. Conversely,\nemerging graph convolutional networks spotlight cell-level features and medical\nimplications. However, limited by their shallowness and suboptimal use of\nhigh-dimensional pixel data, GCNs underperform in multi-class histopathological\nimage classification. To make full use of pixel-level and cell-level features\ndynamically, we propose an asymmetric co-training framework combining a deep\ngraph convolutional network and a convolutional neural network for multi-class\nhistopathological image classification. To improve the explainability of the\nentire framework by embedding morphological and topological distribution of\ncells, we build a 14-layer deep graph convolutional network to handle cell\ngraph data. For the further utilization and dynamic interactions between\npixel-level and cell-level information, we also design a co-training strategy\nto integrate the two asymmetric branches. Notably, we collect a private\nclinically acquired dataset termed LUAD7C, including seven subtypes of lung\nadenocarcinoma, which is rare and more challenging. We evaluated our approach\non the private LUAD7C and public colorectal cancer datasets, showcasing its\nsuperior performance, explainability, and generalizability in multi-class\nhistopathological image classification.\n","authors":["Ziqi Yang","Zhongyu Li","Chen Liu","Xiangde Luo","Xingguang Wang","Dou Xu","Chaoqun Li","Xiaoying Qin","Meng Yang","Long Jin"],"pdf_url":"https://arxiv.org/pdf/2308.12737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12736v1","updated":"2023-08-24T12:26:38Z","published":"2023-08-24T12:26:38Z","title":"FastSurfer-HypVINN: Automated sub-segmentation of the hypothalamus and\n adjacent structures on high-resolutional brain MRI","summary":" The hypothalamus plays a crucial role in the regulation of a broad range of\nphysiological, behavioural, and cognitive functions. However, despite its\nimportance, only a few small-scale neuroimaging studies have investigated its\nsubstructures, likely due to the lack of fully automated segmentation tools to\naddress scalability and reproducibility issues of manual segmentation. While\nthe only previous attempt to automatically sub-segment the hypothalamus with a\nneural network showed promise for 1.0 mm isotropic T1-weighted (T1w) MRI, there\nis a need for an automated tool to sub-segment also high-resolutional (HiRes)\nMR scans, as they are becoming widely available, and include structural detail\nalso from multi-modal MRI. We, therefore, introduce a novel, fast, and fully\nautomated deep learning method named HypVINN for sub-segmentation of the\nhypothalamus and adjacent structures on 0.8 mm isotropic T1w and T2w brain MR\nimages that is robust to missing modalities. We extensively validate our model\nwith respect to segmentation accuracy, generalizability, in-session test-retest\nreliability, and sensitivity to replicate hypothalamic volume effects (e.g.\nsex-differences). The proposed method exhibits high segmentation performance\nboth for standalone T1w images as well as for T1w/T2w image pairs. Even with\nthe additional capability to accept flexible inputs, our model matches or\nexceeds the performance of state-of-the-art methods with fixed inputs. We,\nfurther, demonstrate the generalizability of our method in experiments with 1.0\nmm MR scans from both the Rhineland Study and the UK Biobank. Finally, HypVINN\ncan perform the segmentation in less than a minute (GPU) and will be available\nin the open source FastSurfer neuroimaging software suite, offering a\nvalidated, efficient, and scalable solution for evaluating imaging-derived\nphenotypes of the hypothalamus.\n","authors":["Santiago Estrada","David Kügler","Emad Bahrami","Peng Xu","Dilshad Mousa","Monique M. B. Breteler","N. Ahmad Aziz","Martin Reuter"],"pdf_url":"https://arxiv.org/pdf/2308.12736v1.pdf","comment":"Submitted to Imaging Neuroscience"},{"id":"http://arxiv.org/abs/2308.06534v2","updated":"2023-08-24T12:26:06Z","published":"2023-08-12T11:31:01Z","title":"Dealing with Small Datasets for Deep Learning in Medical Imaging: An\n Evaluation of Self-Supervised Pre-Training on CT Scans Comparing Contrastive\n and Masked Autoencoder Methods for Convolutional Models","summary":" Deep learning in medical imaging has the potential to minimize the risk of\ndiagnostic errors, reduce radiologist workload, and accelerate diagnosis.\nTraining such deep learning models requires large and accurate datasets, with\nannotations for all training samples. However, in the medical imaging domain,\nannotated datasets for specific tasks are often small due to the high\ncomplexity of annotations, limited access, or the rarity of diseases. To\naddress this challenge, deep learning models can be pre-trained on large image\ndatasets without annotations using methods from the field of self-supervised\nlearning. After pre-training, small annotated datasets are sufficient to\nfine-tune the models for a specific task. The most popular self-supervised\npre-training approaches in medical imaging are based on contrastive learning.\nHowever, recent studies in natural image processing indicate a strong potential\nfor masked autoencoder approaches. Our work compares state-of-the-art\ncontrastive learning methods with the recently introduced masked autoencoder\napproach \"SparK\" for convolutional neural networks (CNNs) on medical images.\nTherefore we pre-train on a large unannotated CT image dataset and fine-tune on\nseveral CT classification tasks. Due to the challenge of obtaining sufficient\nannotated training data in medical imaging, it is of particular interest to\nevaluate how the self-supervised pre-training methods perform when fine-tuning\non small datasets. By experimenting with gradually reducing the training\ndataset size for fine-tuning, we find that the reduction has different effects\ndepending on the type of pre-training chosen. The SparK pre-training method is\nmore robust to the training dataset size than the contrastive methods. Based on\nour results, we propose the SparK pre-training for medical imaging tasks with\nonly small annotated datasets.\n","authors":["Daniel Wolf","Tristan Payer","Catharina Silvia Lisson","Christoph Gerhard Lisson","Meinrad Beer","Timo Ropinski","Michael Götz"],"pdf_url":"https://arxiv.org/pdf/2308.06534v2.pdf","comment":"This paper is under review. The code will be released if accepted"},{"id":"http://arxiv.org/abs/2308.12727v1","updated":"2023-08-24T12:06:10Z","published":"2023-08-24T12:06:10Z","title":"DeepLOC: Deep Learning-based Bone Pathology Localization and\n Classification in Wrist X-ray Images","summary":" In recent years, computer-aided diagnosis systems have shown great potential\nin assisting radiologists with accurate and efficient medical image analysis.\nThis paper presents a novel approach for bone pathology localization and\nclassification in wrist X-ray images using a combination of YOLO (You Only Look\nOnce) and the Shifted Window Transformer (Swin) with a newly proposed block.\nThe proposed methodology addresses two critical challenges in wrist X-ray\nanalysis: accurate localization of bone pathologies and precise classification\nof abnormalities. The YOLO framework is employed to detect and localize bone\npathologies, leveraging its real-time object detection capabilities.\nAdditionally, the Swin, a transformer-based module, is utilized to extract\ncontextual information from the localized regions of interest (ROIs) for\naccurate classification.\n","authors":["Razan Dibo","Andrey Galichin","Pavel Astashev","Dmitry V. Dylov","Oleg Y. Rogov"],"pdf_url":"https://arxiv.org/pdf/2308.12727v1.pdf","comment":"AIST-2023 accepted paper"},{"id":"http://arxiv.org/abs/2206.11723v4","updated":"2023-08-24T11:35:01Z","published":"2022-06-23T14:16:30Z","title":"Self-Supervised Training with Autoencoders for Visual Anomaly Detection","summary":" Deep autoencoders provide an effective tool for learning non-linear\ndimensionality reduction in an unsupervised way. Recently, they have been used\nfor the task of anomaly detection in the visual domain. By optimizing for the\nreconstruction error using anomaly-free examples, the common belief is that a\ncorresponding network should fail to accurately reconstruct anomalous regions\nin the application phase. This goal is typically addressed by controlling the\ncapacity of the network, either by reducing the size of the bottleneck layer or\nby enforcing sparsity constraints on the activations. However, neither of these\ntechniques does explicitly penalize reconstruction of anomalous signals often\nresulting in poor detection. We tackle this problem by adapting a\nself-supervised learning regime that allows the use of discriminative\ninformation during training but focuses on the data manifold of normal\nexamples. We emphasize that inference with our approach is very efficient\nduring training and prediction requiring a single forward pass for each input\nimage. Our experiments on the MVTec AD dataset demonstrate high detection and\nlocalization performance. On the texture-subset, in particular, our approach\nconsistently outperforms recent anomaly detection methods by a significant\nmargin.\n","authors":["Alexander Bauer","Shinichi Nakajima","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2206.11723v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08730v2","updated":"2023-08-24T11:31:43Z","published":"2023-08-17T01:59:59Z","title":"Learning A Coarse-to-Fine Diffusion Transformer for Image Restoration","summary":" Recent years have witnessed the remarkable performance of diffusion models in\nvarious vision tasks. However, for image restoration that aims to recover clear\nimages with sharper details from given degraded observations, diffusion-based\nmethods may fail to recover promising results due to inaccurate noise\nestimation. Moreover, simple constraining noises cannot effectively learn\ncomplex degradation information, which subsequently hinders the model capacity.\nTo solve the above problems, we propose a coarse-to-fine diffusion Transformer\n(C2F-DFT) for image restoration. Specifically, our C2F-DFT contains diffusion\nself-attention (DFSA) and diffusion feed-forward network (DFN) within a new\ncoarse-to-fine training scheme. The DFSA and DFN respectively capture the\nlong-range diffusion dependencies and learn hierarchy diffusion representation\nto facilitate better restoration. In the coarse training stage, our C2F-DFT\nestimates noises and then generates the final clean image by a sampling\nalgorithm. To further improve the restoration quality, we propose a simple yet\neffective fine training scheme. It first exploits the coarse-trained diffusion\nmodel with fixed steps to generate restoration results, which then would be\nconstrained with corresponding ground-truth ones to optimize the models to\nremedy the unsatisfactory results affected by inaccurate noise estimation.\nExtensive experiments show that C2F-DFT significantly outperforms\ndiffusion-based restoration method IR-SDE and achieves competitive performance\ncompared with Transformer-based state-of-the-art methods on $3$ tasks,\nincluding deraining, deblurring, and real denoising. The code is available at\nhttps://github.com/wlydlut/C2F-DFT.\n","authors":["Liyan Wang","Qinyu Yang","Cong Wang","Wei Wang","Jinshan Pan","Zhixun Su"],"pdf_url":"https://arxiv.org/pdf/2308.08730v2.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.09323v2","updated":"2023-08-24T11:25:43Z","published":"2023-07-18T15:07:39Z","title":"Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking\n Portrait Synthesis","summary":" This paper presents ER-NeRF, a novel conditional Neural Radiance Fields\n(NeRF) based architecture for talking portrait synthesis that can concurrently\nachieve fast convergence, real-time rendering, and state-of-the-art performance\nwith small model size. Our idea is to explicitly exploit the unequal\ncontribution of spatial regions to guide talking portrait modeling.\nSpecifically, to improve the accuracy of dynamic head reconstruction, a compact\nand expressive NeRF-based Tri-Plane Hash Representation is introduced by\npruning empty spatial regions with three planar hash encoders. For speech\naudio, we propose a Region Attention Module to generate region-aware condition\nfeature via an attention mechanism. Different from existing methods that\nutilize an MLP-based encoder to learn the cross-modal relation implicitly, the\nattention mechanism builds an explicit connection between audio features and\nspatial regions to capture the priors of local motions. Moreover, a direct and\nfast Adaptive Pose Encoding is introduced to optimize the head-torso separation\nproblem by mapping the complex transformation of the head pose into spatial\ncoordinates. Extensive experiments demonstrate that our method renders better\nhigh-fidelity and audio-lips synchronized talking portrait videos, with\nrealistic details and high efficiency compared to previous methods.\n","authors":["Jiahe Li","Jiawei Zhang","Xiao Bai","Jun Zhou","Lin Gu"],"pdf_url":"https://arxiv.org/pdf/2307.09323v2.pdf","comment":"Accepted by ICCV 2023. Project page:\n https://fictionarry.github.io/ER-NeRF/"},{"id":"http://arxiv.org/abs/2308.12714v1","updated":"2023-08-24T11:21:05Z","published":"2023-08-24T11:21:05Z","title":"VIGC: Visual Instruction Generation and Correction","summary":" The integration of visual encoders and large language models (LLMs) has\ndriven recent progress in multimodal large language models (MLLMs). However,\nthe scarcity of high-quality instruction-tuning data for vision-language tasks\nremains a challenge. The current leading paradigm, such as LLaVA, relies on\nlanguage-only GPT-4 to generate data, which requires pre-annotated image\ncaptions and detection bounding boxes, suffering from understanding image\ndetails. A practical solution to this problem would be to utilize the available\nmultimodal large language models (MLLMs) to generate instruction data for\nvision-language tasks. However, it's worth noting that the currently accessible\nMLLMs are not as powerful as their LLM counterparts, as they tend to produce\ninadequate responses and generate false information. As a solution for\naddressing the current issue, this paper proposes the Visual Instruction\nGeneration and Correction (VIGC) framework that enables multimodal large\nlanguage models to generate instruction-tuning data and progressively enhance\nits quality on-the-fly. Specifically, Visual Instruction Generation (VIG)\nguides the vision-language model to generate diverse instruction-tuning data.\nTo ensure generation quality, Visual Instruction Correction (VIC) adopts an\niterative update mechanism to correct any inaccuracies in data produced by VIG,\neffectively reducing the risk of hallucination. Leveraging the diverse,\nhigh-quality data generated by VIGC, we finetune mainstream models and validate\ndata quality based on various evaluations. Experimental results demonstrate\nthat VIGC not only compensates for the shortcomings of language-only data\ngeneration methods, but also effectively enhances the benchmark performance.\nThe models, datasets, and code will be made publicly available.\n","authors":["Bin Wang","Fan Wu","Xiao Han","Jiahui Peng","Huaping Zhong","Pan Zhang","Xiaoyi Dong","Weijia Li","Wei Li","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2308.12714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12844v3","updated":"2023-08-24T11:11:55Z","published":"2021-10-25T12:13:45Z","title":"Reconstructing Pruned Filters using Cheap Spatial Transformations","summary":" We present an efficient alternative to the convolutional layer using cheap\nspatial transformations. This construction exploits an inherent spatial\nredundancy of the learned convolutional filters to enable a much greater\nparameter efficiency, while maintaining the top-end accuracy of their dense\ncounter-parts. Training these networks is modelled as a generalised pruning\nproblem, whereby the pruned filters are replaced with cheap transformations\nfrom the set of non-pruned filters. We provide an efficient implementation of\nthe proposed layer, followed by two natural extensions to avoid excessive\nfeature compression and to improve the expressivity of the transformed\nfeatures. We show that these networks can achieve comparable or improved\nperformance to state-of-the-art pruning models across both the CIFAR-10 and\nImageNet-1K datasets.\n","authors":["Roy Miles","Krystian Mikolajczyk"],"pdf_url":"https://arxiv.org/pdf/2110.12844v3.pdf","comment":"ICCV 2023 Workshop on Resource Efficient Deep Learning for Computer\n Vision"},{"id":"http://arxiv.org/abs/2308.12712v1","updated":"2023-08-24T11:11:26Z","published":"2023-08-24T11:11:26Z","title":"Ground-to-Aerial Person Search: Benchmark Dataset and Approach","summary":" In this work, we construct a large-scale dataset for Ground-to-Aerial Person\nSearch, named G2APS, which contains 31,770 images of 260,559 annotated bounding\nboxes for 2,644 identities appearing in both of the UAVs and ground\nsurveillance cameras. To our knowledge, this is the first dataset for\ncross-platform intelligent surveillance applications, where the UAVs could work\nas a powerful complement for the ground surveillance cameras. To more\nrealistically simulate the actual cross-platform Ground-to-Aerial surveillance\nscenarios, the surveillance cameras are fixed about 2 meters above the ground,\nwhile the UAVs capture videos of persons at different location, with a variety\nof view-angles, flight attitudes and flight modes. Therefore, the dataset has\nthe following unique characteristics: 1) drastic view-angle changes between\nquery and gallery person images from cross-platform cameras; 2) diverse\nresolutions, poses and views of the person images under 9 rich real-world\nscenarios. On basis of the G2APS benchmark dataset, we demonstrate detailed\nanalysis about current two-step and end-to-end person search methods, and\nfurther propose a simple yet effective knowledge distillation scheme on the\nhead of the ReID network, which achieves state-of-the-art performances on both\nof the G2APS and the previous two public person search datasets, i.e., PRW and\nCUHK-SYSU. The dataset and source code available on\n\\url{https://github.com/yqc123456/HKD_for_person_search}.\n","authors":["Shizhou Zhang","Qingchun Yang","De Cheng","Yinghui Xing","Guoqiang Liang","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.12712v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.07703v2","updated":"2023-08-24T10:38:39Z","published":"2023-06-13T11:41:15Z","title":"E2E-LOAD: End-to-End Long-form Online Action Detection","summary":" Recently, there has been a growing trend toward feature-based approaches for\nOnline Action Detection (OAD). However, these approaches have limitations due\nto their fixed backbone design, which ignores the potential capability of a\ntrainable backbone. In this paper, we propose the first end-to-end OAD model,\ntermed E2E-LOAD, designed to address the major challenge of OAD, namely,\nlong-term understanding and efficient online reasoning. Specifically, our\nproposed approach adopts an initial spatial model that is shared by all frames\nand maintains a long sequence cache for inference at a low computational cost.\nWe also advocate an asymmetric spatial-temporal model for long-form and\nshort-form modeling effectively. Furthermore, we propose a novel and efficient\ninference mechanism that accelerates heavy spatial-temporal exploration.\nExtensive ablation studies and experiments demonstrate the effectiveness and\nefficiency of our proposed method. Notably, we achieve 17.3 (+12.6) FPS for\nend-to-end OAD with 72.4%~(+1.2%), 90.3%~(+0.7%), and 48.1%~(+26.0%) mAP on\nTHMOUS14, TVSeries, and HDD, respectively, which is 3x faster than previous\napproaches. The source code will be made publicly available.\n","authors":["Shuqiang Cao","Weixin Luo","Bairui Wang","Wei Zhang","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2306.07703v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12700v1","updated":"2023-08-24T10:37:00Z","published":"2023-08-24T10:37:00Z","title":"A Parse-Then-Place Approach for Generating Graphic Layouts from Textual\n Descriptions","summary":" Creating layouts is a fundamental step in graphic design. In this work, we\npropose to use text as the guidance to create graphic layouts, i.e.,\nText-to-Layout, aiming to lower the design barriers. Text-to-Layout is a\nchallenging task, because it needs to consider the implicit, combined, and\nincomplete layout constraints from text, each of which has not been studied in\nprevious work. To address this, we present a two-stage approach, named\nparse-then-place. The approach introduces an intermediate representation (IR)\nbetween text and layout to represent diverse layout constraints. With IR,\nText-to-Layout is decomposed into a parse stage and a place stage. The parse\nstage takes a textual description as input and generates an IR, in which the\nimplicit constraints from the text are transformed into explicit ones. The\nplace stage generates layouts based on the IR. To model combined and incomplete\nconstraints, we use a Transformer-based layout generation model and carefully\ndesign a way to represent constraints and layouts as sequences. Besides, we\nadopt the pretrain-then-finetune strategy to boost the performance of the\nlayout generation model with large-scale unlabeled layouts. To evaluate our\napproach, we construct two Text-to-Layout datasets and conduct experiments on\nthem. Quantitative results, qualitative analysis, and user studies demonstrate\nthe effectiveness of our approach.\n","authors":["Jiawei Lin","Jiaqi Guo","Shizhao Sun","Weijiang Xu","Ting Liu","Jian-Guang Lou","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.12700v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2306.08713v2","updated":"2023-08-24T10:06:59Z","published":"2023-06-14T19:31:50Z","title":"What can a cook in Italy teach a mechanic in India? Action Recognition\n Generalisation Over Scenarios and Locations","summary":" We propose and address a new generalisation problem: can a model trained for\naction recognition successfully classify actions when they are performed within\na previously unseen scenario and in a previously unseen location? To answer\nthis question, we introduce the Action Recognition Generalisation Over\nscenarios and locations dataset (ARGO1M), which contains 1.1M video clips from\nthe large-scale Ego4D dataset, across 10 scenarios and 13 locations. We\ndemonstrate recognition models struggle to generalise over 10 proposed test\nsplits, each of an unseen scenario in an unseen location. We thus propose CIR,\na method to represent each video as a Cross-Instance Reconstruction of videos\nfrom other domains. Reconstructions are paired with text narrations to guide\nthe learning of a domain generalisable representation. We provide extensive\nanalysis and ablations on ARGO1M that show CIR outperforms prior domain\ngeneralisation works on all test splits. Code and data:\nhttps://chiaraplizz.github.io/what-can-a-cook/.\n","authors":["Chiara Plizzari","Toby Perrett","Barbara Caputo","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2306.08713v2.pdf","comment":"Accepted at ICCV 2023. Project page:\n https://chiaraplizz.github.io/what-can-a-cook/"},{"id":"http://arxiv.org/abs/2308.12679v1","updated":"2023-08-24T09:38:54Z","published":"2023-08-24T09:38:54Z","title":"A Continual Learning Approach for Cross-Domain White Blood Cell\n Classification","summary":" Accurate classification of white blood cells in peripheral blood is essential\nfor diagnosing hematological diseases. Due to constantly evolving clinical\nsettings, data sources, and disease classifications, it is necessary to update\nmachine learning classification models regularly for practical real-world use.\nSuch models significantly benefit from sequentially learning from incoming data\nstreams without forgetting previously acquired knowledge. However, models can\nsuffer from catastrophic forgetting, causing a drop in performance on previous\ntasks when fine-tuned on new data. Here, we propose a rehearsal-based continual\nlearning approach for class incremental and domain incremental scenarios in\nwhite blood cell classification. To choose representative samples from previous\ntasks, we employ exemplar set selection based on the model's predictions. This\ninvolves selecting the most confident samples and the most challenging samples\nidentified through uncertainty estimation of the model. We thoroughly evaluated\nour proposed approach on three white blood cell classification datasets that\ndiffer in color, resolution, and class composition, including scenarios where\nnew domains or new classes are introduced to the model with every task. We also\ntest a long class incremental experiment with both new domains and new classes.\nOur results demonstrate that our approach outperforms established baselines in\ncontinual learning, including existing iCaRL and EWC methods for classifying\nwhite blood cells in cross-domain environments.\n","authors":["Ario Sadafi","Raheleh Salehi","Armin Gruber","Sayedali Shetab Boushehri","Pascal Giehr","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2308.12679v1.pdf","comment":"Accepted for publication at workshop on Domain Adaptation and\n Representation Transfer (DART) in International Conference on Medical Image\n Computing and Computer Assisted Intervention (MICCAI 2023)"},{"id":"http://arxiv.org/abs/2308.12675v1","updated":"2023-08-24T09:32:46Z","published":"2023-08-24T09:32:46Z","title":"A Study of Age and Sex Bias in Multiple Instance Learning based\n Classification of Acute Myeloid Leukemia Subtypes","summary":" Accurate classification of Acute Myeloid Leukemia (AML) subtypes is crucial\nfor clinical decision-making and patient care. In this study, we investigate\nthe potential presence of age and sex bias in AML subtype classification using\nMultiple Instance Learning (MIL) architectures. To that end, we train multiple\nMIL models using different levels of sex imbalance in the training set and\nexcluding certain age groups. To assess the sex bias, we evaluate the\nperformance of the models on male and female test sets. For age bias, models\nare tested against underrepresented age groups in the training data. We find a\nsignificant effect of sex and age bias on the performance of the model for AML\nsubtype classification. Specifically, we observe that females are more likely\nto be affected by sex imbalance dataset and certain age groups, such as\npatients with 72 to 86 years of age with the RUNX1::RUNX1T1 genetic subtype,\nare significantly affected by an age bias present in the training data.\nEnsuring inclusivity in the training data is thus essential for generating\nreliable and equitable outcomes in AML genetic subtype classification,\nultimately benefiting diverse patient populations.\n","authors":["Ario Sadafi","Matthias Hehr","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2308.12675v1.pdf","comment":"Accepted for publication at workshop on Fairness of AI in Medical\n Imaging in International Conference on Medical Image Computing and Computer\n Assisted Intervention (MICCAI 2023)"},{"id":"http://arxiv.org/abs/2308.12673v1","updated":"2023-08-24T09:31:02Z","published":"2023-08-24T09:31:02Z","title":"Masked Feature Modelling: Feature Masking for the Unsupervised\n Pre-training of a Graph Attention Network Block for Bottom-up Video Event\n Recognition","summary":" In this paper, we introduce Masked Feature Modelling (MFM), a novel approach\nfor the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM\nutilizes a pretrained Visual Tokenizer to reconstruct masked features of\nobjects within a video, leveraging the MiniKinetics dataset. We then\nincorporate the pre-trained GAT block into a state-of-the-art bottom-up\nsupervised video-event recognition architecture, ViGAT, to improve the model's\nstarting point and overall accuracy. Experimental evaluations on the YLI-MED\ndataset demonstrate the effectiveness of MFM in improving event recognition\nperformance.\n","authors":["Dimitrios Daskalakis","Nikolaos Gkalelis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2308.12673v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2307.07540v2","updated":"2023-08-24T09:11:26Z","published":"2023-07-14T14:09:09Z","title":"Flow-Guided Controllable Line Drawing Generation","summary":" In this paper, we investigate the problem of automatically controllable\nartistic character line drawing generation from photographs by proposing a\nVector Flow Aware and Line Controllable Image-to-Image Translation\narchitecture, which can be viewed as an appealing intersection between\nArtificial Intelligence and Arts. Specifically, we first present an\nImage-to-Flow network (I2FNet) to efficiently and robustly create the vector\nflow field in a learning-based manner, which can provide a direction guide for\ndrawing lines. Then, we introduce our well-designed Double Flow Generator (DFG)\nframework to fuse features from learned vector flow and input image flow\nguaranteeing the spatial coherence of lines. Meanwhile, in order to allow for\ncontrollable character line drawing generation, we integrate a Line Control\nMatrix (LCM) into DFG and train a Line Control Regressor (LCR) to synthesize\ndrawings with different styles by elaborately controlling the level of details,\nsuch as thickness, smoothness, and continuity, of lines. Finally, we design a\nFourier Transformation Loss to further constrain the character line generation\nfrom the frequency domain view of the point. Quantitative and qualitative\nexperiments demonstrate that our approach can obtain superior performance in\nproducing high-resolution character line-drawing images with perceptually\nrealistic characteristics.\n","authors":["Chengyu Fang","Xianfeng Han"],"pdf_url":"https://arxiv.org/pdf/2307.07540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12661v1","updated":"2023-08-24T09:10:10Z","published":"2023-08-24T09:10:10Z","title":"Don't Look into the Sun: Adversarial Solarization Attacks on Image\n Classifiers","summary":" Assessing the robustness of deep neural networks against out-of-distribution\ninputs is crucial, especially in safety-critical domains like autonomous\ndriving, but also in safety systems where malicious actors can digitally alter\ninputs to circumvent safety guards. However, designing effective\nout-of-distribution tests that encompass all possible scenarios while\npreserving accurate label information is a challenging task. Existing\nmethodologies often entail a compromise between variety and constraint levels\nfor attacks and sometimes even both. In a first step towards a more holistic\nrobustness evaluation of image classification models, we introduce an attack\nmethod based on image solarization that is conceptually straightforward yet\navoids jeopardizing the global structure of natural images independent of the\nintensity. Through comprehensive evaluations of multiple ImageNet models, we\ndemonstrate the attack's capacity to degrade accuracy significantly, provided\nit is not integrated into the training augmentations. Interestingly, even then,\nno full immunity to accuracy deterioration is achieved. In other settings, the\nattack can often be simplified into a black-box attack with model-independent\nparameters. Defenses against other corruptions do not consistently extend to be\neffective against our specific attack.\n Project website: https://github.com/paulgavrikov/adversarial_solarization\n","authors":["Paul Gavrikov","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2308.12661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11413v2","updated":"2023-08-24T08:44:25Z","published":"2023-07-21T08:15:39Z","title":"A Video-based Detector for Suspicious Activity in Examination with\n OpenPose","summary":" Examinations are a crucial part of the learning process, and academic\ninstitutions invest significant resources into maintaining their integrity by\npreventing cheating from students or facilitators. However, cheating has become\nrampant in examination setups, compromising their integrity. The traditional\nmethod of relying on invigilators to monitor every student is impractical and\nineffective. To address this issue, there is a need to continuously record exam\nsessions to monitor students for suspicious activities. However, these\nrecordings are often too lengthy for invigilators to analyze effectively, and\nfatigue may cause them to miss significant details. To widen the coverage,\ninvigilators could use fixed overhead or wearable cameras. This paper\nintroduces a framework that uses automation to analyze videos and detect\nsuspicious activities during examinations efficiently and effectively. We\nutilized the OpenPose framework and Convolutional Neural Network (CNN) to\nidentify students exchanging objects during exams. This detection system is\nvital in preventing cheating and promoting academic integrity, fairness, and\nquality education for institutions.\n","authors":["Reuben Moyo","Stanley Ndebvu","Michael Zimba","Jimmy Mbelwa"],"pdf_url":"https://arxiv.org/pdf/2307.11413v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12645v1","updated":"2023-08-24T08:41:40Z","published":"2023-08-24T08:41:40Z","title":"An All Deep System for Badminton Game Analysis","summary":" The CoachAI Badminton 2023 Track1 initiative aim to automatically detect\nevents within badminton match videos. Detecting small objects, especially the\nshuttlecock, is of quite importance and demands high precision within the\nchallenge. Such detection is crucial for tasks like hit count, hitting time,\nand hitting location. However, even after revising the well-regarded\nshuttlecock detecting model, TrackNet, our object detection models still fall\nshort of the desired accuracy. To address this issue, we've implemented various\ndeep learning methods to tackle the problems arising from noisy detectied data,\nleveraging diverse data types to improve precision. In this report, we detail\nthe detection model modifications we've made and our approach to the 11 tasks.\nNotably, our system garnered a score of 0.78 out of 1.0 in the challenge.\n","authors":["Po-Yung Chou","Yu-Chun Lo","Bo-Zheng Xie","Cheng-Hung Lin","Yu-Yung Kao"],"pdf_url":"https://arxiv.org/pdf/2308.12645v1.pdf","comment":"Golden Award for IJCAI CoachAI Challenge 2023: Team NTNUEE AIoTLab"},{"id":"http://arxiv.org/abs/2308.05983v2","updated":"2023-08-24T08:35:33Z","published":"2023-08-11T07:38:46Z","title":"Face Encryption via Frequency-Restricted Identity-Agnostic Attacks","summary":" Billions of people are sharing their daily live images on social media\neveryday. However, malicious collectors use deep face recognition systems to\neasily steal their biometric information (e.g., faces) from these images. Some\nstudies are being conducted to generate encrypted face photos using adversarial\nattacks by introducing imperceptible perturbations to reduce face information\nleakage. However, existing studies need stronger black-box scenario feasibility\nand more natural visual appearances, which challenge the feasibility of privacy\nprotection. To address these problems, we propose a frequency-restricted\nidentity-agnostic (FRIA) framework to encrypt face images from unauthorized\nface recognition without access to personal information. As for the weak\nblack-box scenario feasibility, we obverse that representations of the average\nfeature in multiple face recognition models are similar, thus we propose to\nutilize the average feature via the crawled dataset from the Internet as the\ntarget to guide the generation, which is also agnostic to identities of unknown\nface recognition systems; in nature, the low-frequency perturbations are more\nvisually perceptible by the human vision system. Inspired by this, we restrict\nthe perturbation in the low-frequency facial regions by discrete cosine\ntransform to achieve the visual naturalness guarantee. Extensive experiments on\nseveral face recognition models demonstrate that our FRIA outperforms other\nstate-of-the-art methods in generating more natural encrypted faces while\nattaining high black-box attack success rates of 96%. In addition, we validate\nthe efficacy of FRIA using real-world black-box commercial API, which reveals\nthe potential of FRIA in practice. Our codes can be found in\nhttps://github.com/XinDong10/FRIA.\n","authors":["Xin Dong","Rui Wang","Siyuan Liang","Aishan Liu","Lihua Jing"],"pdf_url":"https://arxiv.org/pdf/2308.05983v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12642v1","updated":"2023-08-24T08:35:12Z","published":"2023-08-24T08:35:12Z","title":"Tag-Based Annotation for Avatar Face Creation","summary":" Currently, digital avatars can be created manually using human images as\nreference. Systems such as Bitmoji are excellent producers of detailed avatar\ndesigns, with hundreds of choices for customization. A supervised learning\nmodel could be trained to generate avatars automatically, but the hundreds of\npossible options create difficulty in securing non-noisy data to train a model.\nAs a solution, we train a model to produce avatars from human images using\ntag-based annotations. This method provides better annotator agreement, leading\nto less noisy data and higher quality model predictions. Our contribution is an\napplication of tag-based annotation to train a model for avatar face creation.\nWe design tags for 3 different facial facial features offered by Bitmoji, and\ntrain a model using tag-based annotation to predict the nose.\n","authors":["An Ngo","Daniel Phelps","Derrick Lai","Thanyared Wong","Lucas Mathias","Anish Shivamurthy","Mustafa Ajmal","Minghao Liu","James Davis"],"pdf_url":"https://arxiv.org/pdf/2308.12642v1.pdf","comment":"9 pages, 5 figures, 18 tables"},{"id":"http://arxiv.org/abs/2304.08134v4","updated":"2023-08-24T08:31:31Z","published":"2023-04-17T10:29:26Z","title":"Tackling Face Verification Edge Cases: In-Depth Analysis and\n Human-Machine Fusion Approach","summary":" Nowadays, face recognition systems surpass human performance on several\ndatasets. However, there are still edge cases that the machine can't correctly\nclassify. This paper investigates the effect of a combination of machine and\nhuman operators in the face verification task. First, we look closer at the\nedge cases for several state-of-the-art models to discover common datasets'\nchallenging settings. Then, we conduct a study with 60 participants on these\nselected tasks with humans and provide an extensive analysis. Finally, we\ndemonstrate that combining machine and human decisions can further improve the\nperformance of state-of-the-art face verification systems on various benchmark\ndatasets. Code and data are publicly available on GitHub.\n","authors":["Martin Knoche","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2304.08134v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12634v1","updated":"2023-08-24T08:19:15Z","published":"2023-08-24T08:19:15Z","title":"Towards Hierarchical Regional Transformer-based Multiple Instance\n Learning","summary":" The classification of gigapixel histopathology images with deep multiple\ninstance learning models has become a critical task in digital pathology and\nprecision medicine. In this work, we propose a Transformer-based multiple\ninstance learning approach that replaces the traditional learned attention\nmechanism with a regional, Vision Transformer inspired self-attention\nmechanism. We present a method that fuses regional patch information to derive\nslide-level predictions and show how this regional aggregation can be stacked\nto hierarchically process features on different distance levels. To increase\npredictive accuracy, especially for datasets with small, local morphological\nfeatures, we introduce a method to focus the image processing on high attention\nregions during inference. Our approach is able to significantly improve\nperformance over the baseline on two histopathology datasets and points towards\npromising directions for further research.\n","authors":["Josef Cersovsky","Sadegh Mohammadi","Dagmar Kainmueller","Johannes Hoehne"],"pdf_url":"https://arxiv.org/pdf/2308.12634v1.pdf","comment":"To be published as ICCV 2023 workshop paper"},{"id":"http://arxiv.org/abs/2303.12077v3","updated":"2023-08-24T08:15:35Z","published":"2023-03-21T17:59:22Z","title":"VAD: Vectorized Scene Representation for Efficient Autonomous Driving","summary":" Autonomous driving requires a comprehensive understanding of the surrounding\nenvironment for reliable trajectory planning. Previous works rely on dense\nrasterized scene representation (e.g., agent occupancy and semantic map) to\nperform planning, which is computationally intensive and misses the\ninstance-level structure information. In this paper, we propose VAD, an\nend-to-end vectorized paradigm for autonomous driving, which models the driving\nscene as a fully vectorized representation. The proposed vectorized paradigm\nhas two significant advantages. On one hand, VAD exploits the vectorized agent\nmotion and map elements as explicit instance-level planning constraints which\neffectively improves planning safety. On the other hand, VAD runs much faster\nthan previous end-to-end planning methods by getting rid of\ncomputation-intensive rasterized representation and hand-designed\npost-processing steps. VAD achieves state-of-the-art end-to-end planning\nperformance on the nuScenes dataset, outperforming the previous best method by\na large margin. Our base model, VAD-Base, greatly reduces the average collision\nrate by 29.0% and runs 2.5x faster. Besides, a lightweight variant, VAD-Tiny,\ngreatly improves the inference speed (up to 9.3x) while achieving comparable\nplanning performance. We believe the excellent performance and the high\nefficiency of VAD are critical for the real-world deployment of an autonomous\ndriving system. Code and models are available at https://github.com/hustvl/VAD\nfor facilitating future research.\n","authors":["Bo Jiang","Shaoyu Chen","Qing Xu","Bencheng Liao","Jiajie Chen","Helong Zhou","Qian Zhang","Wenyu Liu","Chang Huang","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12077v3.pdf","comment":"Accepted to ICCV 2023. Code&Demos: https://github.com/hustvl/VAD"},{"id":"http://arxiv.org/abs/2306.06494v2","updated":"2023-08-24T07:52:59Z","published":"2023-06-10T17:27:33Z","title":"Multi-modal Pre-training for Medical Vision-language Understanding and\n Generation: An Empirical Study with A New Benchmark","summary":" With the availability of large-scale, comprehensive, and general-purpose\nvision-language (VL) datasets such as MSCOCO, vision-language pre-training\n(VLP) has become an active area of research and proven to be effective for\nvarious VL tasks such as visual-question answering. However, studies on VLP in\nthe medical domain have so far been scanty. To provide a comprehensive\nperspective on VLP for medical VL tasks, we conduct a thorough experimental\nanalysis to study key factors that may affect the performance of VLP with a\nunified vision-language Transformer. To allow making sound and quick\npre-training decisions, we propose RadioGraphy Captions (RGC), a high-quality,\nmulti-modality radiographic dataset containing 18,434 image-caption pairs\ncollected from an open-access online database MedPix. RGC can be used as a\npre-training dataset or a new benchmark for medical report generation and\nmedical image-text retrieval. By utilizing RGC and other available datasets for\npre-training, we develop several key insights that can guide future medical VLP\nresearch and new strong baselines for various medical VL tasks.\n","authors":["Li Xu","Bo Liu","Ameer Hamza Khan","Lu Fan","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2306.06494v2.pdf","comment":"Published as oral paper in CHIL 2023"},{"id":"http://arxiv.org/abs/2308.10438v2","updated":"2023-08-24T07:43:18Z","published":"2023-08-21T03:22:47Z","title":"Efficient Joint Optimization of Layer-Adaptive Weight Pruning in Deep\n Neural Networks","summary":" In this paper, we propose a novel layer-adaptive weight-pruning approach for\nDeep Neural Networks (DNNs) that addresses the challenge of optimizing the\noutput distortion minimization while adhering to a target pruning ratio\nconstraint. Our approach takes into account the collective influence of all\nlayers to design a layer-adaptive pruning scheme. We discover and utilize a\nvery important additivity property of output distortion caused by pruning\nweights on multiple layers. This property enables us to formulate the pruning\nas a combinatorial optimization problem and efficiently solve it through\ndynamic programming. By decomposing the problem into sub-problems, we achieve\nlinear time complexity, making our optimization algorithm fast and feasible to\nrun on CPUs. Our extensive experiments demonstrate the superiority of our\napproach over existing methods on the ImageNet and CIFAR-10 datasets. On\nCIFAR-10, our method achieves remarkable improvements, outperforming others by\nup to 1.0% for ResNet-32, 0.5% for VGG-16, and 0.7% for DenseNet-121 in terms\nof top-1 accuracy. On ImageNet, we achieve up to 4.7% and 4.6% higher top-1\naccuracy compared to other methods for VGG-16 and ResNet-50, respectively.\nThese results highlight the effectiveness and practicality of our approach for\nenhancing DNN performance through layer-adaptive weight pruning. Code will be\navailable on https://github.com/Akimoto-Cris/RD_VIT_PRUNE.\n","authors":["Kaixin Xu","Zhe Wang","Xue Geng","Jie Lin","Min Wu","Xiaoli Li","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2308.10438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.05742v4","updated":"2023-08-24T07:31:59Z","published":"2021-09-13T07:07:23Z","title":"HCDG: A Hierarchical Consistency Framework for Domain Generalization on\n Medical Image Segmentation","summary":" Modern deep neural networks struggle to transfer knowledge and generalize\nacross diverse domains when deployed to real-world applications. Currently,\ndomain generalization (DG) is introduced to learn a universal representation\nfrom multiple domains to improve the network generalization ability on unseen\ndomains. However, previous DG methods only focus on the data-level consistency\nscheme without considering the synergistic regularization among different\nconsistency schemes. In this paper, we present a novel Hierarchical Consistency\nframework for Domain Generalization (HCDG) by integrating Extrinsic Consistency\nand Intrinsic Consistency synergistically. Particularly, for the Extrinsic\nConsistency, we leverage the knowledge across multiple source domains to\nenforce data-level consistency. To better enhance such consistency, we design a\nnovel Amplitude Gaussian-mixing strategy into Fourier-based data augmentation\ncalled DomainUp. For the Intrinsic Consistency, we perform task-level\nconsistency for the same instance under the dual-task scenario. We evaluate the\nproposed HCDG framework on two medical image segmentation tasks, i.e., optic\ncup/disc segmentation on fundus images and prostate MRI segmentation. Extensive\nexperimental results manifest the effectiveness and versatility of our HCDG\nframework.\n","authors":["Yijun Yang","Shujun Wang","Lei Zhu","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2109.05742v4.pdf","comment":"this paper is currently not published"},{"id":"http://arxiv.org/abs/2308.12609v1","updated":"2023-08-24T07:19:59Z","published":"2023-08-24T07:19:59Z","title":"Cross-Video Contextual Knowledge Exploration and Exploitation for\n Ambiguity Reduction in Weakly Supervised Temporal Action Localization","summary":" Weakly supervised temporal action localization (WSTAL) aims to localize\nactions in untrimmed videos using video-level labels. Despite recent advances,\nexisting approaches mainly follow a localization-by-classification pipeline,\ngenerally processing each segment individually, thereby exploiting only limited\ncontextual information. As a result, the model will lack a comprehensive\nunderstanding (e.g. appearance and temporal structure) of various action\npatterns, leading to ambiguity in classification learning and temporal\nlocalization. Our work addresses this from a novel perspective, by exploring\nand exploiting the cross-video contextual knowledge within the dataset to\nrecover the dataset-level semantic structure of action instances via weak\nlabels only, thereby indirectly improving the holistic understanding of\nfine-grained action patterns and alleviating the aforementioned ambiguities.\nSpecifically, an end-to-end framework is proposed, including a Robust\nMemory-Guided Contrastive Learning (RMGCL) module and a Global Knowledge\nSummarization and Aggregation (GKSA) module. First, the RMGCL module explores\nthe contrast and consistency of cross-video action features, assisting in\nlearning more structured and compact embedding space, thus reducing ambiguity\nin classification learning. Further, the GKSA module is used to efficiently\nsummarize and propagate the cross-video representative action knowledge in a\nlearnable manner to promote holistic action patterns understanding, which in\nturn allows the generation of high-confidence pseudo-labels for self-learning,\nthus alleviating ambiguity in temporal localization. Extensive experiments on\nTHUMOS14, ActivityNet1.3, and FineAction demonstrate that our method\noutperforms the state-of-the-art methods, and can be easily plugged into other\nWSTAL methods.\n","authors":["Songchun Zhang","Chunhui Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.12609v1.pdf","comment":"Submitted to TCSVT. 14 pages and 7 figures"},{"id":"http://arxiv.org/abs/2308.12608v1","updated":"2023-08-24T07:19:11Z","published":"2023-08-24T07:19:11Z","title":"HR-Pro: Point-supervised Temporal Action Localization via Hierarchical\n Reliability Propagation","summary":" Point-supervised Temporal Action Localization (PSTAL) is an emerging research\ndirection for label-efficient learning. However, current methods mainly focus\non optimizing the network either at the snippet-level or the instance-level,\nneglecting the inherent reliability of point annotations at both levels. In\nthis paper, we propose a Hierarchical Reliability Propagation (HR-Pro)\nframework, which consists of two reliability-aware stages: Snippet-level\nDiscrimination Learning and Instance-level Completeness Learning, both stages\nexplore the efficient propagation of high-confidence cues in point annotations.\nFor snippet-level learning, we introduce an online-updated memory to store\nreliable snippet prototypes for each class. We then employ a Reliability-aware\nAttention Block to capture both intra-video and inter-video dependencies of\nsnippets, resulting in more discriminative and robust snippet representation.\nFor instance-level learning, we propose a point-based proposal generation\napproach as a means of connecting snippets and instances, which produces\nhigh-confidence proposals for further optimization at the instance level.\nThrough multi-level reliability-aware learning, we obtain more reliable\nconfidence scores and more accurate temporal boundaries of predicted proposals.\nOur HR-Pro achieves state-of-the-art performance on multiple challenging\nbenchmarks, including an impressive average mAP of 60.3% on THUMOS14. Notably,\nour HR-Pro largely surpasses all previous point-supervised methods, and even\noutperforms several competitive fully supervised methods. Code will be\navailable at https://github.com/pipixin321/HR-Pro.\n","authors":["Huaxin Zhang","Xiang Wang","Xiaohao Xu","Zhiwu Qing","Changxin Gao","Nong Sang"],"pdf_url":"https://arxiv.org/pdf/2308.12608v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.10677v2","updated":"2023-08-24T07:18:18Z","published":"2023-08-21T12:24:20Z","title":"Visual Crowd Analysis: Open Research Problems","summary":" Over the last decade, there has been a remarkable surge in interest in\nautomated crowd monitoring within the computer vision community. Modern\ndeep-learning approaches have made it possible to develop fully-automated\nvision-based crowd-monitoring applications. However, despite the magnitude of\nthe issue at hand, the significant technological advancements, and the\nconsistent interest of the research community, there are still numerous\nchallenges that need to be overcome. In this article, we delve into six major\nareas of visual crowd analysis, emphasizing the key developments in each of\nthese areas. We outline the crucial unresolved issues that must be tackled in\nfuture works, in order to ensure that the field of automated crowd monitoring\ncontinues to progress and thrive. Several surveys related to this topic have\nbeen conducted in the past. Nonetheless, this article thoroughly examines and\npresents a more intuitive categorization of works, while also depicting the\nlatest breakthroughs within the field, incorporating more recent studies\ncarried out within the last few years in a concise manner. By carefully\nchoosing prominent works with significant contributions in terms of novelty or\nperformance gains, this paper presents a more comprehensive exposition of\nadvancements in the current state-of-the-art.\n","authors":["Muhammad Asif Khan","Hamid Menouar","Ridha Hamila"],"pdf_url":"https://arxiv.org/pdf/2308.10677v2.pdf","comment":"Accepted in AI Magazine published by Wiley Periodicals LLC on behalf\n of the Association for the Advancement of Artificial Intelligence"},{"id":"http://arxiv.org/abs/2308.12605v1","updated":"2023-08-24T07:11:00Z","published":"2023-08-24T07:11:00Z","title":"APLA: Additional Perturbation for Latent Noise with Adversarial Training\n Enables Consistency","summary":" Diffusion models have exhibited promising progress in video generation.\nHowever, they often struggle to retain consistent details within local regions\nacross frames. One underlying cause is that traditional diffusion models\napproximate Gaussian noise distribution by utilizing predictive noise, without\nfully accounting for the impact of inherent information within the input\nitself. Additionally, these models emphasize the distinction between\npredictions and references, neglecting information intrinsic to the videos. To\naddress this limitation, inspired by the self-attention mechanism, we propose a\nnovel text-to-video (T2V) generation network structure based on diffusion\nmodels, dubbed Additional Perturbation for Latent noise with Adversarial\ntraining (APLA). Our approach only necessitates a single video as input and\nbuilds upon pre-trained stable diffusion networks. Notably, we introduce an\nadditional compact network, known as the Video Generation Transformer (VGT).\nThis auxiliary component is designed to extract perturbations from the inherent\ninformation contained within the input, thereby refining inconsistent pixels\nduring temporal predictions. We leverage a hybrid architecture of transformers\nand convolutions to compensate for temporal intricacies, enhancing consistency\nbetween different frames within the video. Experiments demonstrate a noticeable\nimprovement in the consistency of the generated videos both qualitatively and\nquantitatively.\n","authors":["Yupu Yao","Shangqi Deng","Zihan Cao","Harry Zhang","Liang-Jian Deng"],"pdf_url":"https://arxiv.org/pdf/2308.12605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12604v1","updated":"2023-08-24T07:10:31Z","published":"2023-08-24T07:10:31Z","title":"PromptMRG: Diagnosis-Driven Prompts for Medical Report Generation","summary":" Automatic medical report generation (MRG) is of great research value as it\nhas the potential to relieve radiologists from the heavy burden of report\nwriting. Despite recent advancements, accurate MRG remains challenging due to\nthe need for precise clinical understanding and the identification of clinical\nfindings. Moreover, the imbalanced distribution of diseases makes the challenge\neven more pronounced, as rare diseases are underrepresented in training data,\nmaking their diagnostic performance unreliable. To address these challenges, we\npropose diagnosis-driven prompts for medical report generation (PromptMRG), a\nnovel framework that aims to improve the diagnostic accuracy of MRG with the\nguidance of diagnosis-aware prompts. Specifically, PromptMRG is based on\nencoder-decoder architecture with an extra disease classification branch. When\ngenerating reports, the diagnostic results from the classification branch are\nconverted into token prompts to explicitly guide the generation process. To\nfurther improve the diagnostic accuracy, we design cross-modal feature\nenhancement, which retrieves similar reports from the database to assist the\ndiagnosis of a query image by leveraging the knowledge from a pre-trained CLIP.\nMoreover, the disease imbalanced issue is addressed by applying an adaptive\nlogit-adjusted loss to the classification branch based on the individual\nlearning status of each disease, which overcomes the barrier of text decoder's\ninability to manipulate disease distributions. Experiments on two MRG\nbenchmarks show the effectiveness of the proposed method, where it obtains\nstate-of-the-art clinical efficacy performance on both datasets.\n","authors":["Haibo Jin","Haoxuan Che","Yi Lin","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2308.12604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12600v1","updated":"2023-08-24T07:02:15Z","published":"2023-08-24T07:02:15Z","title":"PoseSync: Robust pose based video synchronization","summary":" Pose based video sychronization can have applications in multiple domains\nsuch as gameplay performance evaluation, choreography or guiding athletes. The\nsubject's actions could be compared and evaluated against those performed by\nprofessionals side by side. In this paper, we propose an end to end pipeline\nfor synchronizing videos based on pose. The first step crops the region where\nthe person present in the image followed by pose detection on the cropped\nimage. This is followed by application of Dynamic Time Warping(DTW) on angle/\ndistance measures between the pose keypoints leading to a scale and shift\ninvariant pose matching pipeline.\n","authors":["Rishit Javia","Falak Shah","Shivam Dave"],"pdf_url":"https://arxiv.org/pdf/2308.12600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12595v1","updated":"2023-08-24T06:50:07Z","published":"2023-08-24T06:50:07Z","title":"Logic-induced Diagnostic Reasoning for Semi-supervised Semantic\n Segmentation","summary":" Recent advances in semi-supervised semantic segmentation have been heavily\nreliant on pseudo labeling to compensate for limited labeled data, disregarding\nthe valuable relational knowledge among semantic concepts. To bridge this gap,\nwe devise LogicDiag, a brand new neural-logic semi-supervised learning\nframework. Our key insight is that conflicts within pseudo labels, identified\nthrough symbolic knowledge, can serve as strong yet commonly ignored learning\nsignals. LogicDiag resolves such conflicts via reasoning with logic-induced\ndiagnoses, enabling the recovery of (potentially) erroneous pseudo labels,\nultimately alleviating the notorious error accumulation problem. We showcase\nthe practical application of LogicDiag in the data-hungry segmentation\nscenario, where we formalize the structured abstraction of semantic concepts as\na set of logic rules. Extensive experiments on three standard semi-supervised\nsemantic segmentation benchmarks demonstrate the effectiveness and generality\nof LogicDiag. Moreover, LogicDiag highlights the promising opportunities\narising from the systematic integration of symbolic reasoning into the\nprevalent statistical, neural learning approaches.\n","authors":["Chen Liang","Wenguan Wang","Jiaxu Miao","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2308.12595v1.pdf","comment":"Accepted to ICCV 2023; Code: https://github.com/leonnnop/LogicDiag"},{"id":"http://arxiv.org/abs/2303.05699v2","updated":"2023-08-24T06:46:13Z","published":"2023-03-10T04:49:01Z","title":"Feature Unlearning for Pre-trained GANs and VAEs","summary":" We tackle the problem of feature unlearning from a pre-trained image\ngenerative model: GANs and VAEs. Unlike a common unlearning task where an\nunlearning target is a subset of the training set, we aim to unlearn a specific\nfeature, such as hairstyle from facial images, from the pre-trained generative\nmodels. As the target feature is only presented in a local region of an image,\nunlearning the entire image from the pre-trained model may result in losing\nother details in the remaining region of the image. To specify which features\nto unlearn, we collect randomly generated images that contain the target\nfeatures. We then identify a latent representation corresponding to the target\nfeature and then use the representation to fine-tune the pre-trained model.\nThrough experiments on MNIST and CelebA datasets, we show that target features\nare successfully removed while keeping the fidelity of the original models.\nFurther experiments with an adversarial attack show that the unlearned model is\nmore robust under the presence of malicious parties.\n","authors":["Saemi Moon","Seunghyuk Cho","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2303.05699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17420v2","updated":"2023-08-24T06:39:18Z","published":"2023-05-27T08:55:56Z","title":"CCDWT-GAN: Generative Adversarial Networks Based on Color Channel Using\n Discrete Wavelet Transform for Document Image Binarization","summary":" To efficiently extract textual information from color degraded document\nimages is a significant research area. The prolonged imperfect preservation of\nancient documents has led to various types of degradation, such as page\nstaining, paper yellowing, and ink bleeding. These types of degradation badly\nimpact the image processing for features extraction. This paper introduces a\nnovelty method employing generative adversarial networks based on color channel\nusing discrete wavelet transform (CCDWT-GAN). The proposed method involves\nthree stages: image preprocessing, image enhancement, and image binarization.\nIn the initial step, we apply discrete wavelet transform (DWT) to retain the\nlow-low (LL) subband image, thereby enhancing image quality. Subsequently, we\ndivide the original input image into four single-channel colors (red, green,\nblue, and gray) to separately train adversarial networks. For the extraction of\nglobal and local features, we utilize the output image from the image\nenhancement stage and the entire input image to train adversarial networks\nindependently, and then combine these two results as the final output. To\nvalidate the positive impact of the image enhancement and binarization stages\non model performance, we conduct an ablation study. This work compares the\nperformance of the proposed method with other state-of-the-art (SOTA) methods\non DIBCO and H-DIBCO ((Handwritten) Document Image Binarization Competition)\ndatasets. The experimental results demonstrate that CCDWT-GAN achieves a top\ntwo performance on multiple benchmark datasets. Notably, on DIBCO 2013 and 2016\ndataset, our method achieves F-measure (FM) values of 95.24 and 91.46,\nrespectively.\n","authors":["Rui-Yang Ju","Yu-Shian Lin","Jen-Shiun Chiang","Chih-Chia Chen","Wei-Han Chen","Chun-Tse Chien"],"pdf_url":"https://arxiv.org/pdf/2305.17420v2.pdf","comment":"accepted by PRICAI 2023"},{"id":"http://arxiv.org/abs/2308.12590v1","updated":"2023-08-24T06:38:33Z","published":"2023-08-24T06:38:33Z","title":"Self-supervised Learning of Implicit Shape Representation with Dense\n Correspondence for Deformable Objects","summary":" Learning 3D shape representation with dense correspondence for deformable\nobjects is a fundamental problem in computer vision. Existing approaches often\nneed additional annotations of specific semantic domain, e.g., skeleton poses\nfor human bodies or animals, which require extra annotation effort and suffer\nfrom error accumulation, and they are limited to specific domain. In this\npaper, we propose a novel self-supervised approach to learn neural implicit\nshape representation for deformable objects, which can represent shapes with a\ntemplate shape and dense correspondence in 3D. Our method does not require the\npriors of skeleton and skinning weight, and only requires a collection of\nshapes represented in signed distance fields. To handle the large deformation,\nwe constrain the learned template shape in the same latent space with the\ntraining shapes, design a new formulation of local rigid constraint that\nenforces rigid transformation in local region and addresses local reflection\nissue, and present a new hierarchical rigid constraint to reduce the ambiguity\ndue to the joint learning of template shape and correspondences. Extensive\nexperiments show that our model can represent shapes with large deformations.\nWe also show that our shape representation can support two typical\napplications, such as texture transfer and shape editing, with competitive\nperformance. The code and models are available at\nhttps://iscas3dv.github.io/deformshape\n","authors":["Baowen Zhang","Jiahe Li","Xiaoming Deng","Yinda Zhang","Cuixia Ma","Hongan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12590v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11911v2","updated":"2023-08-24T06:35:22Z","published":"2023-08-23T04:52:48Z","title":"ACLS: Adaptive and Conditional Label Smoothing for Network Calibration","summary":" We address the problem of network calibration adjusting miscalibrated\nconfidences of deep neural networks. Many approaches to network calibration\nadopt a regularization-based method that exploits a regularization term to\nsmooth the miscalibrated confidences. Although these approaches have shown the\neffectiveness on calibrating the networks, there is still a lack of\nunderstanding on the underlying principles of regularization in terms of\nnetwork calibration. We present in this paper an in-depth analysis of existing\nregularization-based methods, providing a better understanding on how they\naffect to network calibration. Specifically, we have observed that 1) the\nregularization-based methods can be interpreted as variants of label smoothing,\nand 2) they do not always behave desirably. Based on the analysis, we introduce\na novel loss function, dubbed ACLS, that unifies the merits of existing\nregularization methods, while avoiding the limitations. We show extensive\nexperimental results for image classification and semantic segmentation on\nstandard benchmarks, including CIFAR10, Tiny-ImageNet, ImageNet, and PASCAL\nVOC, demonstrating the effectiveness of our loss function.\n","authors":["Hyekang Park","Jongyoun Noh","Youngmin Oh","Donghyeon Baek","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2308.11911v2.pdf","comment":"Accepted to ICCV 2023 (Oral presentation)"},{"id":"http://arxiv.org/abs/2305.10856v2","updated":"2023-08-24T06:33:09Z","published":"2023-05-18T10:18:59Z","title":"Towards an Accurate and Secure Detector against Adversarial\n Perturbations","summary":" The vulnerability of deep neural networks to adversarial perturbations has\nbeen widely perceived in the computer vision community. From a security\nperspective, it poses a critical risk for modern vision systems, e.g., the\npopular Deep Learning as a Service (DLaaS) frameworks. For protecting\noff-the-shelf deep models while not modifying them, current algorithms\ntypically detect adversarial patterns through discriminative decomposition of\nnatural-artificial data. However, these decompositions are biased towards\nfrequency or spatial discriminability, thus failing to capture adversarial\npatterns comprehensively. More seriously, successful defense-aware (secondary)\nadversarial attack (i.e., evading the detector as well as fooling the model) is\npractical under the assumption that the adversary is fully aware of the\ndetector (i.e., the Kerckhoffs's principle). Motivated by such facts, we\npropose an accurate and secure adversarial example detector, relying on a\nspatial-frequency discriminative decomposition with secret keys. It expands the\nabove works on two aspects: 1) the introduced Krawtchouk basis provides better\nspatial-frequency discriminability and thereby is more suitable for capturing\nadversarial patterns than the common trigonometric or wavelet basis; 2) the\nextensive parameters for decomposition are generated by a pseudo-random\nfunction with secret keys, hence blocking the defense-aware adversarial attack.\nTheoretical and numerical analysis demonstrates the increased accuracy and\nsecurity of our detector with respect to a number of state-of-the-art\nalgorithms.\n","authors":["Chao Wang","Shuren Qi","Zhiqiu Huang","Yushu Zhang","Rushi Lan","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2305.10856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13226v2","updated":"2023-08-24T06:29:35Z","published":"2023-07-25T03:30:09Z","title":"Strivec: Sparse Tri-Vector Radiance Fields","summary":" We propose Strivec, a novel neural representation that models a 3D scene as a\nradiance field with sparsely distributed and compactly factorized local tensor\nfeature grids. Our approach leverages tensor decomposition, following the\nrecent work TensoRF, to model the tensor grids. In contrast to TensoRF which\nuses a global tensor and focuses on their vector-matrix decomposition, we\npropose to utilize a cloud of local tensors and apply the classic\nCANDECOMP/PARAFAC (CP) decomposition to factorize each tensor into triple\nvectors that express local feature distributions along spatial axes and\ncompactly encode a local neural field. We also apply multi-scale tensor grids\nto discover the geometry and appearance commonalities and exploit spatial\ncoherence with the tri-vector factorization at multiple local scales. The final\nradiance field properties are regressed by aggregating neural features from\nmultiple local tensors across all scales. Our tri-vector tensors are sparsely\ndistributed around the actual scene surface, discovered by a fast coarse\nreconstruction, leveraging the sparsity of a 3D scene. We demonstrate that our\nmodel can achieve better rendering quality while using significantly fewer\nparameters than previous methods, including TensoRF and Instant-NGP.\n","authors":["Quankai Gao","Qiangeng Xu","Hao Su","Ulrich Neumann","Zexiang Xu"],"pdf_url":"https://arxiv.org/pdf/2307.13226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01805v2","updated":"2023-08-24T06:28:02Z","published":"2023-01-04T20:08:23Z","title":"Unsupervised Manifold Linearizing and Clustering","summary":" We consider the problem of simultaneously clustering and learning a linear\nrepresentation of data lying close to a union of low-dimensional manifolds, a\nfundamental task in machine learning and computer vision. When the manifolds\nare assumed to be linear subspaces, this reduces to the classical problem of\nsubspace clustering, which has been studied extensively over the past two\ndecades. Unfortunately, many real-world datasets such as natural images can not\nbe well approximated by linear subspaces. On the other hand, numerous works\nhave attempted to learn an appropriate transformation of the data, such that\ndata is mapped from a union of general non-linear manifolds to a union of\nlinear subspaces (with points from the same manifold being mapped to the same\nsubspace). However, many existing works have limitations such as assuming\nknowledge of the membership of samples to clusters, requiring high sampling\ndensity, or being shown theoretically to learn trivial representations. In this\npaper, we propose to optimize the Maximal Coding Rate Reduction metric with\nrespect to both the data representation and a novel doubly stochastic cluster\nmembership, inspired by state-of-the-art subspace clustering results. We give a\nparameterization of such a representation and membership, allowing efficient\nmini-batching and one-shot initialization. Experiments on CIFAR-10, -20, -100,\nand TinyImageNet-200 datasets show that the proposed method is much more\naccurate and scalable than state-of-the-art deep clustering methods, and\nfurther learns a latent linear representation of the data.\n","authors":["Tianjiao Ding","Shengbang Tong","Kwan Ho Ryan Chan","Xili Dai","Yi Ma","Benjamin D. Haeffele"],"pdf_url":"https://arxiv.org/pdf/2301.01805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12587v1","updated":"2023-08-24T06:25:20Z","published":"2023-08-24T06:25:20Z","title":"Grounded Entity-Landmark Adaptive Pre-training for Vision-and-Language\n Navigation","summary":" Cross-modal alignment is one key challenge for Vision-and-Language Navigation\n(VLN). Most existing studies concentrate on mapping the global instruction or\nsingle sub-instruction to the corresponding trajectory. However, another\ncritical problem of achieving fine-grained alignment at the entity level is\nseldom considered. To address this problem, we propose a novel Grounded\nEntity-Landmark Adaptive (GELA) pre-training paradigm for VLN tasks. To achieve\nthe adaptive pre-training paradigm, we first introduce grounded entity-landmark\nhuman annotations into the Room-to-Room (R2R) dataset, named GEL-R2R.\nAdditionally, we adopt three grounded entity-landmark adaptive pre-training\nobjectives: 1) entity phrase prediction, 2) landmark bounding box prediction,\nand 3) entity-landmark semantic alignment, which explicitly supervise the\nlearning of fine-grained cross-modal alignment between entity phrases and\nenvironment landmarks. Finally, we validate our model on two downstream\nbenchmarks: VLN with descriptive instructions (R2R) and dialogue instructions\n(CVDN). The comprehensive experiments show that our GELA model achieves\nstate-of-the-art results on both tasks, demonstrating its effectiveness and\ngeneralizability.\n","authors":["Yibo Cui","Liang Xie","Yakun Zhang","Meishan Zhang","Ye Yan","Erwei Yin"],"pdf_url":"https://arxiv.org/pdf/2308.12587v1.pdf","comment":"ICCV 2023 Oral"},{"id":"http://arxiv.org/abs/2301.09091v3","updated":"2023-08-24T06:18:42Z","published":"2023-01-22T10:17:02Z","title":"BallGAN: 3D-aware Image Synthesis with a Spherical Background","summary":" 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be\nrendered in arbitrary perspectives to produce images. Although previous methods\nproduce realistic images, they suffer from unstable training or degenerate\nsolutions where the 3D geometry is unnatural. We hypothesize that the 3D\ngeometry is underdetermined due to the insufficient constraint, i.e., being\nclassified as real image to the discriminator is not enough. To solve this\nproblem, we propose to approximate the background as a spherical surface and\nrepresent a scene as a union of the foreground placed in the sphere and the\nthin spherical background. It reduces the degree of freedom in the background\nfield. Accordingly, we modify the volume rendering equation and incorporate\ndedicated constraints to design a novel 3D-aware GAN framework named BallGAN.\nBallGAN has multiple advantages as follows. 1) It produces more reasonable 3D\ngeometry; the images of a scene across different viewpoints have better\nphotometric consistency and fidelity than the state-of-the-art methods. 2) The\ntraining becomes much more stable. 3) The foreground can be separately rendered\non top of different arbitrary backgrounds.\n","authors":["Minjung Shin","Yunji Seo","Jeongmin Bae","Young Sun Choi","Hyunsu Kim","Hyeran Byun","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2301.09091v3.pdf","comment":"ICCV 2023, Project Page: https://minjung-s.github.io/ballgan"},{"id":"http://arxiv.org/abs/2308.12584v1","updated":"2023-08-24T06:12:41Z","published":"2023-08-24T06:12:41Z","title":"LORD: Leveraging Open-Set Recognition with Unknown Data","summary":" Handling entirely unknown data is a challenge for any deployed classifier.\nClassification models are typically trained on a static pre-defined dataset and\nare kept in the dark for the open unassigned feature space. As a result, they\nstruggle to deal with out-of-distribution data during inference. Addressing\nthis task on the class-level is termed open-set recognition (OSR). However,\nmost OSR methods are inherently limited, as they train closed-set classifiers\nand only adapt the downstream predictions to OSR. This work presents LORD, a\nframework to Leverage Open-set Recognition by exploiting unknown Data. LORD\nexplicitly models open space during classifier training and provides a\nsystematic evaluation for such approaches. We identify three model-agnostic\ntraining strategies that exploit background data and applied them to\nwell-established classifiers. Due to LORD's extensive evaluation protocol, we\nconsistently demonstrate improved recognition of unknown data. The benchmarks\nfacilitate in-depth analysis across various requirement levels. To mitigate\ndependency on extensive and costly background datasets, we explore mixup as an\noff-the-shelf data generation technique. Our experiments highlight mixup's\neffectiveness as a substitute for background datasets. Lightweight constraints\non mixup synthesis further improve OSR performance.\n","authors":["Tobias Koch","Christian Riess","Thomas Köhler"],"pdf_url":"https://arxiv.org/pdf/2308.12584v1.pdf","comment":"Accepted at ICCV 2023 Workshop (Out-Of-Distribution Generalization in\n Computer Vision)"},{"id":"http://arxiv.org/abs/2308.12577v1","updated":"2023-08-24T05:32:29Z","published":"2023-08-24T05:32:29Z","title":"REB: Reducing Biases in Representation for Industrial Anomaly Detection","summary":" Existing K-nearest neighbor (KNN) retrieval-based methods usually conduct\nindustrial anomaly detection in two stages: obtain feature representations with\na pre-trained CNN model and perform distance measures for defect detection.\nHowever, the features are not fully exploited as they ignore domain bias and\nthe difference of local density in feature space, which limits the detection\nperformance. In this paper, we propose Reducing Biases (REB) in representation\nby considering the domain bias of the pre-trained model and building a\nself-supervised learning task for better domain adaption with a defect\ngeneration strategy (DefectMaker) imitating the natural defects. Additionally,\nwe propose a local density KNN (LDKNN) to reduce the local density bias and\nobtain effective anomaly detection. We achieve a promising result of 99.5\\%\nAUROC on the widely used MVTec AD benchmark. We also achieve 88.0\\% AUROC on\nthe challenging MVTec LOCO AD dataset and bring an improvement of 4.7\\% AUROC\nto the state-of-the-art result. All results are obtained with smaller backbone\nnetworks such as Vgg11 and Resnet18, which indicates the effectiveness and\nefficiency of REB for practical industrial applications.\n","authors":["Shuai Lyu","Dongmei Mo","Waikeung Wong"],"pdf_url":"https://arxiv.org/pdf/2308.12577v1.pdf","comment":"11 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.12570v1","updated":"2023-08-24T05:22:43Z","published":"2023-08-24T05:22:43Z","title":"StreamMapNet: Streaming Mapping Network for Vectorized Online HD Map\n Construction","summary":" High-Definition (HD) maps are essential for the safety of autonomous driving\nsystems. While existing techniques employ camera images and onboard sensors to\ngenerate vectorized high-precision maps, they are constrained by their reliance\non single-frame input. This approach limits their stability and performance in\ncomplex scenarios such as occlusions, largely due to the absence of temporal\ninformation. Moreover, their performance diminishes when applied to broader\nperception ranges. In this paper, we present StreamMapNet, a novel online\nmapping pipeline adept at long-sequence temporal modeling of videos.\nStreamMapNet employs multi-point attention and temporal information which\nempowers the construction of large-range local HD maps with high stability and\nfurther addresses the limitations of existing methods. Furthermore, we\ncritically examine widely used online HD Map construction benchmark and\ndatasets, Argoverse2 and nuScenes, revealing significant bias in the existing\nevaluation protocols. We propose to resplit the benchmarks according to\ngeographical spans, promoting fair and precise evaluations. Experimental\nresults validate that StreamMapNet significantly outperforms existing methods\nacross all settings while maintaining an online inference speed of $14.2$ FPS.\n","authors":["Tianyuan Yuan","Yicheng Liu","Yue Wang","Yilun Wang","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.12570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11482v2","updated":"2023-08-24T05:14:34Z","published":"2023-07-21T10:36:05Z","title":"R2Det: Redemption from Range-view for Accurate 3D Object Detection","summary":" LiDAR-based 3D object detection is of paramount importance for autonomous\ndriving. Recent trends show a remarkable improvement for bird's-eye-view (BEV)\nbased and point-based methods as they demonstrate superior performance compared\nto range-view counterparts. This paper presents an insight that leverages\nrange-view representation to enhance 3D points for accurate 3D object\ndetection. Specifically, we introduce a Redemption from Range-view Module\n(R2M), a plug-and-play approach for 3D surface texture enhancement from the 2D\nrange view to the 3D point view. R2M comprises BasicBlock for 2D feature\nextraction, Hierarchical-dilated (HD) Meta Kernel for expanding the 3D\nreceptive field, and Feature Points Redemption (FPR) for recovering 3D surface\ntexture information. R2M can be seamlessly integrated into state-of-the-art\nLiDAR-based 3D object detectors as preprocessing and achieve appealing\nimprovement, e.g., 1.39%, 1.67%, and 1.97% mAP improvement on easy, moderate,\nand hard difficulty level of KITTI val set, respectively. Based on R2M, we\nfurther propose R2Detector (R2Det) with the Synchronous-Grid RoI Pooling for\naccurate box refinement. R2Det outperforms existing range-view-based methods by\na significant margin on both the KITTI benchmark and the Waymo Open Dataset.\nCodes will be made publicly available.\n","authors":["Yihan Wang","Qiao Yan","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12560v1","updated":"2023-08-24T05:00:07Z","published":"2023-08-24T05:00:07Z","title":"NOVA: NOvel View Augmentation for Neural Composition of Dynamic Objects","summary":" We propose a novel-view augmentation (NOVA) strategy to train NeRFs for\nphoto-realistic 3D composition of dynamic objects in a static scene. Compared\nto prior work, our framework significantly reduces blending artifacts when\ninserting multiple dynamic objects into a 3D scene at novel views and times;\nachieves comparable PSNR without the need for additional ground truth\nmodalities like optical flow; and overall provides ease, flexibility, and\nscalability in neural composition. Our codebase is on GitHub.\n","authors":["Dakshit Agrawal","Jiajie Xu","Siva Karthik Mustikovela","Ioannis Gkioulekas","Ashish Shrivastava","Yuning Chai"],"pdf_url":"https://arxiv.org/pdf/2308.12560v1.pdf","comment":"Accepted for publication in ICCV Computer Vision for Metaverse\n Workshop 2023 (code is available at https://github.com/dakshitagrawal/NoVA)"},{"id":"http://arxiv.org/abs/2308.12558v1","updated":"2023-08-24T04:52:32Z","published":"2023-08-24T04:52:32Z","title":"Hyperbolic Audio-visual Zero-shot Learning","summary":" Audio-visual zero-shot learning aims to classify samples consisting of a pair\nof corresponding audio and video sequences from classes that are not present\nduring training. An analysis of the audio-visual data reveals a large degree of\nhyperbolicity, indicating the potential benefit of using a hyperbolic\ntransformation to achieve curvature-aware geometric learning, with the aim of\nexploring more complex hierarchical data structures for this task. The proposed\napproach employs a novel loss function that incorporates cross-modality\nalignment between video and audio features in the hyperbolic space.\nAdditionally, we explore the use of multiple adaptive curvatures for hyperbolic\nprojections. The experimental results on this very challenging task demonstrate\nthat our proposed hyperbolic approach for zero-shot learning outperforms the\nSOTA method on three datasets: VGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL\nachieving a harmonic mean (HM) improvement of around 3.0%, 7.0%, and 5.3%,\nrespectively.\n","authors":["Jie Hong","Zeeshan Hayder","Junlin Han","Pengfei Fang","Mehrtash Harandi","Lars Petersson"],"pdf_url":"https://arxiv.org/pdf/2308.12558v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12907v4","updated":"2023-08-24T04:42:35Z","published":"2023-07-24T16:02:42Z","title":"GridMM: Grid Memory Map for Vision-and-Language Navigation","summary":" Vision-and-language navigation (VLN) enables the agent to navigate to a\nremote location following the natural language instruction in 3D environments.\nTo represent the previously visited environment, most approaches for VLN\nimplement memory using recurrent states, topological maps, or top-down semantic\nmaps. In contrast to these approaches, we build the top-down egocentric and\ndynamically growing Grid Memory Map (i.e., GridMM) to structure the visited\nenvironment. From a global perspective, historical observations are projected\ninto a unified grid map in a top-down view, which can better represent the\nspatial relations of the environment. From a local perspective, we further\npropose an instruction relevance aggregation method to capture fine-grained\nvisual clues in each grid region. Extensive experiments are conducted on both\nthe REVERIE, R2R, SOON datasets in the discrete environments, and the R2R-CE\ndataset in the continuous environments, showing the superiority of our proposed\nmethod.\n","authors":["Zihan Wang","Xiangyang Li","Jiahao Yang","Yeqi Liu","Shuqiang Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.12907v4.pdf","comment":"Accepted by ICCV 2023. The code is available at\n https://github.com/MrZihan/GridMM"},{"id":"http://arxiv.org/abs/2212.01735v4","updated":"2023-08-24T04:39:38Z","published":"2022-12-04T03:45:08Z","title":"Neural Fourier Filter Bank","summary":" We present a novel method to provide efficient and highly detailed\nreconstructions. Inspired by wavelets, we learn a neural field that decompose\nthe signal both spatially and frequency-wise. We follow the recent grid-based\nparadigm for spatial decomposition, but unlike existing work, encourage\nspecific frequencies to be stored in each grid via Fourier features encodings.\nWe then apply a multi-layer perceptron with sine activations, taking these\nFourier encoded features in at appropriate layers so that higher-frequency\ncomponents are accumulated on top of lower-frequency components sequentially,\nwhich we sum up to form the final output. We demonstrate that our method\noutperforms the state of the art regarding model compactness and convergence\nspeed on multiple tasks: 2D image fitting, 3D shape reconstruction, and neural\nradiance fields. Our code is available at https://github.com/ubc-vision/NFFB.\n","authors":["Zhijie Wu","Yuhe Jin","Kwang Moo Yi"],"pdf_url":"https://arxiv.org/pdf/2212.01735v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12549v1","updated":"2023-08-24T04:28:08Z","published":"2023-08-24T04:28:08Z","title":"Synchronize Feature Extracting and Matching: A Single Branch Framework\n for 3D Object Tracking","summary":" Siamese network has been a de facto benchmark framework for 3D LiDAR object\ntracking with a shared-parametric encoder extracting features from template and\nsearch region, respectively. This paradigm relies heavily on an additional\nmatching network to model the cross-correlation/similarity of the template and\nsearch region. In this paper, we forsake the conventional Siamese paradigm and\npropose a novel single-branch framework, SyncTrack, synchronizing the feature\nextracting and matching to avoid forwarding encoder twice for template and\nsearch region as well as introducing extra parameters of matching network. The\nsynchronization mechanism is based on the dynamic affinity of the Transformer,\nand an in-depth analysis of the relevance is provided theoretically. Moreover,\nbased on the synchronization, we introduce a novel Attentive Points-Sampling\nstrategy into the Transformer layers (APST), replacing the random/Farthest\nPoints Sampling (FPS) method with sampling under the supervision of attentive\nrelations between the template and search region. It implies connecting\npoint-wise sampling with the feature learning, beneficial to aggregating more\ndistinctive and geometric features for tracking with sparse points. Extensive\nexperiments on two benchmark datasets (KITTI and NuScenes) show that SyncTrack\nachieves state-of-the-art performance in real-time tracking.\n","authors":["Teli Ma","Mengmeng Wang","Jimin Xiao","Huifeng Wu","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12549v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2212.02011v2","updated":"2023-08-24T04:21:17Z","published":"2022-12-05T03:53:51Z","title":"PointCaM: Cut-and-Mix for Open-Set Point Cloud Learning","summary":" Point cloud learning is receiving increasing attention, however, most\nexisting point cloud models lack the practical ability to deal with the\nunavoidable presence of unknown objects. This paper mainly discusses point\ncloud learning under open-set settings, where we train the model without data\nfrom unknown classes and identify them in the inference stage. Basically, we\npropose to solve open-set point cloud learning using a novel Point Cut-and-Mix\nmechanism consisting of Unknown-Point Simulator and Unknown-Point Estimator\nmodules. Specifically, we use the Unknown-Point Simulator to simulate\nout-of-distribution data in the training stage by manipulating the geometric\ncontext of partial known data. Based on this, the Unknown-Point Estimator\nmodule learns to exploit the point cloud's feature context for discriminating\nthe known and unknown data. Extensive experiments show the plausibility of\nopen-set point cloud learning and the effectiveness of our proposed solutions.\nOur code is available at \\url{https://github.com/ShiQiu0419/pointcam}.\n","authors":["Jie Hong","Shi Qiu","Weihao Li","Saeed Anwar","Mehrtash Harandi","Nick Barnes","Lars Petersson"],"pdf_url":"https://arxiv.org/pdf/2212.02011v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12547v1","updated":"2023-08-24T04:20:20Z","published":"2023-08-24T04:20:20Z","title":"Hybrid Models for Facial Emotion Recognition in Children","summary":" This paper focuses on the use of emotion recognition techniques to assist\npsychologists in performing children's therapy through remotely robot operated\nsessions. In the field of psychology, the use of agent-mediated therapy is\ngrowing increasingly given recent advances in robotics and computer science.\nSpecifically, the use of Embodied Conversational Agents (ECA) as an\nintermediary tool can help professionals connect with children who face social\nchallenges such as Attention Deficit Hyperactivity Disorder (ADHD), Autism\nSpectrum Disorder (ASD) or even who are physically unavailable due to being in\nregions of armed conflict, natural disasters, or other circumstances. In this\ncontext, emotion recognition represents an important feedback for the\npsychotherapist. In this article, we initially present the result of a\nbibliographical research associated with emotion recognition in children. This\nresearch revealed an initial overview on algorithms and datasets widely used by\nthe community. Then, based on the analysis carried out on the results of the\nbibliographical research, we used the technique of dense optical flow features\nto improve the ability of identifying emotions in children in uncontrolled\nenvironments. From the output of a hybrid model of Convolutional Neural\nNetwork, two intermediary features are fused before being processed by a final\nclassifier. The proposed architecture was called HybridCNNFusion. Finally, we\npresent the initial results achieved in the recognition of children's emotions\nusing a dataset of Brazilian children.\n","authors":["Rafael Zimmer","Marcos Sobral","Helio Azevedo"],"pdf_url":"https://arxiv.org/pdf/2308.12547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.13310v4","updated":"2023-08-24T04:18:17Z","published":"2022-03-24T19:28:54Z","title":"MonoDETR: Depth-guided Transformer for Monocular 3D Object Detection","summary":" Monocular 3D object detection has long been a challenging task in autonomous\ndriving. Most existing methods follow conventional 2D detectors to first\nlocalize object centers, and then predict 3D attributes by neighboring\nfeatures. However, only using local visual features is insufficient to\nunderstand the scene-level 3D spatial structures and ignores the long-range\ninter-object depth relations. In this paper, we introduce the first DETR\nframework for Monocular DEtection with a depth-guided TRansformer, named\nMonoDETR. We modify the vanilla transformer to be depth-aware and guide the\nwhole detection process by contextual depth cues. Specifically, concurrent to\nthe visual encoder that captures object appearances, we introduce to predict a\nforeground depth map, and specialize a depth encoder to extract non-local depth\nembeddings. Then, we formulate 3D object candidates as learnable queries and\npropose a depth-guided decoder to conduct object-scene depth interactions. In\nthis way, each object query estimates its 3D attributes adaptively from the\ndepth-guided regions on the image and is no longer constrained to local visual\nfeatures. On KITTI benchmark with monocular images as input, MonoDETR achieves\nstate-of-the-art performance and requires no extra dense depth annotations.\nBesides, our depth-guided modules can also be plug-and-play to enhance\nmulti-view 3D object detectors on nuScenes dataset, demonstrating our superior\ngeneralization capacity. Code is available at\nhttps://github.com/ZrrSkywalker/MonoDETR.\n","authors":["Renrui Zhang","Han Qiu","Tai Wang","Ziyu Guo","Xuanzhuo Xu","Ziteng Cui","Yu Qiao","Peng Gao","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2203.13310v4.pdf","comment":"Accepted by ICCV 2023. Code is available at\n https://github.com/ZrrSkywalker/MonoDETR"},{"id":"http://arxiv.org/abs/2303.11086v2","updated":"2023-08-24T03:53:35Z","published":"2023-03-20T13:20:14Z","title":"Pluralistic Aging Diffusion Autoencoder","summary":" Face aging is an ill-posed problem because multiple plausible aging patterns\nmay correspond to a given input. Most existing methods often produce one\ndeterministic estimation. This paper proposes a novel CLIP-driven Pluralistic\nAging Diffusion Autoencoder (PADA) to enhance the diversity of aging patterns.\nFirst, we employ diffusion models to generate diverse low-level aging details\nvia a sequential denoising reverse process. Second, we present Probabilistic\nAging Embedding (PAE) to capture diverse high-level aging patterns, which\nrepresents age information as probabilistic distributions in the common CLIP\nlatent space. A text-guided KL-divergence loss is designed to guide this\nlearning. Our method can achieve pluralistic face aging conditioned on\nopen-world aging texts and arbitrary unseen face images. Qualitative and\nquantitative experiments demonstrate that our method can generate more diverse\nand high-quality plausible aging results.\n","authors":["Peipei Li","Rui Wang","Huaibo Huang","Ran He","Zhaofeng He"],"pdf_url":"https://arxiv.org/pdf/2303.11086v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12538v1","updated":"2023-08-24T03:50:37Z","published":"2023-08-24T03:50:37Z","title":"Mutual-Guided Dynamic Network for Image Fusion","summary":" Image fusion aims to generate a high-quality image from multiple images\ncaptured under varying conditions. The key problem of this task is to preserve\ncomplementary information while filtering out irrelevant information for the\nfused result. However, existing methods address this problem by leveraging\nstatic convolutional neural networks (CNNs), suffering two inherent limitations\nduring feature extraction, i.e., being unable to handle spatial-variant\ncontents and lacking guidance from multiple inputs. In this paper, we propose a\nnovel mutual-guided dynamic network (MGDN) for image fusion, which allows for\neffective information utilization across different locations and inputs.\nSpecifically, we design a mutual-guided dynamic filter (MGDF) for adaptive\nfeature extraction, composed of a mutual-guided cross-attention (MGCA) module\nand a dynamic filter predictor, where the former incorporates additional\nguidance from different inputs and the latter generates spatial-variant kernels\nfor different locations. In addition, we introduce a parallel feature fusion\n(PFF) module to effectively fuse local and global information of the extracted\nfeatures. To further reduce the redundancy among the extracted features while\nsimultaneously preserving their shared structural information, we devise a\nnovel loss function that combines the minimization of normalized mutual\ninformation (NMI) with an estimated gradient mask. Experimental results on five\nbenchmark datasets demonstrate that our proposed method outperforms existing\nmethods on four image fusion tasks. The code and model are publicly available\nat: https://github.com/Guanys-dar/MGDN.\n","authors":["Yuanshen Guan","Ruikang Xu","Mingde Yao","Lizhi Wang","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.12538v1.pdf","comment":"ACMMM 2023 accepted"},{"id":"http://arxiv.org/abs/2308.12537v1","updated":"2023-08-24T03:47:27Z","published":"2023-08-24T03:47:27Z","title":"HuBo-VLM: Unified Vision-Language Model designed for HUman roBOt\n interaction tasks","summary":" Human robot interaction is an exciting task, which aimed to guide robots\nfollowing instructions from human. Since huge gap lies between human natural\nlanguage and machine codes, end to end human robot interaction models is fair\nchallenging. Further, visual information receiving from sensors of robot is\nalso a hard language for robot to perceive. In this work, HuBo-VLM is proposed\nto tackle perception tasks associated with human robot interaction including\nobject detection and visual grounding by a unified transformer based vision\nlanguage model. Extensive experiments on the Talk2Car benchmark demonstrate the\neffectiveness of our approach. Code would be publicly available in\nhttps://github.com/dzcgaara/HuBo-VLM.\n","authors":["Zichao Dong","Weikun Zhang","Xufeng Huang","Hang Ji","Xin Zhan","Junbo Chen"],"pdf_url":"https://arxiv.org/pdf/2308.12537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12535v1","updated":"2023-08-24T03:44:05Z","published":"2023-08-24T03:44:05Z","title":"SCP: Spherical-Coordinate-based Learned Point Cloud Compression","summary":" In recent years, the task of learned point cloud compression has gained\nprominence. An important type of point cloud, the spinning LiDAR point cloud,\nis generated by spinning LiDAR on vehicles. This process results in numerous\ncircular shapes and azimuthal angle invariance features within the point\nclouds. However, these two features have been largely overlooked by previous\nmethodologies. In this paper, we introduce a model-agnostic method called\nSpherical-Coordinate-based learned Point cloud compression (SCP), designed to\nleverage the aforementioned features fully. Additionally, we propose a\nmulti-level Octree for SCP to mitigate the reconstruction error for distant\nareas within the Spherical-coordinate-based Octree. SCP exhibits excellent\nuniversality, making it applicable to various learned point cloud compression\ntechniques. Experimental results demonstrate that SCP surpasses previous\nstate-of-the-art methods by up to 29.14% in point-to-point PSNR BD-Rate.\n","authors":["Ao Luo","Linxin Song","Keisuke Nonaka","Kyohei Unno","Heming Sun","Masayuki Goto","Jiro Katto"],"pdf_url":"https://arxiv.org/pdf/2308.12535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12534v1","updated":"2023-08-24T03:43:47Z","published":"2023-08-24T03:43:47Z","title":"Channel and Spatial Relation-Propagation Network for RGB-Thermal\n Semantic Segmentation","summary":" RGB-Thermal (RGB-T) semantic segmentation has shown great potential in\nhandling low-light conditions where RGB-based segmentation is hindered by poor\nRGB imaging quality. The key to RGB-T semantic segmentation is to effectively\nleverage the complementarity nature of RGB and thermal images. Most existing\nalgorithms fuse RGB and thermal information in feature space via concatenation,\nelement-wise summation, or attention operations in either unidirectional\nenhancement or bidirectional aggregation manners. However, they usually\noverlook the modality gap between RGB and thermal images during feature fusion,\nresulting in modality-specific information from one modality contaminating the\nother. In this paper, we propose a Channel and Spatial Relation-Propagation\nNetwork (CSRPNet) for RGB-T semantic segmentation, which propagates only\nmodality-shared information across different modalities and alleviates the\nmodality-specific information contamination issue. Our CSRPNet first performs\nrelation-propagation in channel and spatial dimensions to capture the\nmodality-shared features from the RGB and thermal features. CSRPNet then\naggregates the modality-shared features captured from one modality with the\ninput feature from the other modality to enhance the input feature without the\ncontamination issue. While being fused together, the enhanced RGB and thermal\nfeatures will be also fed into the subsequent RGB or thermal feature extraction\nlayers for interactive feature fusion, respectively. We also introduce a\ndual-path cascaded feature refinement module that aggregates multi-layer\nfeatures to produce two refined features for semantic and boundary prediction.\nExtensive experimental results demonstrate that CSRPNet performs favorably\nagainst state-of-the-art algorithms.\n","authors":["Zikun Zhou","Shukun Wu","Guoqing Zhu","Hongpeng Wang","Zhenyu He"],"pdf_url":"https://arxiv.org/pdf/2308.12534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12532v1","updated":"2023-08-24T03:43:02Z","published":"2023-08-24T03:43:02Z","title":"FedSoL: Bridging Global Alignment and Local Generality in Federated\n Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclient data distributions are heterogeneous. Many previous FL algorithms have\naddressed this issue by introducing various proximal restrictions. These\nrestrictions aim to encourage global alignment by constraining the deviation of\nlocal learning from the global objective. However, they inherently limit local\nlearning by interfering with the original local objectives. Recently, an\nalternative approach has emerged to improve local learning generality. By\nobtaining local models within a smooth loss landscape, this approach mitigates\nconflicts among different local objectives of the clients. Yet, it does not\nensure stable global alignment, as local learning does not take the global\nobjective into account. In this study, we propose Federated Stability on\nLearning (FedSoL), which combines both the concepts of global alignment and\nlocal generality. In FedSoL, the local learning seeks a parameter region robust\nagainst proximal perturbations. This strategy introduces an implicit proximal\nrestriction effect in local learning while maintaining the original local\nobjective for parameter update. Our experiments show that FedSoL consistently\nachieves state-of-the-art performance on various setups.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12530v1","updated":"2023-08-24T03:40:16Z","published":"2023-08-24T03:40:16Z","title":"SieveNet: Selecting Point-Based Features for Mesh Networks","summary":" Meshes are widely used in 3D computer vision and graphics, but their\nirregular topology poses challenges in applying them to existing neural network\narchitectures. Recent advances in mesh neural networks turn to remeshing and\npush the boundary of pioneer methods that solely take the raw meshes as input.\nAlthough the remeshing offers a regular topology that significantly facilitates\nthe design of mesh network architectures, features extracted from such remeshed\nproxies may struggle to retain the underlying geometry faithfully, limiting the\nsubsequent neural network's capacity. To address this issue, we propose\nSieveNet, a novel paradigm that takes into account both the regular topology\nand the exact geometry. Specifically, this method utilizes structured mesh\ntopology from remeshing and accurate geometric information from\ndistortion-aware point sampling on the surface of the original mesh.\nFurthermore, our method eliminates the need for hand-crafted feature\nengineering and can leverage off-the-shelf network architectures such as the\nvision transformer. Comprehensive experimental results on classification and\nsegmentation tasks well demonstrate the effectiveness and superiority of our\nmethod.\n","authors":["Shengchao Yuan","Yishun Dou","Rui Shi","Bingbing Ni","Zhong Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.12530v1.pdf","comment":"The project homepage is https://sievenet.github.io/"},{"id":"http://arxiv.org/abs/2308.11877v2","updated":"2023-08-24T03:38:31Z","published":"2023-08-23T02:49:22Z","title":"Integrated Image and Location Analysis for Wound Classification: A Deep\n Learning Approach","summary":" The global burden of acute and chronic wounds presents a compelling case for\nenhancing wound classification methods, a vital step in diagnosing and\ndetermining optimal treatments. Recognizing this need, we introduce an\ninnovative multi-modal network based on a deep convolutional neural network for\ncategorizing wounds into four categories: diabetic, pressure, surgical, and\nvenous ulcers. Our multi-modal network uses wound images and their\ncorresponding body locations for more precise classification. A unique aspect\nof our methodology is incorporating a body map system that facilitates accurate\nwound location tagging, improving upon traditional wound image classification\ntechniques. A distinctive feature of our approach is the integration of models\nsuch as VGG16, ResNet152, and EfficientNet within a novel architecture. This\narchitecture includes elements like spatial and channel-wise\nSqueeze-and-Excitation modules, Axial Attention, and an Adaptive Gated\nMulti-Layer Perceptron, providing a robust foundation for classification. Our\nmulti-modal network was trained and evaluated on two distinct datasets\ncomprising relevant images and corresponding location information. Notably, our\nproposed network outperformed traditional methods, reaching an accuracy range\nof 74.79% to 100% for Region of Interest (ROI) without location\nclassifications, 73.98% to 100% for ROI with location classifications, and\n78.10% to 100% for whole image classifications. This marks a significant\nenhancement over previously reported performance metrics in the literature. Our\nresults indicate the potential of our multi-modal network as an effective\ndecision-support tool for wound image classification, paving the way for its\napplication in various clinical contexts.\n","authors":["Yash Patel","Tirth Shah","Mrinal Kanti Dhar","Taiyu Zhang","Jeffrey Niezgoda","Sandeep Gopalakrishnan","Zeyun Yu"],"pdf_url":"https://arxiv.org/pdf/2308.11877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12522v1","updated":"2023-08-24T03:21:28Z","published":"2023-08-24T03:21:28Z","title":"Uniformly Distributed Category Prototype-Guided Vision-Language\n Framework for Long-Tail Recognition","summary":" Recently, large-scale pre-trained vision-language models have presented\nbenefits for alleviating class imbalance in long-tailed recognition. However,\nthe long-tailed data distribution can corrupt the representation space, where\nthe distance between head and tail categories is much larger than the distance\nbetween two tail categories. This uneven feature space distribution causes the\nmodel to exhibit unclear and inseparable decision boundaries on the uniformly\ndistributed test set, which lowers its performance. To address these\nchallenges, we propose the uniformly category prototype-guided vision-language\nframework to effectively mitigate feature space bias caused by data imbalance.\nEspecially, we generate a set of category prototypes uniformly distributed on a\nhypersphere. Category prototype-guided mechanism for image-text matching makes\nthe features of different classes converge to these distinct and uniformly\ndistributed category prototypes, which maintain a uniform distribution in the\nfeature space, and improve class boundaries. Additionally, our proposed\nirrelevant text filtering and attribute enhancement module allows the model to\nignore irrelevant noisy text and focus more on key attribute information,\nthereby enhancing the robustness of our framework. In the image recognition\nfine-tuning stage, to address the positive bias problem of the learnable\nclassifier, we design the class feature prototype-guided classifier, which\ncompensates for the performance of tail classes while maintaining the\nperformance of head classes. Our method outperforms previous vision-language\nmethods for long-tailed learning work by a large margin and achieves\nstate-of-the-art performance.\n","authors":["Siming Fu","Xiaoxuan He","Xinpeng Ding","Yuchen Cao","Hualiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12522v1.pdf","comment":"11pages, 5figures"},{"id":"http://arxiv.org/abs/2107.10419v3","updated":"2023-08-24T03:09:41Z","published":"2021-07-22T02:06:38Z","title":"Trip-ROMA: Self-Supervised Learning with Triplets and Random Mappings","summary":" Contrastive self-supervised learning (SSL) methods, such as MoCo and SimCLR,\nhave achieved great success in unsupervised visual representation learning.\nThey rely on a large number of negative pairs and thus require either large\nmemory banks or large batches. Some recent non-contrastive SSL methods, such as\nBYOL and SimSiam, attempt to discard negative pairs and have also shown\nremarkable performance. To avoid collapsed solutions caused by not using\nnegative pairs, these methods require non-trivial asymmetry designs. However,\nin small data regimes, we can not obtain a sufficient number of negative pairs\nor effectively avoid the over-fitting problem when negatives are not used at\nall. To address this situation, we argue that negative pairs are still\nimportant but one is generally sufficient for each positive pair. We show that\na simple Triplet-based loss (Trip) can achieve surprisingly good performance\nwithout requiring large batches or asymmetry designs. Moreover, to alleviate\nthe over-fitting problem in small data regimes and further enhance the effect\nof Trip, we propose a simple plug-and-play RandOm MApping (ROMA) strategy by\nrandomly mapping samples into other spaces and requiring these randomly\nprojected samples to satisfy the same relationship indicated by the triplets.\nIntegrating the triplet-based loss with random mapping, we obtain the proposed\nmethod Trip-ROMA. Extensive experiments, including unsupervised representation\nlearning and unsupervised few-shot learning, have been conducted on ImageNet-1K\nand seven small datasets. They successfully demonstrate the effectiveness of\nTrip-ROMA and consistently show that ROMA can further effectively boost other\nSSL methods. Code is available at https://github.com/WenbinLee/Trip-ROMA.\n","authors":["Wenbin Li","Xuesong Yang","Meihao Kong","Lei Wang","Jing Huo","Yang Gao","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2107.10419v3.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR) 2023"},{"id":"http://arxiv.org/abs/2308.12512v1","updated":"2023-08-24T02:54:38Z","published":"2023-08-24T02:54:38Z","title":"I3DOD: Towards Incremental 3D Object Detection via Prompting","summary":" 3D object detection has achieved significant performance in many fields,\ne.g., robotics system, autonomous driving, and augmented reality. However, most\nexisting methods could cause catastrophic forgetting of old classes when\nperforming on the class-incremental scenarios. Meanwhile, the current\nclass-incremental 3D object detection methods neglect the relationships between\nthe object localization information and category semantic information and\nassume all the knowledge of old model is reliable. To address the above\nchallenge, we present a novel Incremental 3D Object Detection framework with\nthe guidance of prompting, i.e., I3DOD. Specifically, we propose a task-shared\nprompts mechanism to learn the matching relationships between the object\nlocalization information and category semantic information. After training on\nthe current task, these prompts will be stored in our prompt pool, and perform\nthe relationship of old classes in the next task. Moreover, we design a\nreliable distillation strategy to transfer knowledge from two aspects: a\nreliable dynamic distillation is developed to filter out the negative knowledge\nand transfer the reliable 3D knowledge to new detection model; the relation\nfeature is proposed to capture the responses relation in feature space and\nprotect plasticity of the model when learning novel 3D classes. To the end, we\nconduct comprehensive experiments on two benchmark datasets and our method\noutperforms the state-of-the-art object detection methods by 0.6% - 2.7% in\nterms of mAP@0.25.\n","authors":["Wenqi Liang","Gan Sun","Chenxi Liu","Jiahua Dong","Kangru Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12512v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12510v1","updated":"2023-08-24T02:49:30Z","published":"2023-08-24T02:49:30Z","title":"Masked Autoencoders are Efficient Class Incremental Learners","summary":" Class Incremental Learning (CIL) aims to sequentially learn new classes while\navoiding catastrophic forgetting of previous knowledge. We propose to use\nMasked Autoencoders (MAEs) as efficient learners for CIL. MAEs were originally\ndesigned to learn useful representations through reconstructive unsupervised\nlearning, and they can be easily integrated with a supervised loss for\nclassification. Moreover, MAEs can reliably reconstruct original input images\nfrom randomly selected patches, which we use to store exemplars from past tasks\nmore efficiently for CIL. We also propose a bilateral MAE framework to learn\nfrom image-level and embedding-level fusion, which produces better-quality\nreconstructed images and more stable representations. Our experiments confirm\nthat our approach performs better than the state-of-the-art on CIFAR-100,\nImageNet-Subset, and ImageNet-Full. The code is available at\nhttps://github.com/scok30/MAE-CIL .\n","authors":["Jiang-Tian Zhai","Xialei Liu","Andrew D. Bagdanov","Ke Li","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.12510v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12509v1","updated":"2023-08-24T02:43:53Z","published":"2023-08-24T02:43:53Z","title":"Parameter-Efficient Transfer Learning for Remote Sensing Image-Text\n Retrieval","summary":" Vision-and-language pre-training (VLP) models have experienced a surge in\npopularity recently. By fine-tuning them on specific datasets, significant\nperformance improvements have been observed in various tasks. However, full\nfine-tuning of VLP models not only consumes a significant amount of\ncomputational resources but also has a significant environmental impact.\nMoreover, as remote sensing (RS) data is constantly being updated, full\nfine-tuning may not be practical for real-world applications. To address this\nissue, in this work, we investigate the parameter-efficient transfer learning\n(PETL) method to effectively and efficiently transfer visual-language knowledge\nfrom the natural domain to the RS domain on the image-text retrieval task. To\nthis end, we make the following contributions. 1) We construct a novel and\nsophisticated PETL framework for the RS image-text retrieval (RSITR) task,\nwhich includes the pretrained CLIP model, a multimodal remote sensing adapter,\nand a hybrid multi-modal contrastive (HMMC) learning objective; 2) To deal with\nthe problem of high intra-modal similarity in RS data, we design a simple yet\neffective HMMC loss; 3) We provide comprehensive empirical studies for\nPETL-based RS image-text retrieval. Our results demonstrate that the proposed\nmethod is promising and of great potential for practical applications. 4) We\nbenchmark extensive state-of-the-art PETL methods on the RSITR task. Our\nproposed model only contains 0.16M training parameters, which can achieve a\nparameter reduction of 98.9% compared to full fine-tuning, resulting in\nsubstantial savings in training costs. Our retrieval performance exceeds\ntraditional methods by 7-13% and achieves comparable or better performance than\nfull fine-tuning. This work can provide new ideas and useful insights for RS\nvision-language tasks.\n","authors":["Yuan Yuan","Yang Zhan","Zhitong Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.12509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12508v1","updated":"2023-08-24T02:28:18Z","published":"2023-08-24T02:28:18Z","title":"FFEINR: Flow Feature-Enhanced Implicit Neural Representation for\n Spatio-temporal Super-Resolution","summary":" Large-scale numerical simulations are capable of generating data up to\nterabytes or even petabytes. As a promising method of data reduction,\nsuper-resolution (SR) has been widely studied in the scientific visualization\ncommunity. However, most of them are based on deep convolutional neural\nnetworks (CNNs) or generative adversarial networks (GANs) and the scale factor\nneeds to be determined before constructing the network. As a result, a single\ntraining session only supports a fixed factor and has poor generalization\nability. To address these problems, this paper proposes a Feature-Enhanced\nImplicit Neural Representation (FFEINR) for spatio-temporal super-resolution of\nflow field data. It can take full advantage of the implicit neural\nrepresentation in terms of model structure and sampling resolution. The neural\nrepresentation is based on a fully connected network with periodic activation\nfunctions, which enables us to obtain lightweight models. The learned\ncontinuous representation can decode the low-resolution flow field input data\nto arbitrary spatial and temporal resolutions, allowing for flexible\nupsampling. The training process of FFEINR is facilitated by introducing\nfeature enhancements for the input layer, which complements the contextual\ninformation of the flow field.To demonstrate the effectiveness of the proposed\nmethod, a series of experiments are conducted on different datasets by setting\ndifferent hyperparameters. The results show that FFEINR achieves significantly\nbetter results than the trilinear interpolation method.\n","authors":["Chenyue Jiao","Chongke Bi","Lu Yang"],"pdf_url":"https://arxiv.org/pdf/2308.12508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04928v2","updated":"2023-08-24T02:11:15Z","published":"2023-08-09T12:54:27Z","title":"GeodesicPSIM: Predicting the Quality of Static Mesh with Texture Map via\n Geodesic Patch Similarity","summary":" Static meshes with texture maps have attracted considerable attention in both\nindustrial manufacturing and academic research, leading to an urgent\nrequirement for effective and robust objective quality evaluation. However,\ncurrent model-based static mesh quality metrics have obvious limitations: most\nof them only consider geometry information, while color information is ignored,\nand they have strict constraints for the meshes' geometrical topology. Other\nmetrics, such as image-based and point-based metrics, are easily influenced by\nthe prepossessing algorithms, e.g., projection and sampling, hampering their\nability to perform at their best. In this paper, we propose Geodesic Patch\nSimilarity (GeodesicPSIM), a novel model-based metric to accurately predict\nhuman perception quality for static meshes. After selecting a group keypoints,\n1-hop geodesic patches are constructed based on both the reference and\ndistorted meshes cleaned by an effective mesh cleaning algorithm. A two-step\npatch cropping algorithm and a patch texture mapping module refine the size of\n1-hop geodesic patches and build the relationship between the mesh geometry and\ncolor information, resulting in the generation of 1-hop textured geodesic\npatches. Three types of features are extracted to quantify the distortion:\npatch color smoothness, patch discrete mean curvature, and patch pixel color\naverage and variance. To the best of our knowledge, GeodesicPSIM is the first\nmodel-based metric especially designed for static meshes with texture maps.\nGeodesicPSIM provides state-of-the-art performance in comparison with\nimage-based, point-based, and video-based metrics on a newly created and\nchallenging database. We also prove the robustness of GeodesicPSIM by\nintroducing different settings of hyperparameters. Ablation studies also\nexhibit the effectiveness of three proposed features and the patch cropping\nalgorithm.\n","authors":["Qi Yang","Joel Jung","Xiaozhong Xu","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04928v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12501v1","updated":"2023-08-24T01:53:59Z","published":"2023-08-24T01:53:59Z","title":"DD-GCN: Directed Diffusion Graph Convolutional Network for\n Skeleton-based Human Action Recognition","summary":" Graph Convolutional Networks (GCNs) have been widely used in skeleton-based\nhuman action recognition. In GCN-based methods, the spatio-temporal graph is\nfundamental for capturing motion patterns. However, existing approaches ignore\nthe physical dependency and synchronized spatio-temporal correlations between\njoints, which limits the representation capability of GCNs. To solve these\nproblems, we construct the directed diffusion graph for action modeling and\nintroduce the activity partition strategy to optimize the weight sharing\nmechanism of graph convolution kernels. In addition, we present the\nspatio-temporal synchronization encoder to embed synchronized spatio-temporal\nsemantics. Finally, we propose Directed Diffusion Graph Convolutional Network\n(DD-GCN) for action recognition, and the experiments on three public datasets:\nNTU-RGB+D, NTU-RGB+D 120, and NW-UCLA, demonstrate the state-of-the-art\nperformance of our method.\n","authors":["Chang Li","Qian Huang","Yingchi Mao"],"pdf_url":"https://arxiv.org/pdf/2308.12501v1.pdf","comment":"ICEM 2023"},{"id":"http://arxiv.org/abs/2208.11945v3","updated":"2023-08-24T01:53:24Z","published":"2022-08-25T09:02:32Z","title":"Efficient Adaptive Activation Rounding for Post-Training Quantization","summary":" Post-training quantization attracts increasing attention due to its\nconvenience in deploying quantized neural networks. Although\nrounding-to-nearest remains the prevailing method for DNN quantization, prior\nresearch has demonstrated its suboptimal nature when applied to weight\nquantization. They propose optimizing weight rounding schemes by leveraging\noutput error rather than the traditional weight quantization error. Our study\nreveals that similar rounding challenges also extend to activation\nquantization. Despite the easy generalization, the challenges lie in the\ndynamic nature of activation. Adaptive rounding is expected for varying\nactivations and the method is subjected to runtime overhead. To tackle this, we\npropose the AQuant quantization framework with a novel perspective to reduce\noutput error by adjusting rounding schemes of activations. Instead of using the\nconstant rounding border 0.5 of the rounding-to-nearest operation, we make the\nborder become a function w.r.t. the activation value to change the activation\nrounding by the adaptive border. To deal with the runtime overhead, we use a\ncoarse-grained version of the border function. Finally, we introduce our\nframework to optimize the border function. Extensive experiments show that\nAQuant achieves notable improvements compared to state-of-the-art works and\npushes the accuracy of ResNet-18 up to 60.31% under the 2-bit weight and\nactivation quantization.\n","authors":["Zhengyi Li","Cong Guo","Zhanda Zhu","Yangjie Zhou","Yuxian Qiu","Xiaotian Gao","Jingwen Leng","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2208.11945v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03374v2","updated":"2023-08-24T01:43:59Z","published":"2023-08-07T07:53:39Z","title":"Heterogeneous Forgetting Compensation for Class-Incremental Learning","summary":" Class-incremental learning (CIL) has achieved remarkable successes in\nlearning new classes consecutively while overcoming catastrophic forgetting on\nold categories. However, most existing CIL methods unreasonably assume that all\nold categories have the same forgetting pace, and neglect negative influence of\nforgetting heterogeneity among different old classes on forgetting\ncompensation. To surmount the above challenges, we develop a novel\nHeterogeneous Forgetting Compensation (HFC) model, which can resolve\nheterogeneous forgetting of easy-to-forget and hard-to-forget old categories\nfrom both representation and gradient aspects. Specifically, we design a\ntask-semantic aggregation block to alleviate heterogeneous forgetting from\nrepresentation aspect. It aggregates local category information within each\ntask to learn task-shared global representations. Moreover, we develop two\nnovel plug-and-play losses: a gradient-balanced forgetting compensation loss\nand a gradient-balanced relation distillation loss to alleviate forgetting from\ngradient aspect. They consider gradient-balanced compensation to rectify\nforgetting heterogeneity of old categories and heterogeneous relation\nconsistency. Experiments on several representative datasets illustrate\neffectiveness of our HFC model. The code is available at\nhttps://github.com/JiahuaDong/HFC.\n","authors":["Jiahua Dong","Wenqi Liang","Yang Cong","Gan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.03374v2.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2308.12495v1","updated":"2023-08-24T01:30:18Z","published":"2023-08-24T01:30:18Z","title":"Source-Free Collaborative Domain Adaptation via Multi-Perspective\n Feature Enrichment for Functional MRI Analysis","summary":" Resting-state functional MRI (rs-fMRI) is increasingly employed in multi-site\nresearch to aid neurological disorder analysis. Existing studies usually suffer\nfrom significant cross-site/domain data heterogeneity caused by site effects\nsuch as differences in scanners/protocols. Many methods have been proposed to\nreduce fMRI heterogeneity between source and target domains, heavily relying on\nthe availability of source data. But acquiring source data is challenging due\nto privacy concerns and/or data storage burdens in multi-site studies. To this\nend, we design a source-free collaborative domain adaptation (SCDA) framework\nfor fMRI analysis, where only a pretrained source model and unlabeled target\ndata are accessible. Specifically, a multi-perspective feature enrichment\nmethod (MFE) is developed for target fMRI analysis, consisting of multiple\ncollaborative branches to dynamically capture fMRI features of unlabeled target\ndata from multiple views. Each branch has a data-feeding module, a\nspatiotemporal feature encoder, and a class predictor. A mutual-consistency\nconstraint is designed to encourage pair-wise consistency of latent features of\nthe same input generated from these branches for robust representation\nlearning. To facilitate efficient cross-domain knowledge transfer without\nsource data, we initialize MFE using parameters of a pretrained source model.\nWe also introduce an unsupervised pretraining strategy using 3,806 unlabeled\nfMRIs from three large-scale auxiliary databases, aiming to obtain a general\nfeature encoder. Experimental results on three public datasets and one private\ndataset demonstrate the efficacy of our method in cross-scanner and cross-study\nprediction tasks. The model pretrained on large-scale rs-fMRI data has been\nreleased to the public.\n","authors":["Yuqi Fang","Jinjian Wu","Qianqian Wang","Shijun Qiu","Andrea Bozoki","Huaicheng Yan","Mingxia Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12495v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12494v1","updated":"2023-08-24T01:29:15Z","published":"2023-08-24T01:29:15Z","title":"MOFA: A Model Simplification Roadmap for Image Restoration on Mobile\n Devices","summary":" Image restoration aims to restore high-quality images from degraded\ncounterparts and has seen significant advancements through deep learning\ntechniques. The technique has been widely applied to mobile devices for tasks\nsuch as mobile photography. Given the resource limitations on mobile devices,\nsuch as memory constraints and runtime requirements, the efficiency of models\nduring deployment becomes paramount. Nevertheless, most previous works have\nprimarily concentrated on analyzing the efficiency of single modules and\nimproving them individually. This paper examines the efficiency across\ndifferent layers. We propose a roadmap that can be applied to further\naccelerate image restoration models prior to deployment while simultaneously\nincreasing PSNR (Peak Signal-to-Noise Ratio) and SSIM (Structural Similarity\nIndex). The roadmap first increases the model capacity by adding more\nparameters to partial convolutions on FLOPs non-sensitive layers. Then, it\napplies partial depthwise convolution coupled with decoupling\nupsampling/downsampling layers to accelerate the model speed. Extensive\nexperiments demonstrate that our approach decreases runtime by up to 13% and\nreduces the number of parameters by up to 23%, while increasing PSNR and SSIM\non several image restoration datasets. Source Code of our method is available\nat \\href{https://github.com/xiangyu8/MOFA}{https://github.com/xiangyu8/MOFA}.\n","authors":["Xiangyu Chen","Ruiwen Zhen","Shuai Li","Xiaotian Li","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12494v1.pdf","comment":"Accepted by 2023 ICCV Workshop (RCV)"},{"id":"http://arxiv.org/abs/2302.00903v2","updated":"2023-08-24T01:24:19Z","published":"2023-02-02T06:41:02Z","title":"No One Left Behind: Real-World Federated Class-Incremental Learning","summary":" Federated learning (FL) is a hot collaborative training framework via\naggregating model parameters of decentralized local clients. However, most FL\nmethods unreasonably assume data categories of FL framework are known and fixed\nin advance. Moreover, some new local clients that collect novel categories\nunseen by other clients may be introduced to FL training irregularly. These\nissues render global model to undergo catastrophic forgetting on old\ncategories, when local clients receive new categories consecutively under\nlimited memory of storing old categories. To tackle the above issues, we\npropose a novel Local-Global Anti-forgetting (LGA) model. It ensures no local\nclients are left behind as they learn new classes continually, by addressing\nlocal and global catastrophic forgetting. Specifically, considering tackling\nclass imbalance of local client to surmount local forgetting, we develop a\ncategory-balanced gradient-adaptive compensation loss and a category\ngradient-induced semantic distillation loss. They can balance heterogeneous\nforgetting speeds of hard-to-forget and easy-to-forget old categories, while\nensure consistent class-relations within different tasks. Moreover, a proxy\nserver is designed to tackle global forgetting caused by Non-IID class\nimbalance between different clients. It augments perturbed prototype images of\nnew categories collected from local clients via self-supervised prototype\naugmentation, thus improving robustness to choose the best old global model for\nlocal-side semantic distillation loss. Experiments on representative datasets\nverify superior performance of our model against comparison methods. The code\nis available at https://github.com/JiahuaDong/LGA.\n","authors":["Jiahua Dong","Hongliu Li","Yang Cong","Gan Sun","Yulun Zhang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2302.00903v2.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.12213v2","updated":"2023-08-24T00:48:47Z","published":"2023-08-23T15:51:36Z","title":"CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No","summary":" Out-of-distribution (OOD) detection refers to training the model on an\nin-distribution (ID) dataset to classify whether the input images come from\nunknown classes. Considerable effort has been invested in designing various OOD\ndetection methods based on either convolutional neural networks or\ntransformers. However, zero-shot OOD detection methods driven by CLIP, which\nonly require class names for ID, have received less attention. This paper\npresents a novel method, namely CLIP saying no (CLIPN), which empowers the\nlogic of saying no within CLIP. Our key motivation is to equip CLIP with the\ncapability of distinguishing OOD and ID samples using positive-semantic prompts\nand negation-semantic prompts. Specifically, we design a novel learnable no\nprompt and a no text encoder to capture negation semantics within images.\nSubsequently, we introduce two loss functions: the image-text binary-opposite\nloss and the text semantic-opposite loss, which we use to teach CLIPN to\nassociate images with no prompts, thereby enabling it to identify unknown\nsamples. Furthermore, we propose two threshold-free inference algorithms to\nperform OOD detection by utilizing negation semantics from no prompts and the\ntext encoder. Experimental results on 9 benchmark datasets (3 ID datasets and 6\nOOD datasets) for the OOD detection task demonstrate that CLIPN, based on\nViT-B-16, outperforms 7 well-used algorithms by at least 2.34% and 11.64% in\nterms of AUROC and FPR95 for zero-shot OOD detection on ImageNet-1K. Our CLIPN\ncan serve as a solid foundation for effectively leveraging CLIP in downstream\nOOD tasks. The code is available on https://github.com/xmed-lab/CLIPN.\n","authors":["Hualiang Wang","Yi Li","Huifeng Yao","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2308.12213v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11381v2","updated":"2023-08-24T00:34:54Z","published":"2023-08-22T12:12:59Z","title":"DALNet: A Rail Detection Network Based on Dynamic Anchor Line","summary":" Rail detection is one of the key factors for intelligent train. In the paper,\nmotivated by the anchor line-based lane detection methods, we propose a rail\ndetection network called DALNet based on dynamic anchor line. Aiming to solve\nthe problem that the predefined anchor line is image agnostic, we design a\nnovel dynamic anchor line mechanism. It utilizes a dynamic anchor line\ngenerator to dynamically generate an appropriate anchor line for each rail\ninstance based on the position and shape of the rails in the input image. These\ndynamically generated anchor lines can be considered as better position\nreferences to accurately localize the rails than the predefined anchor lines.\nIn addition, we present a challenging urban rail detection dataset DL-Rail with\nhigh-quality annotations and scenario diversity. DL-Rail contains 7000 pairs of\nimages and annotations along with scene tags, and it is expected to encourage\nthe development of rail detection. We extensively compare DALNet with many\ncompetitive lane methods. The results show that our DALNet achieves\nstate-of-the-art performance on our DL-Rail rail detection dataset and the\npopular Tusimple and LLAMAS lane detection benchmarks. The code will be\nreleased at https://github.com/Yzichen/mmLaneDet.\n","authors":["Zichen Yu","Quanli Liu","Wei Wang","Liyong Zhang","Xiaoguang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12477v1","updated":"2023-08-24T00:24:42Z","published":"2023-08-24T00:24:42Z","title":"American Stories: A Large-Scale Structured Text Dataset of Historical\n U.S. Newspapers","summary":" Existing full text datasets of U.S. public domain newspapers do not recognize\nthe often complex layouts of newspaper scans, and as a result the digitized\ncontent scrambles texts from articles, headlines, captions, advertisements, and\nother layout regions. OCR quality can also be low. This study develops a novel,\ndeep learning pipeline for extracting full article texts from newspaper images\nand applies it to the nearly 20 million scans in Library of Congress's public\ndomain Chronicling America collection. The pipeline includes layout detection,\nlegibility classification, custom OCR, and association of article texts\nspanning multiple bounding boxes. To achieve high scalability, it is built with\nefficient architectures designed for mobile phones. The resulting American\nStories dataset provides high quality data that could be used for pre-training\na large language model to achieve better understanding of historical English\nand historical world knowledge. The dataset could also be added to the external\ndatabase of a retrieval-augmented language model to make historical information\n- ranging from interpretations of political events to minutiae about the lives\nof people's ancestors - more widely accessible. Furthermore, structured article\ntexts facilitate using transformer-based methods for popular social science\napplications like topic classification, detection of reproduced content, and\nnews story clustering. Finally, American Stories provides a massive silver\nquality dataset for innovating multimodal layout analysis models and other\nmultimodal applications.\n","authors":["Melissa Dell","Jacob Carlson","Tom Bryan","Emily Silcock","Abhishek Arora","Zejiang Shen","Luca D'Amico-Wong","Quan Le","Pablo Querubin","Leander Heldring"],"pdf_url":"https://arxiv.org/pdf/2308.12477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11471v2","updated":"2023-08-24T00:17:17Z","published":"2023-08-22T14:36:59Z","title":"Dynamic Open Vocabulary Enhanced Safe-landing with Intelligence\n (DOVESEI)","summary":" This work targets what we consider to be the foundational step for urban\nairborne robots, a safe landing. Our attention is directed toward what we deem\nthe most crucial aspect of the safe landing perception stack: segmentation. We\npresent a streamlined reactive UAV system that employs visual servoing by\nharnessing the capabilities of open vocabulary image segmentation. This\napproach can adapt to various scenarios with minimal adjustments, bypassing the\nnecessity for extensive data accumulation for refining internal models, thanks\nto its open vocabulary methodology. Given the limitations imposed by local\nauthorities, our primary focus centers on operations originating from altitudes\nof 100 meters. This choice is deliberate, as numerous preceding works have\ndealt with altitudes up to 30 meters, aligning with the capabilities of small\nstereo cameras. Consequently, we leave the remaining 20m to be navigated using\nconventional 3D path planning methods. Utilizing monocular cameras and image\nsegmentation, our findings demonstrate the system's capability to successfully\nexecute landing maneuvers at altitudes as low as 20 meters. However, this\napproach is vulnerable to intermittent and occasionally abrupt fluctuations in\nthe segmentation between frames in a video stream. To address this challenge,\nwe enhance the image segmentation output by introducing what we call a dynamic\nfocus: a masking mechanism that self adjusts according to the current landing\nstage. This dynamic focus guides the control system to avoid regions beyond the\ndrone's safety radius projected onto the ground, thus mitigating the problems\nwith fluctuations. Through the implementation of this supplementary layer, our\nexperiments have reached improvements in the landing success rate of almost\ntenfold when compared to global segmentation. All the source code is open\nsource and available online (github.com/MISTLab/DOVESEI).\n","authors":["Haechan Mark Bong","Rongge Zhang","Ricardo de Azambuja","Giovanni Beltrame"],"pdf_url":"https://arxiv.org/pdf/2308.11471v2.pdf","comment":"Submitted to IROS 2023 The Last-Mile Robotics Workshop"},{"id":"http://arxiv.org/abs/2305.07011v3","updated":"2023-08-24T23:21:27Z","published":"2023-05-11T17:53:29Z","title":"Region-Aware Pretraining for Open-Vocabulary Object Detection with\n Vision Transformers","summary":" We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a\ncontrastive image-text pretraining recipe to bridge the gap between image-level\npretraining and open-vocabulary object detection. At the pretraining phase, we\npropose to randomly crop and resize regions of positional embeddings instead of\nusing the whole image positional embeddings. This better matches the use of\npositional embeddings at region-level in the detection finetuning phase. In\naddition, we replace the common softmax cross entropy loss in contrastive\nlearning with focal loss to better learn the informative yet difficult\nexamples. Finally, we leverage recent advances in novel object proposals to\nimprove open-vocabulary detection finetuning. We evaluate our full model on the\nLVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer.\nRO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best\nexisting approach by +7.8 points in addition to competitive zero-shot transfer\ndetection. Surprisingly, RO-ViT improves the image-level representation as well\nand achieves the state of the art on 9 out of 12 metrics on COCO and Flickr\nimage-text retrieval benchmarks, outperforming competitive approaches with\nlarger models.\n","authors":["Dahun Kim","Anelia Angelova","Weicheng Kuo"],"pdf_url":"https://arxiv.org/pdf/2305.07011v3.pdf","comment":"CVPR 2023 Highlight (https://github.com/mcahny/rovit); adds LAION-2B\n result"},{"id":"http://arxiv.org/abs/2212.02501v4","updated":"2023-08-24T22:14:53Z","published":"2022-12-05T18:59:57Z","title":"SceneRF: Self-Supervised Monocular 3D Scene Reconstruction with Radiance\n Fields","summary":" 3D reconstruction from a single 2D image was extensively covered in the\nliterature but relies on depth supervision at training time, which limits its\napplicability. To relax the dependence to depth we propose SceneRF, a\nself-supervised monocular scene reconstruction method using only posed image\nsequences for training. Fueled by the recent progress in neural radiance fields\n(NeRF) we optimize a radiance field though with explicit depth optimization and\na novel probabilistic sampling strategy to efficiently handle large scenes. At\ninference, a single input image suffices to hallucinate novel depth views which\nare fused together to obtain 3D scene reconstruction. Thorough experiments\ndemonstrate that we outperform all baselines for novel depth views synthesis\nand scene reconstruction, on indoor BundleFusion and outdoor SemanticKITTI.\nCode is available at https://astra-vision.github.io/SceneRF .\n","authors":["Anh-Quan Cao","Raoul de Charette"],"pdf_url":"https://arxiv.org/pdf/2212.02501v4.pdf","comment":"ICCV 2023. Project page: https://astra-vision.github.io/SceneRF"},{"id":"http://arxiv.org/abs/2201.08865v2","updated":"2023-08-24T21:58:18Z","published":"2022-01-21T19:18:42Z","title":"On the in vivo recognition of kidney stones using machine learning","summary":" Determining the type of kidney stones allows urologists to prescribe a\ntreatment to avoid recurrence of renal lithiasis. An automated in-vivo\nimage-based classification method would be an important step towards an\nimmediate identification of the kidney stone type required as a first phase of\nthe diagnosis. In the literature it was shown on ex-vivo data (i.e., in very\ncontrolled scene and image acquisition conditions) that an automated kidney\nstone classification is indeed feasible. This pilot study compares the kidney\nstone recognition performances of six shallow machine learning methods and\nthree deep-learning architectures which were tested with in-vivo images of the\nfour most frequent urinary calculi types acquired with an endoscope during\nstandard ureteroscopies. This contribution details the database construction\nand the design of the tested kidney stones classifiers. Even if the best\nresults were obtained by the Inception v3 architecture (weighted precision,\nrecall and F1-score of 0.97, 0.98 and 0.97, respectively), it is also shown\nthat choosing an appropriate colour space and texture features allows a shallow\nmachine learning method to approach closely the performances of the most\npromising deep-learning methods (the XGBoost classifier led to weighted\nprecision, recall and F1-score values of 0.96). This paper is the first one\nthat explores the most discriminant features to be extracted from images\nacquired during ureteroscopies.\n","authors":["Francisco Lopez-Tiro","Vincent Estrade","Jacques Hubert","Daniel Flores-Araiza","Miguel Gonzalez-Mendoza","Gilberto Ochoa-Ruiz","Christian Daul"],"pdf_url":"https://arxiv.org/pdf/2201.08865v2.pdf","comment":"Paper submitted to IEEE Access"},{"id":"http://arxiv.org/abs/2308.13097v1","updated":"2023-08-24T21:43:04Z","published":"2023-08-24T21:43:04Z","title":"CompaCT: Fractal-Based Heuristic Pixel Segmentation for Lossless\n Compression of High-Color DICOM Medical Images","summary":" Medical image compression is a widely studied field of data processing due to\nits prevalence in modern digital databases. This domain requires a high color\ndepth of 12 bits per pixel component for accurate analysis by physicians,\nprimarily in the DICOM format. Standard raster-based compression of images via\nfiltering is well-known; however, it remains suboptimal in the medical domain\ndue to non-specialized implementations. This study proposes a lossless medical\nimage compression algorithm, CompaCT, that aims to target spatial features and\npatterns of pixel concentration for dynamically enhanced data processing. The\nalgorithm employs fractal pixel traversal coupled with a novel approach of\nsegmentation and meshing between pixel blocks for preprocessing. Furthermore,\ndelta and entropy coding are applied to this concept for a complete compression\npipeline. The proposal demonstrates that the data compression achieved via\nfractal segmentation preprocessing yields enhanced image compression results\nwhile remaining lossless in its reconstruction accuracy. CompaCT is evaluated\nin its compression ratios on 3954 high-color CT scans against the efficiency of\nindustry-standard compression techniques (i.e., JPEG2000, RLE, ZIP, PNG). Its\nreconstruction performance is assessed with error metrics to verify lossless\nimage recovery after decompression. The results demonstrate that CompaCT can\ncompress and losslessly reconstruct medical images, being 37% more\nspace-efficient than industry-standard compression systems.\n","authors":["Taaha Khan"],"pdf_url":"https://arxiv.org/pdf/2308.13097v1.pdf","comment":"(8/24/2023) v1a: 16 pages, 9 figures, Word PDF"},{"id":"http://arxiv.org/abs/2308.13094v1","updated":"2023-08-24T21:37:00Z","published":"2023-08-24T21:37:00Z","title":"Interpretable Image Quality Assessment via CLIP with Multiple\n Antonym-Prompt Pairs","summary":" No reference image quality assessment (NR-IQA) is a task to estimate the\nperceptual quality of an image without its corresponding original image. It is\neven more difficult to perform this task in a zero-shot manner, i.e., without\ntask-specific training. In this paper, we propose a new zero-shot and\ninterpretable NRIQA method that exploits the ability of a pre-trained\nvisionlanguage model to estimate the correlation between an image and a textual\nprompt. The proposed method employs a prompt pairing strategy and multiple\nantonym-prompt pairs corresponding to carefully selected descriptive features\ncorresponding to the perceptual image quality. Thus, the proposed method is\nable to identify not only the perceptual quality evaluation of the image, but\nalso the cause on which the quality evaluation is based. Experimental results\nshow that the proposed method outperforms existing zero-shot NR-IQA methods in\nterms of accuracy and can evaluate the causes of perceptual quality\ndegradation.\n","authors":["Takamichi Miyata"],"pdf_url":"https://arxiv.org/pdf/2308.13094v1.pdf","comment":"2pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.13093v1","updated":"2023-08-24T21:36:11Z","published":"2023-08-24T21:36:11Z","title":"EgoBlur: Responsible Innovation in Aria","summary":" Project Aria pushes the frontiers of Egocentric AI with large-scale\nreal-world data collection using purposely designed glasses with privacy first\napproach. To protect the privacy of bystanders being recorded by the glasses,\nour research protocols are designed to ensure recorded video is processed by an\nAI anonymization model that removes bystander faces and vehicle license plates.\nDetected face and license plate regions are processed with a Gaussian blur such\nthat these personal identification information (PII) regions are obscured. This\nprocess helps to ensure that anonymized versions of the video is retained for\nresearch purposes. In Project Aria, we have developed a state-of-the-art\nanonymization system EgoBlur. In this paper, we present extensive analysis of\nEgoBlur on challenging datasets comparing its performance with other\nstate-of-the-art systems from industry and academia including extensive\nResponsible AI analysis on recently released Casual Conversations V2 dataset.\n","authors":["Nikhil Raina","Guruprasad Somasundaram","Kang Zheng","Steve Saarinen","Jeff Messiner","Mark Schwesinger","Luis Pesqueira","Ishita Prasad","Edward Miller","Prince Gupta","Mingfei Yan","Richard Newcombe","Carl Ren","Omkar M Parkhi"],"pdf_url":"https://arxiv.org/pdf/2308.13093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12589v2","updated":"2023-08-24T21:26:05Z","published":"2023-06-21T22:01:12Z","title":"Rapid building damage assessment workflow: An implementation for the\n 2023 Rolling Fork, Mississippi tornado event","summary":" Rapid and accurate building damage assessments from high-resolution satellite\nimagery following a natural disaster is essential to inform and optimize first\nresponder efforts. However, performing such building damage assessments in an\nautomated manner is non-trivial due to the challenges posed by variations in\ndisaster-specific damage, diversity in satellite imagery, and the dearth of\nextensive, labeled datasets. To circumvent these issues, this paper introduces\na human-in-the-loop workflow for rapidly training building damage assessment\nmodels after a natural disaster. This article details a case study using this\nworkflow, executed in partnership with the American Red Cross during a tornado\nevent in Rolling Fork, Mississippi in March, 2023. The output from our\nhuman-in-the-loop modeling process achieved a precision of 0.86 and recall of\n0.80 for damaged buildings when compared to ground truth data collected\npost-disaster. This workflow was implemented end-to-end in under 2 hours per\nsatellite imagery scene, highlighting its potential for real-time deployment.\n","authors":["Caleb Robinson","Simone Fobi Nsutezo","Anthony Ortiz","Tina Sederholm","Rahul Dodhia","Cameron Birge","Kasie Richards","Kris Pitcher","Paulo Duarte","Juan M. Lavista Ferres"],"pdf_url":"https://arxiv.org/pdf/2306.12589v2.pdf","comment":"Accepted at the 2023 ICCV Humanitarian Assistance and Disaster\n Response workshop"},{"id":"http://arxiv.org/abs/2303.09608v2","updated":"2023-08-24T21:20:00Z","published":"2023-03-16T19:28:37Z","title":"VEIL: Vetting Extracted Image Labels from In-the-Wild Captions for\n Weakly-Supervised Object Detection","summary":" The use of large-scale vision-language datasets is limited for object\ndetection due to the negative impact of label noise on localization. Prior\nmethods have shown how such large-scale datasets can be used for pretraining,\nwhich can provide initial signal for localization, but is insufficient without\nclean bounding-box data for at least some categories. We propose a technique to\n\"vet\" labels extracted from noisy captions, and use them for weakly-supervised\nobject detection (WSOD). We conduct analysis of the types of label noise in\ncaptions, and train a classifier that predicts if an extracted label is\nactually present in the image or not. Our classifier generalizes across dataset\nboundaries and across categories. We compare the classifier to eleven baselines\non five datasets, and demonstrate that it can improve WSOD without label\nvetting by 30% (31.2 to 40.5 mAP when evaluated on PASCAL VOC)\n","authors":["Arushi Rai","Adriana Kovashka"],"pdf_url":"https://arxiv.org/pdf/2303.09608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13082v1","updated":"2023-08-24T20:59:55Z","published":"2023-08-24T20:59:55Z","title":"Benchmarking Data Efficiency and Computational Efficiency of Temporal\n Action Localization Models","summary":" In temporal action localization, given an input video, the goal is to predict\nwhich actions it contains, where they begin, and where they end. Training and\ntesting current state-of-the-art deep learning models requires access to large\namounts of data and computational power. However, gathering such data is\nchallenging and computational resources might be limited. This work explores\nand measures how current deep temporal action localization models perform in\nsettings constrained by the amount of data or computational power. We measure\ndata efficiency by training each model on a subset of the training set. We find\nthat TemporalMaxer outperforms other models in data-limited settings.\nFurthermore, we recommend TriDet when training time is limited. To test the\nefficiency of the models during inference, we pass videos of different lengths\nthrough each model. We find that TemporalMaxer requires the least computational\nresources, likely due to its simple architecture.\n","authors":["Jan Warchocki","Teodor Oprescu","Yunhan Wang","Alexandru Damacus","Paul Misterka","Robert-Jan Bruintjes","Attila Lengyel","Ombretta Strafforello","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2308.13082v1.pdf","comment":"Accepted to the CVEU workshop at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13077v1","updated":"2023-08-24T20:46:48Z","published":"2023-08-24T20:46:48Z","title":"Preserving Modality Structure Improves Multi-Modal Learning","summary":" Self-supervised learning on large-scale multi-modal datasets allows learning\nsemantically meaningful embeddings in a joint multi-modal representation space\nwithout relying on human annotations. These joint embeddings enable zero-shot\ncross-modal tasks like retrieval and classification. However, these methods\noften struggle to generalize well on out-of-domain data as they ignore the\nsemantic structure present in modality-specific embeddings. In this context, we\npropose a novel Semantic-Structure-Preserving Consistency approach to improve\ngeneralizability by preserving the modality-specific relationships in the joint\nembedding space. To capture modality-specific semantic relationships between\nsamples, we propose to learn multiple anchors and represent the multifaceted\nrelationship between samples with respect to their relationship with these\nanchors. To assign multiple anchors to each sample, we propose a novel\nMulti-Assignment Sinkhorn-Knopp algorithm. Our experimentation demonstrates\nthat our proposed approach learns semantically meaningful anchors in a\nself-supervised manner. Furthermore, our evaluation on MSR-VTT and YouCook2\ndatasets demonstrates that our proposed multi-anchor assignment based solution\nachieves state-of-the-art performance and generalizes to both inand\nout-of-domain datasets. Code: https://github.com/Swetha5/Multi_Sinkhorn_Knopp\n","authors":["Swetha Sirnam","Mamshad Nayeem Rizve","Nina Shvetsova","Hilde Kuehne","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2308.13077v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13073v1","updated":"2023-08-24T20:32:57Z","published":"2023-08-24T20:32:57Z","title":"SurGNN: Explainable visual scene understanding and assessment of\n surgical skill using graph neural networks","summary":" This paper explores how graph neural networks (GNNs) can be used to enhance\nvisual scene understanding and surgical skill assessment. By using GNNs to\nanalyze the complex visual data of surgical procedures represented as graph\nstructures, relevant features can be extracted and surgical skill can be\npredicted. Additionally, GNNs provide interpretable results, revealing the\nspecific actions, instruments, or anatomical structures that contribute to the\npredicted skill metrics. This can be highly beneficial for surgical educators\nand trainees, as it provides valuable insights into the factors that contribute\nto successful surgical performance and outcomes. SurGNN proposes two concurrent\napproaches -- one supervised and the other self-supervised. The paper also\nbriefly discusses other automated surgical skill evaluation techniques and\nhighlights the limitations of hand-crafted features in capturing the\nintricacies of surgical expertise. We use the proposed methods to achieve\nstate-of-the-art results on EndoVis19, and custom datasets. The working\nimplementation of the code can be found at https://github.com/.\n","authors":["Shuja Khalid","Frank Rudzicz"],"pdf_url":"https://arxiv.org/pdf/2308.13073v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.13072v1","updated":"2023-08-24T20:29:09Z","published":"2023-08-24T20:29:09Z","title":"Full-dose PET Synthesis from Low-dose PET Using High-efficiency\n Diffusion Denoising Probabilistic Model","summary":" To reduce the risks associated with ionizing radiation, a reduction of\nradiation exposure in PET imaging is needed. However, this leads to a\ndetrimental effect on image contrast and quantification. High-quality PET\nimages synthesized from low-dose data offer a solution to reduce radiation\nexposure. We introduce a diffusion-model-based approach for estimating\nfull-dose PET images from low-dose ones: the PET Consistency Model (PET-CM)\nyielding synthetic quality comparable to state-of-the-art diffusion-based\nsynthesis models, but with greater efficiency. There are two steps: a forward\nprocess that adds Gaussian noise to a full dose PET image at multiple\ntimesteps, and a reverse diffusion process that employs a PET Shifted-window\nVision Transformer (PET-VIT) network to learn the denoising procedure\nconditioned on the corresponding low-dose PETs. In PET-CM, the reverse process\nlearns a consistency function for direct denoising of Gaussian noise to a clean\nfull-dose PET. We evaluated the PET-CM in generating full-dose images using\nonly 1/8 and 1/4 of the standard PET dose. Comparing 1/8 dose to full-dose\nimages, PET-CM demonstrated impressive performance with normalized mean\nabsolute error (NMAE) of 1.233+/-0.131%, peak signal-to-noise ratio (PSNR) of\n33.915+/-0.933dB, structural similarity index (SSIM) of 0.964+/-0.009, and\nnormalized cross-correlation (NCC) of 0.968+/-0.011, with an average generation\ntime of 62 seconds per patient. This is a significant improvement compared to\nthe state-of-the-art diffusion-based model with PET-CM reaching this result 12x\nfaster. In the 1/4 dose to full-dose image experiments, PET-CM is also\ncompetitive, achieving an NMAE 1.058+/-0.092%, PSNR of 35.548+/-0.805dB, SSIM\nof 0.978+/-0.005, and NCC 0.981+/-0.007 The results indicate promising low-dose\nPET image quality improvements for clinical applications.\n","authors":["Shaoyan Pan","Elham Abouei","Junbo Peng","Joshua Qian","Jacob F Wynne","Tonghe Wang","Chih-Wei Chang","Justin Roper","Jonathon A Nye","Hui Mao","Xiaofeng Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13755v3","updated":"2023-08-24T19:57:07Z","published":"2023-07-25T18:26:22Z","title":"Training-based Model Refinement and Representation Disagreement for\n Semi-Supervised Object Detection","summary":" Semi-supervised object detection (SSOD) aims to improve the performance and\ngeneralization of existing object detectors by utilizing limited labeled data\nand extensive unlabeled data. Despite many advances, recent SSOD methods are\nstill challenged by inadequate model refinement using the classical exponential\nmoving average (EMA) strategy, the consensus of Teacher-Student models in the\nlatter stages of training (i.e., losing their distinctiveness), and\nnoisy/misleading pseudo-labels. This paper proposes a novel training-based\nmodel refinement (TMR) stage and a simple yet effective representation\ndisagreement (RD) strategy to address the limitations of classical EMA and the\nconsensus problem. The TMR stage of Teacher-Student models optimizes the\nlightweight scaling operation to refine the model's weights and prevent\noverfitting or forgetting learned patterns from unlabeled data. Meanwhile, the\nRD strategy helps keep these models diverged to encourage the student model to\nexplore complementary representations. Our approach can be integrated into\nestablished SSOD methods and is empirically validated using two baseline\nmethods, with and without cascade regression, to generate more reliable\npseudo-labels. Extensive experiments demonstrate the superior performance of\nour approach over state-of-the-art SSOD methods. Specifically, the proposed\napproach outperforms the baseline Unbiased-Teacher-v2 (& Unbiased-Teacher-v1)\nmethod by an average mAP margin of 2.23, 2.1, and 3.36 (& 2.07, 1.9, and 3.27)\non COCO-standard, COCO-additional, and Pascal VOC datasets, respectively.\n","authors":["Seyed Mojtaba Marvasti-Zadeh","Nilanjan Ray","Nadir Erbilgin"],"pdf_url":"https://arxiv.org/pdf/2307.13755v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.13057v1","updated":"2023-08-24T19:50:25Z","published":"2023-08-24T19:50:25Z","title":"Data-Side Efficiencies for Lightweight Convolutional Neural Networks","summary":" We examine how the choice of data-side attributes for two important visual\ntasks of image classification and object detection can aid in the choice or\ndesign of lightweight convolutional neural networks. We show by experimentation\nhow four data attributes - number of classes, object color, image resolution,\nand object scale affect neural network model size and efficiency. Intra- and\ninter-class similarity metrics, based on metric learning, are defined to guide\nthe evaluation of these attributes toward achieving lightweight models.\nEvaluations made using these metrics are shown to require 30x less computation\nthan running full inference tests. We provide, as an example, applying the\nmetrics and methods to choose a lightweight model for a robot path planning\napplication and achieve computation reduction of 66% and accuracy gain of 3.5%\nover the pre-method model.\n","authors":["Bryan Bo Cao","Lawrence O'Gorman","Michael Coss","Shubham Jain"],"pdf_url":"https://arxiv.org/pdf/2308.13057v1.pdf","comment":"10 pages, 5 figures, 6 tables"},{"id":"http://arxiv.org/abs/2203.15441v2","updated":"2023-08-24T19:29:18Z","published":"2022-03-29T11:17:02Z","title":"UnShadowNet: Illumination Critic Guided Contrastive Learning For Shadow\n Removal","summary":" Shadows are frequently encountered natural phenomena that significantly\nhinder the performance of computer vision perception systems in practical\nsettings, e.g., autonomous driving. A solution to this would be to eliminate\nshadow regions from the images before the processing of the perception system.\nYet, training such a solution requires pairs of aligned shadowed and\nnon-shadowed images which are difficult to obtain. We introduce a novel weakly\nsupervised shadow removal framework UnShadowNet trained using contrastive\nlearning. It is composed of a DeShadower network responsible for the removal of\nthe extracted shadow under the guidance of an Illumination network which is\ntrained adversarially by the illumination critic and a Refinement network to\nfurther remove artefacts. We show that UnShadowNet can be easily extended to a\nfully-supervised set-up to exploit the ground-truth when available. UnShadowNet\noutperforms existing state-of-the-art approaches on three publicly available\nshadow datasets (ISTD, adjusted ISTD, SRD) in both the weakly and fully\nsupervised setups.\n","authors":["Subhrajyoti Dasgupta","Arindam Das","Senthil Yogamani","Sudip Das","Ciaran Eising","Andrei Bursuc","Ujjwal Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2203.15441v2.pdf","comment":"Accepted for publication at IEEE Access, vol. 11, pp. 87760-87774,\n 2023"},{"id":"http://arxiv.org/abs/2211.14575v2","updated":"2023-08-24T19:28:10Z","published":"2022-11-26T14:18:50Z","title":"Efficient Video Prediction via Sparsely Conditioned Flow Matching","summary":" We introduce a novel generative model for video prediction based on latent\nflow matching, an efficient alternative to diffusion-based models. In contrast\nto prior work, we keep the high costs of modeling the past during training and\ninference at bay by conditioning only on a small random set of past frames at\neach integration step of the image generation process. Moreover, to enable the\ngeneration of high-resolution videos and to speed up the training, we work in\nthe latent space of a pretrained VQGAN. Finally, we propose to approximate the\ninitial condition of the flow ODE with the previous noisy frame. This allows to\nreduce the number of integration steps and hence, speed up the sampling at\ninference time. We call our model Random frame conditioned flow Integration for\nVidEo pRediction, or, in short, RIVER. We show that RIVER achieves superior or\non par performance compared to prior work on common video prediction\nbenchmarks, while requiring an order of magnitude fewer computational\nresources.\n","authors":["Aram Davtyan","Sepehr Sameni","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2211.14575v2.pdf","comment":"Accepted to ICCV 2023. Project page: https://araachie.github.io/river"},{"id":"http://arxiv.org/abs/2211.13856v2","updated":"2023-08-24T19:28:08Z","published":"2022-11-25T01:50:33Z","title":"WSSL: Weighted Self-supervised Learning Framework For Image-inpainting","summary":" Image inpainting is the process of regenerating lost parts of the image.\nSupervised algorithm-based methods have shown excellent results but have two\nsignificant drawbacks. They do not perform well when tested with unseen data.\nThey fail to capture the global context of the image, resulting in a visually\nunappealing result. We propose a novel self-supervised learning framework for\nimage-inpainting: Weighted Self-Supervised Learning (WSSL) to tackle these\nproblems. We designed WSSL to learn features from multiple weighted pretext\ntasks. These features are then utilized for the downstream task,\nimage-inpainting. To improve the performance of our framework and produce more\nvisually appealing images, we also present a novel loss function for image\ninpainting. The loss function takes advantage of both reconstruction loss and\nperceptual loss functions to regenerate the image. Our experimentation shows\nWSSL outperforms previous methods, and our loss function helps produce better\nresults.\n","authors":["Shubham Gupta","Rahul Kunigal Ravishankar","Madhoolika Gangaraju","Poojasree Dwarkanath","Natarajan Subramanyam"],"pdf_url":"https://arxiv.org/pdf/2211.13856v2.pdf","comment":"9 Pages, document submitted for publication at CGVCVIP 2022 - ISBN\n 978-989-8704-42-9"},{"id":"http://arxiv.org/abs/2308.13042v1","updated":"2023-08-24T19:14:28Z","published":"2023-08-24T19:14:28Z","title":"Enhancing Perception and Immersion in Pre-Captured Environments through\n Learning-Based Eye Height Adaptation","summary":" Pre-captured immersive environments using omnidirectional cameras provide a\nwide range of virtual reality applications. Previous research has shown that\nmanipulating the eye height in egocentric virtual environments can\nsignificantly affect distance perception and immersion. However, the influence\nof eye height in pre-captured real environments has received less attention due\nto the difficulty of altering the perspective after finishing the capture\nprocess. To explore this influence, we first propose a pilot study that\ncaptures real environments with multiple eye heights and asks participants to\njudge the egocentric distances and immersion. If a significant influence is\nconfirmed, an effective image-based approach to adapt pre-captured real-world\nenvironments to the user's eye height would be desirable. Motivated by the\nstudy, we propose a learning-based approach for synthesizing novel views for\nomnidirectional images with altered eye heights. This approach employs a\nmultitask architecture that learns depth and semantic segmentation in two\nformats, and generates high-quality depth and semantic segmentation to\nfacilitate the inpainting stage. With the improved omnidirectional-aware\nlayered depth image, our approach synthesizes natural and realistic visuals for\neye height adaptation. Quantitative and qualitative evaluation shows favorable\nresults against state-of-the-art methods, and an extensive user study verifies\nimproved perception and immersion for pre-captured real-world environments.\n","authors":["Qi Feng","Hubert P. H. Shum","Shigeo Morishima"],"pdf_url":"https://arxiv.org/pdf/2308.13042v1.pdf","comment":"10 pages, 13 figures, 3 tables, submitted to ISMAR 2023"},{"id":"http://arxiv.org/abs/2303.16894v3","updated":"2023-08-24T18:45:43Z","published":"2023-03-29T17:59:10Z","title":"ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with\n GPT and Prototype Guidance","summary":" Understanding 3D scenes from multi-view inputs has been proven to alleviate\nthe view discrepancy issue in 3D visual grounding. However, existing methods\nnormally neglect the view cues embedded in the text modality and fail to weigh\nthe relative importance of different views. In this paper, we propose\nViewRefer, a multi-view framework for 3D visual grounding exploring how to\ngrasp the view knowledge from both text and 3D modalities. For the text branch,\nViewRefer leverages the diverse linguistic knowledge of large-scale language\nmodels, e.g., GPT, to expand a single grounding text to multiple\ngeometry-consistent descriptions. Meanwhile, in the 3D modality, a transformer\nfusion module with inter-view attention is introduced to boost the interaction\nof objects across views. On top of that, we further present a set of learnable\nmulti-view prototypes, which memorize scene-agnostic knowledge for different\nviews, and enhance the framework from two perspectives: a view-guided attention\nmodule for more robust text features, and a view-guided scoring strategy during\nthe final prediction. With our designed paradigm, ViewRefer achieves superior\nperformance on three benchmarks and surpasses the second-best by +2.8%, +1.5%,\nand +1.35% on Sr3D, Nr3D, and ScanRefer.\n","authors":["Zoey Guo","Yiwen Tang","Ray Zhang","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2303.16894v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13004v1","updated":"2023-08-24T18:07:37Z","published":"2023-08-24T18:07:37Z","title":"Spherical Vision Transformer for 360-degree Video Saliency Prediction","summary":" The growing interest in omnidirectional videos (ODVs) that capture the full\nfield-of-view (FOV) has gained 360-degree saliency prediction importance in\ncomputer vision. However, predicting where humans look in 360-degree scenes\npresents unique challenges, including spherical distortion, high resolution,\nand limited labelled data. We propose a novel vision-transformer-based model\nfor omnidirectional videos named SalViT360 that leverages tangent image\nrepresentations. We introduce a spherical geometry-aware spatiotemporal\nself-attention mechanism that is capable of effective omnidirectional video\nunderstanding. Furthermore, we present a consistency-based unsupervised\nregularization term for projection-based 360-degree dense-prediction models to\nreduce artefacts in the predictions that occur after inverse projection. Our\napproach is the first to employ tangent images for omnidirectional saliency\nprediction, and our experimental results on three ODV saliency datasets\ndemonstrate its effectiveness compared to the state-of-the-art.\n","authors":["Mert Cokelek","Nevrez Imamoglu","Cagri Ozcinar","Erkut Erdem","Aykut Erdem"],"pdf_url":"https://arxiv.org/pdf/2308.13004v1.pdf","comment":"12 pages, 4 figures, accepted to BMVC 2023"},{"id":"http://arxiv.org/abs/2202.08806v2","updated":"2023-08-24T17:46:12Z","published":"2022-02-17T18:19:53Z","title":"Grammar-Based Grounded Lexicon Learning","summary":" We present Grammar-Based Grounded Lexicon Learning (G2L2), a lexicalist\napproach toward learning a compositional and grounded meaning representation of\nlanguage from grounded data, such as paired images and texts. At the core of\nG2L2 is a collection of lexicon entries, which map each word to a tuple of a\nsyntactic type and a neuro-symbolic semantic program. For example, the word\nshiny has a syntactic type of adjective; its neuro-symbolic semantic program\nhas the symbolic form {\\lambda}x. filter(x, SHINY), where the concept SHINY is\nassociated with a neural network embedding, which will be used to classify\nshiny objects. Given an input sentence, G2L2 first looks up the lexicon entries\nassociated with each token. It then derives the meaning of the sentence as an\nexecutable neuro-symbolic program by composing lexical meanings based on\nsyntax. The recovered meaning programs can be executed on grounded inputs. To\nfacilitate learning in an exponentially-growing compositional space, we\nintroduce a joint parsing and expected execution algorithm, which does local\nmarginalization over derivations to reduce the training time. We evaluate G2L2\non two domains: visual reasoning and language-driven navigation. Results show\nthat G2L2 can generalize from small amounts of data to novel compositions of\nwords.\n","authors":["Jiayuan Mao","Haoyue Shi","Jiajun Wu","Roger P. Levy","Joshua B. Tenenbaum"],"pdf_url":"https://arxiv.org/pdf/2202.08806v2.pdf","comment":"Minor typo fixes. NeurIPS 2021. Project page:\n https://g2l2.csail.mit.edu/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.12911v1","updated":"2023-08-24T16:37:39Z","published":"2023-08-24T16:37:39Z","title":"On Popularity Bias of Multimodal-aware Recommender Systems: a\n Modalities-driven Analysis","summary":" Multimodal-aware recommender systems (MRSs) exploit multimodal content (e.g.,\nproduct images or descriptions) as items' side information to improve\nrecommendation accuracy. While most of such methods rely on factorization\nmodels (e.g., MFBPR) as base architecture, it has been shown that MFBPR may be\naffected by popularity bias, meaning that it inherently tends to boost the\nrecommendation of popular (i.e., short-head) items at the detriment of niche\n(i.e., long-tail) items from the catalog. Motivated by this assumption, in this\nwork, we provide one of the first analyses on how multimodality in\nrecommendation could further amplify popularity bias. Concretely, we evaluate\nthe performance of four state-of-the-art MRSs algorithms (i.e., VBPR, MMGCN,\nGRCN, LATTICE) on three datasets from Amazon by assessing, along with\nrecommendation accuracy metrics, performance measures accounting for the\ndiversity of recommended items and the portion of retrieved niche items. To\nbetter investigate this aspect, we decide to study the separate influence of\neach modality (i.e., visual and textual) on popularity bias in different\nevaluation dimensions. Results, which demonstrate how the single modality may\naugment the negative effect of popularity bias, shed light on the importance to\nprovide a more rigorous analysis of the performance of such models.\n","authors":["Daniele Malitesta","Giandomenico Cornacchia","Claudio Pomo","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2308.12911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12777v1","updated":"2023-08-24T13:27:58Z","published":"2023-08-24T13:27:58Z","title":"Towards Communication-Efficient Model Updating for On-Device\n Session-Based Recommendation","summary":" On-device recommender systems recently have garnered increasing attention due\nto their advantages of providing prompt response and securing privacy. To stay\ncurrent with evolving user interests, cloud-based recommender systems are\nperiodically updated with new interaction data. However, on-device models\nstruggle to retrain themselves because of limited onboard computing resources.\nAs a solution, we consider the scenario where the model retraining occurs on\nthe server side and then the updated parameters are transferred to edge devices\nvia network communication. While this eliminates the need for local retraining,\nit incurs a regular transfer of parameters that significantly taxes network\nbandwidth. To mitigate this issue, we develop an efficient approach based on\ncompositional codes to compress the model update. This approach ensures the\non-device model is updated flexibly with minimal additional parameters whilst\nutilizing previous knowledge. The extensive experiments conducted on multiple\nsession-based recommendation models with distinctive architectures demonstrate\nthat the on-device model can achieve comparable accuracy to the retrained\nserver-side counterpart through transferring an update 60x smaller in size. The\ncodes are available at \\url{https://github.com/xiaxin1998/ODUpdate}.\n","authors":["Xin Xia","Junliang Yu","Guandong Xu","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2308.12777v1.pdf","comment":"cikm2023"},{"id":"http://arxiv.org/abs/2308.12767v1","updated":"2023-08-24T13:14:49Z","published":"2023-08-24T13:14:49Z","title":"On the Consistency of Average Embeddings for Item Recommendation","summary":" A prevalent practice in recommender systems consists of averaging item\nembeddings to represent users or higher-level concepts in the same embedding\nspace. This paper investigates the relevance of such a practice. For this\npurpose, we propose an expected precision score, designed to measure the\nconsistency of an average embedding relative to the items used for its\nconstruction. We subsequently analyze the mathematical expression of this score\nin a theoretical setting with specific assumptions, as well as its empirical\nbehavior on real-world data from music streaming services. Our results\nemphasize that real-world averages are less consistent for recommendation,\nwhich paves the way for future research to better align real-world embeddings\nwith assumptions from our theoretical setting.\n","authors":["Walid Bendada","Guillaume Salha-Galvan","Romain Hennequin","Thomas Bouabça","Tristan Cazenave"],"pdf_url":"https://arxiv.org/pdf/2308.12767v1.pdf","comment":"17th ACM Conference on Recommender Systems (RecSys 2023)"},{"id":"http://arxiv.org/abs/2308.12743v1","updated":"2023-08-24T12:45:02Z","published":"2023-08-24T12:45:02Z","title":"Video Recommendation Using Social Network Analysis and User Viewing\n Patterns","summary":" With the meteoric rise of video-on-demand (VOD) platforms, users face the\nchallenge of sifting through an expansive sea of content to uncover shows that\nclosely match their preferences. To address this information overload dilemma,\nVOD services have increasingly incorporated recommender systems powered by\nalgorithms that analyze user behavior and suggest personalized content.\nHowever, a majority of existing recommender systems depend on explicit user\nfeedback in the form of ratings and reviews, which can be difficult and\ntime-consuming to collect at scale. This presents a key research gap, as\nleveraging users' implicit feedback patterns could provide an alternative\navenue for building effective video recommendation models, circumventing the\nneed for explicit ratings. However, prior literature lacks sufficient\nexploration into implicit feedback-based recommender systems, especially in the\ncontext of modeling video viewing behavior. Therefore, this paper aims to\nbridge this research gap by proposing a novel video recommendation technique\nthat relies solely on users' implicit feedback in the form of their content\nviewing percentages.\n","authors":["Mehrdad Maghsoudi","MohammdHossein Valikhan","MohammdHossein Zohdi"],"pdf_url":"https://arxiv.org/pdf/2308.12743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12729v1","updated":"2023-08-24T12:08:07Z","published":"2023-08-24T12:08:07Z","title":"Out of the Box Thinking: Improving Customer Lifetime Value Modelling via\n Expert Routing and Game Whale Detection","summary":" Customer lifetime value (LTV) prediction is essential for mobile game\npublishers trying to optimize the advertising investment for each user\nacquisition based on the estimated worth. In mobile games, deploying\nmicrotransactions is a simple yet effective monetization strategy, which\nattracts a tiny group of game whales who splurge on in-game purchases. The\npresence of such game whales may impede the practicality of existing LTV\nprediction models, since game whales' purchase behaviours always exhibit varied\ndistribution from general users. Consequently, identifying game whales can open\nup new opportunities to improve the accuracy of LTV prediction models. However,\nlittle attention has been paid to applying game whale detection in LTV\nprediction, and existing works are mainly specialized for the long-term LTV\nprediction with the assumption that the high-quality user features are\navailable, which is not applicable in the UA stage. In this paper, we propose\nExpLTV, a novel multi-task framework to perform LTV prediction and game whale\ndetection in a unified way. In ExpLTV, we first innovatively design a deep\nneural network-based game whale detector that can not only infer the intrinsic\norder in accordance with monetary value, but also precisely identify high\nspenders (i.e., game whales) and low spenders. Then, by treating the game whale\ndetector as a gating network to decide the different mixture patterns of LTV\nexperts assembling, we can thoroughly leverage the shared information and\nscenario-specific information (i.e., game whales modelling and low spenders\nmodelling). Finally, instead of separately designing a purchase rate estimator\nfor two tasks, we design a shared estimator that can preserve the inner task\nrelationships. The superiority of ExpLTV is further validated via extensive\nexperiments on three industrial datasets.\n","authors":["Shijie Zhang","Xin Yan","Xuejiao Yang","Binfeng Jia","Shuangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02182v3","updated":"2023-08-24T06:10:48Z","published":"2023-05-03T15:24:41Z","title":"Uncovering ChatGPT's Capabilities in Recommender Systems","summary":" The debut of ChatGPT has recently attracted the attention of the natural\nlanguage processing (NLP) community and beyond. Existing studies have\ndemonstrated that ChatGPT shows significant improvement in a range of\ndownstream NLP tasks, but the capabilities and limitations of ChatGPT in terms\nof recommendations remain unclear. In this study, we aim to conduct an\nempirical analysis of ChatGPT's recommendation ability from an Information\nRetrieval (IR) perspective, including point-wise, pair-wise, and list-wise\nranking. To achieve this goal, we re-formulate the above three recommendation\npolicies into a domain-specific prompt format. Through extensive experiments on\nfour datasets from different domains, we demonstrate that ChatGPT outperforms\nother large language models across all three ranking policies. Based on the\nanalysis of unit cost improvements, we identify that ChatGPT with list-wise\nranking achieves the best trade-off between cost and performance compared to\npoint-wise and pair-wise ranking. Moreover, ChatGPT shows the potential for\nmitigating the cold start problem and explainable recommendation. To facilitate\nfurther explorations in this area, the full code and detailed original results\nare open-sourced at https://github.com/rainym00d/LLM4RS.\n","authors":["Sunhao Dai","Ninglu Shao","Haiyuan Zhao","Weijie Yu","Zihua Si","Chen Xu","Zhongxiang Sun","Xiao Zhang","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2305.02182v3.pdf","comment":"Accepted by RecSys 2023"},{"id":"http://arxiv.org/abs/2308.12580v1","updated":"2023-08-24T05:44:25Z","published":"2023-08-24T05:44:25Z","title":"Laying foundations to quantify the \"Effort of Reproducibility\"","summary":" Why are some research studies easy to reproduce while others are difficult?\nCasting doubt on the accuracy of scientific work is not fruitful, especially\nwhen an individual researcher cannot reproduce the claims made in the paper.\nThere could be many subjective reasons behind the inability to reproduce a\nscientific paper. The field of Machine Learning (ML) faces a reproducibility\ncrisis, and surveying a portion of published articles has resulted in a group\nrealization that although sharing code repositories would be appreciable, code\nbases are not the end all be all for determining the reproducibility of an\narticle. Various parties involved in the publication process have come forward\nto address the reproducibility crisis and solutions such as badging articles as\nreproducible, reproducibility checklists at conferences (\\textit{NeurIPS, ICML,\nICLR, etc.}), and sharing artifacts on \\textit{OpenReview} come across as\npromising solutions to the core problem. The breadth of literature on\nreproducibility focuses on measures required to avoid ir-reproducibility, and\nthere is not much research into the effort behind reproducing these articles.\nIn this paper, we investigate the factors that contribute to the easiness and\ndifficulty of reproducing previously published studies and report on the\nfoundational framework to quantify effort of reproducibility.\n","authors":["Akhil Pandey Akella","David Koop","Hamed Alhoori"],"pdf_url":"https://arxiv.org/pdf/2308.12580v1.pdf","comment":"Accepted at ACM/IEEE conference JCDL' 2023. Refer\n https://2023.jcdl.org/program/schedule-printable/ for confirmation"},{"id":"http://arxiv.org/abs/2302.01733v2","updated":"2023-08-24T05:43:39Z","published":"2023-02-03T13:48:59Z","title":"Committed Private Information Retrieval","summary":" A private information retrieval (PIR) scheme allows a client to retrieve a\ndata item $x_i$ among $n$ items $x_1,x_2,\\ldots,x_n$ from $k$ servers, without\nrevealing what $i$ is even when $t < k$ servers collude and try to learn $i$.\nSuch a PIR scheme is said to be $t$-private. A PIR scheme is $v$-verifiable if\nthe client can verify the correctness of the retrieved $x_i$ even when $v \\leq\nk$ servers collude and try to fool the client by sending manipulated data. Most\nof the previous works in the literature on PIR assumed that $v < k$, leaving\nthe case of all-colluding servers open. We propose a generic construction that\ncombines a linear map commitment (LMC) and an arbitrary linear PIR scheme to\nproduce a $k$-verifiable PIR scheme, termed a committed PIR scheme. Such a\nscheme guarantees that even in the worst scenario, when all servers are under\nthe control of an attacker, although the privacy is unavoidably lost, the\nclient won't be fooled into accepting an incorrect $x_i$. We demonstrate the\npracticality of our proposal by implementing the committed PIR schemes based on\nthe Lai-Malavolta LMC and three well-known PIR schemes using the GMP library\nand blst, the current fastest C library for elliptic curve pairings.\n","authors":["Quang Cao","Hong Yen Tran","Son Hoang Dau","Xun Yi","Emanuele Viterbo","Chen Feng","Yu-Chih Huang","Jingge Zhu","Stanislav Kruglik","Han Mao Kiah"],"pdf_url":"https://arxiv.org/pdf/2302.01733v2.pdf","comment":"Accepted at ESORICS 2023"},{"id":"http://arxiv.org/abs/2308.12574v1","updated":"2023-08-24T05:26:54Z","published":"2023-08-24T05:26:54Z","title":"Exploring the Integration Strategies of Retriever and Large Language\n Models","summary":" The integration of retrieved passages and large language models (LLMs), such\nas ChatGPTs, has significantly contributed to improving open-domain question\nanswering. However, there is still a lack of exploration regarding the optimal\napproach for incorporating retrieved passages into the answer generation\nprocess. This paper aims to fill this gap by investigating different methods of\ncombining retrieved passages with LLMs to enhance answer generation. We begin\nby examining the limitations of a commonly-used concatenation approach.\nSurprisingly, this approach often results in generating \"unknown\" outputs, even\nwhen the correct document is among the top-k retrieved passages. To address\nthis issue, we explore four alternative strategies for integrating the\nretrieved passages with the LLMs. These strategies include two single-round\nmethods that utilize chain-of-thought reasoning and two multi-round strategies\nthat incorporate feedback loops. Through comprehensive analyses and\nexperiments, we provide insightful observations on how to effectively leverage\nretrieved passages to enhance the answer generation capability of LLMs.\n","authors":["Ye Liu","Semih Yavuz","Rui Meng","Meghana Moorthy","Shafiq Joty","Caiming Xiong","Yingbo Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07134v3","updated":"2023-08-24T03:54:45Z","published":"2023-08-14T13:41:09Z","title":"Natural Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models, such as ChatGPT,\nhas revolutionized various research fields in artificial intelligence.\nTransformers-based large language models (LLMs) have gradually replaced CNNs\nand RNNs to unify fields of computer vision and natural language processing.\nCompared with the data that exists relatively independently such as images,\nvideos or texts, graph is a type of data that contains rich structural and\nrelational information. Meanwhile, natural language, as one of the most\nexpressive mediums, excels in describing complex structures. However, existing\nwork on incorporating graph learning problems into the generative language\nmodeling framework remains very limited. As the importance of large language\nmodels continues to grow, it becomes essential to explore whether LLMs can also\nreplace GNNs as the foundation model for graphs. In this paper, we propose\nInstructGLM (Instruction-finetuned Graph Language Model), systematically design\nhighly scalable prompts based on natural language instructions, and use natural\nlanguage to describe the geometric structure and node features of the graph for\ninstruction tuning an LLM to perform learning and inference on graphs in a\ngenerative manner. Our method exceeds all competitive GNN baselines on\nogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of\nour method and sheds light on generative large language models as the\nfoundation model for graph machine learning.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v3.pdf","comment":"21 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2304.07944v2","updated":"2023-08-24T01:00:37Z","published":"2023-04-17T01:55:40Z","title":"An In-depth Investigation of User Response Simulation for Conversational\n Search","summary":" Conversational search has seen increased recent attention in both the IR and\nNLP communities. It seeks to clarify and solve a user's search need through\nmulti-turn natural language interactions. However, most existing systems are\ntrained and demonstrated with recorded or artificial conversation logs.\nEventually, conversational search systems should be trained, evaluated, and\ndeployed in an open-ended setting with unseen conversation trajectories. A key\nchallenge is that training and evaluating such systems both require a\nhuman-in-the-loop, which is expensive and does not scale. One strategy for this\nis to simulate users, thereby reducing the scaling costs. However, current user\nsimulators are either limited to only respond to yes-no questions from the\nconversational search system, or unable to produce high quality responses in\ngeneral.\n In this paper, we show that current state-of-the-art user simulation system\ncould be significantly improved by replacing it with a smaller but advanced\nnatural language generation model. But rather than merely reporting this new\nstate-of-the-art, we present an in-depth investigation of the task of\nsimulating user response for conversational search. Our goal is to supplement\nexisting works with an insightful hand-analysis of what challenges are still\nunsolved by the advanced model, as well as to propose our solutions for them.\nThe challenges we identified include (1) dataset noise, (2) a blind spot that\nis difficult for existing models to learn, and (3) a specific type of\nmisevaluation in the standard empirical setup. Except for the dataset noise\nissue, we propose solutions to cover the training blind spot and to avoid the\nmisevaluation. Our proposed solutions lead to further improvements. Our best\nsystem improves the previous state-of-the-art significantly.\n","authors":["Zhenduo Wang","Zhichao Xu","Qingyao Ai","Vivek Srikumar"],"pdf_url":"https://arxiv.org/pdf/2304.07944v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.13050v1","updated":"2023-08-24T19:36:05Z","published":"2023-08-24T19:36:05Z","title":"Multi-BERT for Embeddings for Recommendation System","summary":" In this paper, we propose a novel approach for generating document embeddings\nusing a combination of Sentence-BERT (SBERT) and RoBERTa, two state-of-the-art\nnatural language processing models. Our approach treats sentences as tokens and\ngenerates embeddings for them, allowing the model to capture both\nintra-sentence and inter-sentence relations within a document. We evaluate our\nmodel on a book recommendation task and demonstrate its effectiveness in\ngenerating more semantically rich and accurate document embeddings. To assess\nthe performance of our approach, we conducted experiments on a book\nrecommendation task using the Goodreads dataset. We compared the document\nembeddings generated using our MULTI-BERT model to those generated using SBERT\nalone. We used precision as our evaluation metric to compare the quality of the\ngenerated embeddings. Our results showed that our model consistently\noutperformed SBERT in terms of the quality of the generated embeddings.\nFurthermore, we found that our model was able to capture more nuanced semantic\nrelations within documents, leading to more accurate recommendations. Overall,\nour results demonstrate the effectiveness of our approach and suggest that it\nis a promising direction for improving the performance of recommendation\nsystems\n","authors":["Shashidhar Reddy Javaji","Krutika Sarode"],"pdf_url":"https://arxiv.org/pdf/2308.13050v1.pdf","comment":"5 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2308.13032v1","updated":"2023-08-24T18:58:10Z","published":"2023-08-24T18:58:10Z","title":"Financial News Analytics Using Fine-Tuned Llama 2 GPT Model","summary":" The paper considers the possibility to fine-tune Llama 2 Large Language Model\n(LLM) for the multitask analysis of financial news. For fine-tuning, the\nPEFT/LoRA based approach was used. In the study, the model was fine-tuned for\nthe following tasks: analysing a text from financial market perspectives,\nhighlighting main points of a text, summarizing a text and extracting named\nentities with appropriate sentiments. The obtained results show that the\nfine-tuned Llama 2 model can perform a multitask financial news analysis with a\nspecified structure of response, part of response can be a structured text and\nanother part of data can have JSON format for further processing. Extracted\nsentiments for named entities can be considered as predictive features in\nsupervised machine learning models with quantitative target variables.\n","authors":["Bohdan M. Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2308.13032v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.12970v1","updated":"2023-08-24T17:59:54Z","published":"2023-08-24T17:59:54Z","title":"NeuralClothSim: Neural Deformation Fields Meet the Kirchhoff-Love Thin\n Shell Theory","summary":" Cloth simulation is an extensively studied problem, with a plethora of\nsolutions available in computer graphics literature. Existing cloth simulators\nproduce realistic cloth deformations that obey different types of boundary\nconditions. Nevertheless, their operational principle remains limited in\nseveral ways: They operate on explicit surface representations with a fixed\nspatial resolution, perform a series of discretised updates (which bounds their\ntemporal resolution), and require comparably large amounts of storage.\nMoreover, back-propagating gradients through the existing solvers is often not\nstraightforward, which poses additional challenges when integrating them into\nmodern neural architectures. In response to the limitations mentioned above,\nthis paper takes a fundamentally different perspective on physically-plausible\ncloth simulation and re-thinks this long-standing problem: We propose\nNeuralClothSim, i.e., a new cloth simulation approach using thin shells, in\nwhich surface evolution is encoded in neural network weights. Our\nmemory-efficient and differentiable solver operates on a new continuous\ncoordinate-based representation of dynamic surfaces, i.e., neural deformation\nfields (NDFs); it supervises NDF evolution with the rules of the non-linear\nKirchhoff-Love shell theory. NDFs are adaptive in the sense that they 1)\nallocate their capacity to the deformation details as the latter arise during\nthe cloth evolution and 2) allow surface state queries at arbitrary spatial and\ntemporal resolutions without retraining. We show how to train our\nNeuralClothSim solver while imposing hard boundary conditions and demonstrate\nmultiple applications, such as material interpolation and simulation editing.\nThe experimental results highlight the effectiveness of our formulation and its\npotential impact.\n","authors":["Navami Kairanda","Marc Habermann","Christian Theobalt","Vladislav Golyanik"],"pdf_url":"https://arxiv.org/pdf/2308.12970v1.pdf","comment":"27 pages, 22 figures and 3 tables; project page:\n https://4dqv.mpi-inf.mpg.de/NeuralClothSim/"},{"id":"http://arxiv.org/abs/2308.12967v1","updated":"2023-08-24T17:59:50Z","published":"2023-08-24T17:59:50Z","title":"NeO 360: Neural Fields for Sparse View Synthesis of Outdoor Scenes","summary":" Recent implicit neural representations have shown great results for novel\nview synthesis. However, existing methods require expensive per-scene\noptimization from many views hence limiting their application to real-world\nunbounded urban settings where the objects of interest or backgrounds are\nobserved from very few views. To mitigate this challenge, we introduce a new\napproach called NeO 360, Neural fields for sparse view synthesis of outdoor\nscenes. NeO 360 is a generalizable method that reconstructs 360{\\deg} scenes\nfrom a single or a few posed RGB images. The essence of our approach is in\ncapturing the distribution of complex real-world outdoor 3D scenes and using a\nhybrid image-conditional triplanar representation that can be queried from any\nworld point. Our representation combines the best of both voxel-based and\nbird's-eye-view (BEV) representations and is more effective and expressive than\neach. NeO 360's representation allows us to learn from a large collection of\nunbounded 3D scenes while offering generalizability to new views and novel\nscenes from as few as a single image during inference. We demonstrate our\napproach on the proposed challenging 360{\\deg} unbounded dataset, called NeRDS\n360, and show that NeO 360 outperforms state-of-the-art generalizable methods\nfor novel view synthesis while also offering editing and composition\ncapabilities. Project page:\nhttps://zubair-irshad.github.io/projects/neo360.html\n","authors":["Muhammad Zubair Irshad","Sergey Zakharov","Katherine Liu","Vitor Guizilini","Thomas Kollar","Adrien Gaidon","Zsolt Kira","Rares Ambrus"],"pdf_url":"https://arxiv.org/pdf/2308.12967v1.pdf","comment":"Accepted to International Conference on Computer Vision (ICCV), 2023.\n Project page: https://zubair-irshad.github.io/projects/neo360.html"},{"id":"http://arxiv.org/abs/2308.12968v1","updated":"2023-08-24T17:59:50Z","published":"2023-08-24T17:59:50Z","title":"Scenimefy: Learning to Craft Anime Scene via Semi-Supervised\n Image-to-Image Translation","summary":" Automatic high-quality rendering of anime scenes from complex real-world\nimages is of significant practical value. The challenges of this task lie in\nthe complexity of the scenes, the unique features of anime style, and the lack\nof high-quality datasets to bridge the domain gap. Despite promising attempts,\nprevious efforts are still incompetent in achieving satisfactory results with\nconsistent semantic preservation, evident stylization, and fine details. In\nthis study, we propose Scenimefy, a novel semi-supervised image-to-image\ntranslation framework that addresses these challenges. Our approach guides the\nlearning with structure-consistent pseudo paired data, simplifying the pure\nunsupervised setting. The pseudo data are derived uniquely from a\nsemantic-constrained StyleGAN leveraging rich model priors like CLIP. We\nfurther apply segmentation-guided data selection to obtain high-quality pseudo\nsupervision. A patch-wise contrastive style loss is introduced to improve\nstylization and fine details. Besides, we contribute a high-resolution anime\nscene dataset to facilitate future research. Our extensive experiments\ndemonstrate the superiority of our method over state-of-the-art baselines in\nterms of both perceptual quality and quantitative performance.\n","authors":["Yuxin Jiang","Liming Jiang","Shuai Yang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2308.12968v1.pdf","comment":"ICCV 2023. The first two authors contributed equally. Code:\n https://github.com/Yuxinn-J/Scenimefy Project page:\n https://yuxinn-j.github.io/projects/Scenimefy.html"},{"id":"http://arxiv.org/abs/2308.12964v1","updated":"2023-08-24T17:59:01Z","published":"2023-08-24T17:59:01Z","title":"Dense Text-to-Image Generation with Attention Modulation","summary":" Existing text-to-image diffusion models struggle to synthesize realistic\nimages given dense captions, where each text prompt provides a detailed\ndescription for a specific image region. To address this, we propose\nDenseDiffusion, a training-free method that adapts a pre-trained text-to-image\nmodel to handle such dense captions while offering control over the scene\nlayout. We first analyze the relationship between generated images' layouts and\nthe pre-trained model's intermediate attention maps. Next, we develop an\nattention modulation method that guides objects to appear in specific regions\naccording to layout guidance. Without requiring additional fine-tuning or\ndatasets, we improve image generation performance given dense captions\nregarding both automatic and human evaluation scores. In addition, we achieve\nsimilar-quality visual results with models specifically trained with layout\nconditions.\n","authors":["Yunji Kim","Jiyoung Lee","Jin-Hwa Kim","Jung-Woo Ha","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.12964v1.pdf","comment":"Accepted by ICCV2023. Code and data are available at\n https://github.com/naver-ai/DenseDiffusion"},{"id":"http://arxiv.org/abs/2308.12956v1","updated":"2023-08-24T17:50:21Z","published":"2023-08-24T17:50:21Z","title":"DLIP: Distilling Language-Image Pre-training","summary":" Vision-Language Pre-training (VLP) shows remarkable progress with the\nassistance of extremely heavy parameters, which challenges deployment in real\napplications. Knowledge distillation is well recognized as the essential\nprocedure in model compression. However, existing knowledge distillation\ntechniques lack an in-depth investigation and analysis of VLP, and practical\nguidelines for VLP-oriented distillation are still not yet explored. In this\npaper, we present DLIP, a simple yet efficient Distilling Language-Image\nPre-training framework, through which we investigate how to distill a light VLP\nmodel. Specifically, we dissect the model distillation from multiple\ndimensions, such as the architecture characteristics of different modules and\nthe information transfer of different modalities. We conduct comprehensive\nexperiments and provide insights on distilling a light but performant VLP\nmodel. Experimental results reveal that DLIP can achieve a state-of-the-art\naccuracy/efficiency trade-off across diverse cross-modal tasks, e.g.,\nimage-text retrieval, image captioning and visual question answering. For\nexample, DLIP compresses BLIP by 1.9x, from 213M to 108M parameters, while\nachieving comparable or better performance. Furthermore, DLIP succeeds in\nretaining more than 95% of the performance with 22.4% parameters and 24.8%\nFLOPs compared to the teacher model and accelerates inference speed by 2.7x.\n","authors":["Huafeng Kuang","Jie Wu","Xiawu Zheng","Ming Li","Xuefeng Xiao","Rui Wang","Min Zheng","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.12956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00347v2","updated":"2023-08-24T17:42:22Z","published":"2023-02-01T10:11:03Z","title":"Anderson Acceleration For Bioinformatics-Based Machine Learning","summary":" Anderson acceleration (AA) is a well-known method for accelerating the\nconvergence of iterative algorithms, with applications in various fields\nincluding deep learning and optimization. Despite its popularity in these\nareas, the effectiveness of AA in classical machine learning classifiers has\nnot been thoroughly studied. Tabular data, in particular, presents a unique\nchallenge for deep learning models, and classical machine learning models are\nknown to perform better in these scenarios. However, the convergence analysis\nof these models has received limited attention. To address this gap in\nresearch, we implement a support vector machine (SVM) classifier variant that\nincorporates AA to speed up convergence. We evaluate the performance of our SVM\nwith and without Anderson acceleration on several datasets from the biology\ndomain and demonstrate that the use of AA significantly improves convergence\nand reduces the training loss as the number of iterations increases. Our\nfindings provide a promising perspective on the potential of Anderson\nacceleration in the training of simple machine learning classifiers and\nunderscore the importance of further research in this area. By showing the\neffectiveness of AA in this setting, we aim to inspire more studies that\nexplore the applications of AA in classical machine learning.\n","authors":["Sarwan Ali","Prakash Chourasia","Murray Patterson"],"pdf_url":"https://arxiv.org/pdf/2302.00347v2.pdf","comment":"Accepted in KDH-2023: Knowledge Discovery in Healthcare Data (IJCAI\n Workshop)"},{"id":"http://arxiv.org/abs/2308.12952v1","updated":"2023-08-24T17:41:20Z","published":"2023-08-24T17:41:20Z","title":"BridgeData V2: A Dataset for Robot Learning at Scale","summary":" We introduce BridgeData V2, a large and diverse dataset of robotic\nmanipulation behaviors designed to facilitate research on scalable robot\nlearning. BridgeData V2 contains 60,096 trajectories collected across 24\nenvironments on a publicly available low-cost robot. BridgeData V2 provides\nextensive task and environment variability, leading to skills that can\ngeneralize across environments, domains, and institutions, making the dataset a\nuseful resource for a broad range of researchers. Additionally, the dataset is\ncompatible with a wide variety of open-vocabulary, multi-task learning methods\nconditioned on goal images or natural language instructions. In our\nexperiments, we train 6 state-of-the-art imitation learning and offline\nreinforcement learning methods on our dataset, and find that they succeed on a\nsuite of tasks requiring varying amounts of generalization. We also demonstrate\nthat the performance of these methods improves with more data and higher\ncapacity models, and that training on a greater variety of skills leads to\nimproved generalization. By publicly sharing BridgeData V2 and our pre-trained\nmodels, we aim to accelerate research in scalable robot learning methods.\nProject page at https://rail-berkeley.github.io/bridgedata\n","authors":["Homer Walke","Kevin Black","Abraham Lee","Moo Jin Kim","Max Du","Chongyi Zheng","Tony Zhao","Philippe Hansen-Estruch","Quan Vuong","Andre He","Vivek Myers","Kuan Fang","Chelsea Finn","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2308.12952v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.12949v1","updated":"2023-08-24T17:38:14Z","published":"2023-08-24T17:38:14Z","title":"Label Budget Allocation in Multi-Task Learning","summary":" The cost of labeling data often limits the performance of machine learning\nsystems. In multi-task learning, related tasks provide information to each\nother and improve overall performance, but the label cost can vary among tasks.\nHow should the label budget (i.e. the amount of money spent on labeling) be\nallocated among different tasks to achieve optimal multi-task performance? We\nare the first to propose and formally define the label budget allocation\nproblem in multi-task learning and to empirically show that different budget\nallocation strategies make a big difference to its performance. We propose a\nTask-Adaptive Budget Allocation algorithm to robustly generate the optimal\nbudget allocation adaptive to different multi-task learning settings.\nSpecifically, we estimate and then maximize the extent of new information\nobtained from the allocated budget as a proxy for multi-task learning\nperformance. Experiments on PASCAL VOC and Taskonomy demonstrate the efficacy\nof our approach over other widely used heuristic labeling strategies.\n","authors":["Ximeng Sun","Kihyuk Sohn","Kate Saenko","Clayton Mellina","Xiao Bian"],"pdf_url":"https://arxiv.org/pdf/2308.12949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12939v1","updated":"2023-08-24T17:29:57Z","published":"2023-08-24T17:29:57Z","title":"Learning Only On Boundaries: a Physics-Informed Neural operator for\n Solving Parametric Partial Differential Equations in Complex Geometries","summary":" Recently deep learning surrogates and neural operators have shown promise in\nsolving partial differential equations (PDEs). However, they often require a\nlarge amount of training data and are limited to bounded domains. In this work,\nwe present a novel physics-informed neural operator method to solve\nparametrized boundary value problems without labeled data. By reformulating the\nPDEs into boundary integral equations (BIEs), we can train the operator network\nsolely on the boundary of the domain. This approach reduces the number of\nrequired sample points from $O(N^d)$ to $O(N^{d-1})$, where $d$ is the domain's\ndimension, leading to a significant acceleration of the training process.\nAdditionally, our method can handle unbounded problems, which are unattainable\nfor existing physics-informed neural networks (PINNs) and neural operators. Our\nnumerical experiments show the effectiveness of parametrized complex geometries\nand unbounded problems.\n","authors":["Zhiwei Fang","Sifan Wang","Paris Perdikaris"],"pdf_url":"https://arxiv.org/pdf/2308.12939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00143v3","updated":"2023-08-24T17:29:24Z","published":"2022-11-30T22:28:24Z","title":"FIESTA: Autoencoders for accurate fiber segmentation in tractography","summary":" White matter bundle segmentation is a cornerstone of modern tractography to\nstudy the brain's structural connectivity in domains such as neurological\ndisorders, neurosurgery, and aging. In this study, we present FIESTA (FIbEr\nSegmentation in Tractography using Autoencoders), a reliable and robust, fully\nautomated, and easily semi-automatically calibrated pipeline based on deep\nautoencoders that can dissect and fully populate white matter bundles. This\npipeline is built upon previous works that demonstrated how autoencoders can be\nused successfully for streamline filtering, bundle segmentation, and streamline\ngeneration in tractography. Our proposed method improves bundle segmentation\ncoverage by recovering hard-to-track bundles with generative sampling through\nthe latent space seeding of the subject bundle and the atlas bundle. A latent\nspace of streamlines is learned using autoencoder-based modeling combined with\ncontrastive learning. Using an atlas of bundles in standard space (MNI), our\nproposed method segments new tractograms using the autoencoder latent distance\nbetween each tractogram streamline and its closest neighbor bundle in the atlas\nof bundles. Intra-subject bundle reliability is improved by recovering\nhard-to-track streamlines, using the autoencoder to generate new streamlines\nthat increase the spatial coverage of each bundle while remaining anatomically\ncorrect. Results show that our method is more reliable than state-of-the-art\nautomated virtual dissection methods such as RecoBundles, RecoBundlesX,\nTractSeg, White Matter Analysis and XTRACT. Our framework allows for the\ntransition from one anatomical bundle definition to another with marginal\ncalibration efforts. Overall, these results show that our framework improves\nthe practicality and usability of current state-of-the-art bundle segmentation\nframework.\n","authors":["Félix Dumais","Jon Haitz Legarreta","Carl Lemaire","Philippe Poulin","François Rheault","Laurent Petit","Muhamed Barakovic","Stefano Magon","Maxime Descoteaux","Pierre-Marc Jodoin"],"pdf_url":"https://arxiv.org/pdf/2212.00143v3.pdf","comment":"36 pages, 13 figures, accepted in NeuroImage"},{"id":"http://arxiv.org/abs/2306.08451v2","updated":"2023-08-24T17:16:00Z","published":"2023-06-14T11:51:11Z","title":"A Survey on Blood Pressure Measurement Technologies: Addressing\n Potential Sources of Bias","summary":" Regular blood pressure (BP) monitoring in clinical and ambulatory settings\nplays a crucial role in the prevention, diagnosis, treatment, and management of\ncardiovascular diseases. Recently, the widespread adoption of ambulatory BP\nmeasurement devices has been driven predominantly by the increased prevalence\nof hypertension and its associated risks and clinical conditions. Recent\nguidelines advocate for regular BP monitoring as part of regular clinical\nvisits or even at home. This increased utilization of BP measurement\ntechnologies has brought up significant concerns, regarding the accuracy of\nreported BP values across settings.\n In this survey, focusing mainly on cuff-based BP monitoring technologies, we\nhighlight how BP measurements can demonstrate substantial biases and variances\ndue to factors such as measurement and device errors, demographics, and body\nhabitus. With these inherent biases, the development of a new generation of\ncuff-based BP devices which use artificial-intelligence (AI) has significant\npotential. We present future avenues where AI-assisted technologies can\nleverage the extensive clinical literature on BP-related studies together with\nthe large collections of BP records available in electronic health records.\nThese resources can be combined with machine learning approaches, including\ndeep learning and Bayesian inference, to remove BP measurement biases and to\nprovide individualized BP-related cardiovascular risk indexes.\n","authors":["Seyedeh Somayyeh Mousavi","Matthew A. Reyna","Gari D. Clifford","Reza Sameni"],"pdf_url":"https://arxiv.org/pdf/2306.08451v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12925v1","updated":"2023-08-24T16:58:30Z","published":"2023-08-24T16:58:30Z","title":"Low-count Time Series Anomaly Detection","summary":" Low-count time series describe sparse or intermittent events, which are\nprevalent in large-scale online platforms that capture and monitor diverse data\ntypes. Several distinct challenges surface when modelling low-count time\nseries, particularly low signal-to-noise ratios (when anomaly signatures are\nprovably undetectable), and non-uniform performance (when average metrics are\nnot representative of local behaviour). The time series anomaly detection\ncommunity currently lacks explicit tooling and processes to model and reliably\ndetect anomalies in these settings. We address this gap by introducing a novel\ngenerative procedure for creating benchmark datasets comprising of low-count\ntime series with anomalous segments. Via a mixture of theoretical and empirical\nanalysis, our work explains how widely-used algorithms struggle with the\ndistribution overlap between normal and anomalous segments. In order to\nmitigate this shortcoming, we then leverage our findings to demonstrate how\nanomaly score smoothing consistently improves performance. The practical\nutility of our analysis and recommendation is validated on a real-world dataset\ncontaining sales data for retail stores.\n","authors":["Philipp Renz","Kurt Cutajar","Niall Twomey","Gavin K. C. Cheung","Hanting Xie"],"pdf_url":"https://arxiv.org/pdf/2308.12925v1.pdf","comment":"6 pages, 7 figures, to be published in IEEE 2023 Workshop on Machine\n Learning for Signal Processing (MLSP)"},{"id":"http://arxiv.org/abs/2308.12921v1","updated":"2023-08-24T16:53:52Z","published":"2023-08-24T16:53:52Z","title":"An Efficient Distributed Multi-Agent Reinforcement Learning for EV\n Charging Network Control","summary":" The increasing trend in adopting electric vehicles (EVs) will significantly\nimpact the residential electricity demand, which results in an increased risk\nof transformer overload in the distribution grid. To mitigate such risks, there\nare urgent needs to develop effective EV charging controllers. Currently, the\nmajority of the EV charge controllers are based on a centralized approach for\nmanaging individual EVs or a group of EVs. In this paper, we introduce a\ndecentralized Multi-agent Reinforcement Learning (MARL) charging framework that\nprioritizes the preservation of privacy for EV owners. We employ the\nCentralized Training Decentralized Execution-Deep Deterministic Policy Gradient\n(CTDE-DDPG) scheme, which provides valuable information to users during\ntraining while maintaining privacy during execution. Our results demonstrate\nthat the CTDE framework improves the performance of the charging network by\nreducing the network costs. Moreover, we show that the Peak-to-Average Ratio\n(PAR) of the total demand is reduced, which, in turn, reduces the risk of\ntransformer overload during the peak hours.\n","authors":["Amin Shojaeighadikolaei","Morteza Hashemi"],"pdf_url":"https://arxiv.org/pdf/2308.12921v1.pdf","comment":"8 pages, 4 figures, accepted at Allerton 2023"},{"id":"http://arxiv.org/abs/2308.12919v1","updated":"2023-08-24T16:47:17Z","published":"2023-08-24T16:47:17Z","title":"Towards Realistic Unsupervised Fine-tuning with CLIP","summary":" The emergence of vision-language models (VLMs), such as CLIP, has spurred a\nsignificant research effort towards their application for downstream supervised\nlearning tasks. Although some previous studies have explored the unsupervised\nfine-tuning of CLIP, they often rely on prior knowledge in the form of class\nnames associated with ground truth labels. In this paper, we delve into a\nrealistic unsupervised fine-tuning scenario by assuming that the unlabeled data\nmight contain out-of-distribution samples from unknown classes. Furthermore, we\nemphasize the importance of simultaneously enhancing out-of-distribution\ndetection capabilities alongside the recognition of instances associated with\npredefined class labels.\n To tackle this problem, we present a simple, efficient, and effective\nfine-tuning approach called Universal Entropy Optimization (UEO). UEO leverages\nsample-level confidence to approximately minimize the conditional entropy of\nconfident instances and maximize the marginal entropy of less confident\ninstances. Apart from optimizing the textual prompts, UEO also incorporates\noptimization of channel-wise affine transformations within the visual branch of\nCLIP. Through extensive experiments conducted across 15 domains and 4 different\ntypes of prior knowledge, we demonstrate that UEO surpasses baseline methods in\nterms of both generalization and out-of-distribution detection.\n","authors":["Jian Liang","Lijun Sheng","Zhengbo Wang","Ran He","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2308.12919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12918v1","updated":"2023-08-24T16:46:01Z","published":"2023-08-24T16:46:01Z","title":"Evaluating the Vulnerabilities in ML systems in terms of adversarial\n attacks","summary":" There have been recent adversarial attacks that are difficult to find. These\nnew adversarial attacks methods may pose challenges to current deep learning\ncyber defense systems and could influence the future defense of cyberattacks.\nThe authors focus on this domain in this research paper. They explore the\nconsequences of vulnerabilities in AI systems. This includes discussing how\nthey might arise, differences between randomized and adversarial examples and\nalso potential ethical implications of vulnerabilities. Moreover, it is\nimportant to train the AI systems appropriately when they are in testing phase\nand getting them ready for broader use.\n","authors":["John Harshith","Mantej Singh Gill","Madhan Jothimani"],"pdf_url":"https://arxiv.org/pdf/2308.12918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17058v2","updated":"2023-08-24T16:45:29Z","published":"2023-05-26T16:09:59Z","title":"Exact Bayesian Inference on Discrete Models via Probability Generating\n Functions: A Probabilistic Programming Approach","summary":" We present an exact Bayesian inference method for discrete statistical\nmodels, which can find exact solutions to many discrete inference problems,\neven with infinite support and continuous priors. To express such models, we\nintroduce a probabilistic programming language that supports discrete and\ncontinuous sampling, discrete observations, affine functions, (stochastic)\nbranching, and conditioning on events. Our key tool is probability generating\nfunctions: they provide a compact closed-form representation of distributions\nthat are definable by programs, thus enabling the exact computation of\nposterior probabilities, expectation, variance, and higher moments. Our\ninference method is provably correct, fully automated and uses automatic\ndifferentiation (specifically, Taylor polynomials), but does not require\ncomputer algebra. Our experiments show that its performance on a range of\nreal-world examples is competitive with approximate Monte Carlo methods, while\navoiding approximation errors.\n","authors":["Fabian Zaiser","Andrzej S. Murawski","Luke Ong"],"pdf_url":"https://arxiv.org/pdf/2305.17058v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12908v1","updated":"2023-08-24T16:32:34Z","published":"2023-08-24T16:32:34Z","title":"POLCA: Power Oversubscription in LLM Cloud Providers","summary":" Recent innovation in large language models (LLMs), and their myriad use-cases\nhave rapidly driven up the compute capacity demand for datacenter GPUs. Several\ncloud providers and other enterprises have made substantial plans of growth in\ntheir datacenters to support these new workloads. One of the key bottleneck\nresources in datacenters is power, and given the increasing model sizes of\nLLMs, they are becoming increasingly power intensive. In this paper, we show\nthat there is a significant opportunity to oversubscribe power in LLM clusters.\nPower oversubscription improves the power efficiency of these datacenters,\nallowing more deployable servers per datacenter, and reduces the deployment\ntime, since building new datacenters is slow.\n We extensively characterize the power consumption patterns of a variety of\nLLMs and their configurations. We identify the differences between the\ninference and training power consumption patterns. Based on our analysis of\nthese LLMs, we claim that the average and peak power utilization in LLM\nclusters for inference should not be very high. Our deductions align with the\ndata from production LLM clusters, revealing that inference workloads offer\nsubstantial headroom for power oversubscription. However, the stringent set of\ntelemetry and controls that GPUs offer in a virtualized environment, makes it\nchallenging to have a reliable and robust power oversubscription mechanism.\n We propose POLCA, our framework for power oversubscription that is robust,\nreliable, and readily deployable for GPU clusters. Using open-source models to\nreplicate the power patterns observed in production, we simulate POLCA and\ndemonstrate that we can deploy 30% more servers in the same GPU cluster for\ninference, with minimal performance loss\n","authors":["Pratyush Patel","Esha Choukse","Chaojie Zhang","Íñigo Goiri","Brijesh Warrier","Nithish Mahalingam","Ricardo Bianchini"],"pdf_url":"https://arxiv.org/pdf/2308.12908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00939v6","updated":"2023-08-24T16:26:54Z","published":"2022-10-03T13:50:58Z","title":"Improving Sample Quality of Diffusion Models Using Self-Attention\n Guidance","summary":" Denoising diffusion models (DDMs) have attracted attention for their\nexceptional generation quality and diversity. This success is largely\nattributed to the use of class- or text-conditional diffusion guidance methods,\nsuch as classifier and classifier-free guidance. In this paper, we present a\nmore comprehensive perspective that goes beyond the traditional guidance\nmethods. From this generalized perspective, we introduce novel condition- and\ntraining-free strategies to enhance the quality of generated images. As a\nsimple solution, blur guidance improves the suitability of intermediate samples\nfor their fine-scale information and structures, enabling diffusion models to\ngenerate higher quality samples with a moderate guidance scale. Improving upon\nthis, Self-Attention Guidance (SAG) uses the intermediate self-attention maps\nof diffusion models to enhance their stability and efficacy. Specifically, SAG\nadversarially blurs only the regions that diffusion models attend to at each\niteration and guides them accordingly. Our experimental results show that our\nSAG improves the performance of various diffusion models, including ADM, IDDPM,\nStable Diffusion, and DiT. Moreover, combining SAG with conventional guidance\nmethods leads to further improvement.\n","authors":["Susung Hong","Gyuseong Lee","Wooseok Jang","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2210.00939v6.pdf","comment":"Accepted to ICCV 2023. Project Page:\n https://ku-cvlab.github.io/Self-Attention-Guidance"},{"id":"http://arxiv.org/abs/2308.12902v1","updated":"2023-08-24T16:22:05Z","published":"2023-08-24T16:22:05Z","title":"CDAN: Convolutional Dense Attention-guided Network for Low-light Image\n Enhancement","summary":" Low-light images, characterized by inadequate illumination, pose challenges\nof diminished clarity, muted colors, and reduced details. Low-light image\nenhancement, an essential task in computer vision, aims to rectify these issues\nby improving brightness, contrast, and overall perceptual quality, thereby\nfacilitating accurate analysis and interpretation. This paper introduces the\nConvolutional Dense Attention-guided Network (CDAN), a novel solution for\nenhancing low-light images. CDAN integrates an autoencoder-based architecture\nwith convolutional and dense blocks, complemented by an attention mechanism and\nskip connections. This architecture ensures efficient information propagation\nand feature learning. Furthermore, a dedicated post-processing phase refines\ncolor balance and contrast. Our approach demonstrates notable progress compared\nto state-of-the-art results in low-light image enhancement, showcasing its\nrobustness across a wide range of challenging scenarios. Our model performs\nremarkably on benchmark datasets, effectively mitigating under-exposure and\nproficiently restoring textures and colors in diverse low-light scenarios. This\nachievement underscores CDAN's potential for diverse computer vision tasks,\nnotably enabling robust object detection and recognition in challenging\nlow-light conditions.\n","authors":["Hossein Shakibania","Sina Raoufi","Hassan Khotanlou"],"pdf_url":"https://arxiv.org/pdf/2308.12902v1.pdf","comment":"18 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.12899v1","updated":"2023-08-24T16:20:00Z","published":"2023-08-24T16:20:00Z","title":"Unified Data Management and Comprehensive Performance Evaluation for\n Urban Spatial-Temporal Prediction [Experiment, Analysis & Benchmark]","summary":" The field of urban spatial-temporal prediction is advancing rapidly with the\ndevelopment of deep learning techniques and the availability of large-scale\ndatasets. However, challenges persist in accessing and utilizing diverse urban\nspatial-temporal datasets from different sources and stored in different\nformats, as well as determining effective model structures and components with\nthe proliferation of deep learning models. This work addresses these challenges\nand provides three significant contributions. Firstly, we introduce \"atomic\nfiles\", a unified storage format designed for urban spatial-temporal big data,\nand validate its effectiveness on 40 diverse datasets, simplifying data\nmanagement. Secondly, we present a comprehensive overview of technological\nadvances in urban spatial-temporal prediction models, guiding the development\nof robust models. Thirdly, we conduct extensive experiments using diverse\nmodels and datasets, establishing a performance leaderboard and identifying\npromising research directions. Overall, this work effectively manages urban\nspatial-temporal data, guides future efforts, and facilitates the development\nof accurate and efficient urban spatial-temporal prediction models. It can\npotentially make long-term contributions to urban spatial-temporal data\nmanagement and prediction, ultimately leading to improved urban living\nstandards.\n","authors":["Jiawei Jiang","Chengkai Han","Wayne Xin Zhao","Jingyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12899v1.pdf","comment":"14 pages, 3 figures. arXiv admin note: text overlap with\n arXiv:2304.14343"},{"id":"http://arxiv.org/abs/2308.12896v1","updated":"2023-08-24T16:16:47Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.}\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v1.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2304.14343v5","updated":"2023-08-24T16:09:18Z","published":"2023-04-27T17:19:26Z","title":"Towards Efficient and Comprehensive Urban Spatial-Temporal Prediction: A\n Unified Library and Performance Benchmark","summary":" As deep learning technology advances and more urban spatial-temporal data\naccumulates, an increasing number of deep learning models are being proposed to\nsolve urban spatial-temporal prediction problems. However, there are\nlimitations in the existing field, including open-source data being in various\nformats and difficult to use, few papers making their code and data openly\navailable, and open-source models often using different frameworks and\nplatforms, making comparisons challenging. A standardized framework is urgently\nneeded to implement and evaluate these methods. To address these issues, we\nprovide a comprehensive review of urban spatial-temporal prediction and propose\na unified storage format for spatial-temporal data called atomic files. We also\npropose LibCity, an open-source library that offers researchers a credible\nexperimental tool and a convenient development framework. In this library, we\nhave reproduced 65 spatial-temporal prediction models and collected 55\nspatial-temporal datasets, allowing researchers to conduct comprehensive\nexperiments conveniently. Using LibCity, we conducted a series of experiments\nto validate the effectiveness of different models and components, and we\nsummarized promising future technology developments and research directions for\nspatial-temporal prediction. By enabling fair model comparisons, designing a\nunified data storage format, and simplifying the process of developing new\nmodels, LibCity is poised to make significant contributions to the\nspatial-temporal prediction field.\n","authors":["Jiawei Jiang","Chengkai Han","Wenjun Jiang","Wayne Xin Zhao","Jingyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2304.14343v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12874v1","updated":"2023-08-24T15:54:32Z","published":"2023-08-24T15:54:32Z","title":"Easy attention: A simple self-attention mechanism for Transformers","summary":" To improve the robustness of transformer neural networks used for\ntemporal-dynamics prediction of chaotic systems, we propose a novel attention\nmechanism called easy attention. Due to the fact that self attention only makes\nusage of the inner product of queries and keys, it is demonstrated that the\nkeys, queries and softmax are not necessary for obtaining the attention score\nrequired to capture long-term dependencies in temporal sequences. Through\nimplementing singular-value decomposition (SVD) on the softmax attention score,\nwe further observe that the self attention compresses contribution from both\nqueries and keys in the spanned space of the attention score. Therefore, our\nproposed easy-attention method directly treats the attention scores as\nlearnable parameters. This approach produces excellent results when\nreconstructing and predicting the temporal dynamics of chaotic systems\nexhibiting more robustness and less complexity than the self attention or the\nwidely-used long short-term memory (LSTM) network. Our results show great\npotential for applications in more complex high-dimensional dynamical systems.\n","authors":["Marcial Sanchis-Agudo","Yuning Wang","Karthik Duraisamy","Ricardo Vinuesa"],"pdf_url":"https://arxiv.org/pdf/2308.12874v1.pdf","comment":"12 pages and 8 figures"},{"id":"http://arxiv.org/abs/2008.09312v6","updated":"2023-08-24T15:51:48Z","published":"2020-08-21T05:23:47Z","title":"Near Optimal Adversarial Attack on UCB Bandits","summary":" I study a stochastic multi-arm bandit problem where rewards are subject to\nadversarial corruption. I propose a novel attack strategy that manipulates a\nlearner employing the UCB algorithm into pulling some non-optimal target arm $T\n- o(T)$ times with a cumulative cost that scales as $\\widehat{O}(\\sqrt{\\log\nT})$, where $T$ is the number of rounds. I also prove the first lower bound on\nthe cumulative attack cost. The lower bound matches the upper bound up to\n$O(\\log \\log T)$ factors, showing the proposed attack strategy to be near\noptimal.\n","authors":["Shiliang Zuo"],"pdf_url":"https://arxiv.org/pdf/2008.09312v6.pdf","comment":"Appeared at ICML 2023 AdvML Workshop"},{"id":"http://arxiv.org/abs/2306.02157v3","updated":"2023-08-24T15:51:01Z","published":"2023-06-03T16:56:18Z","title":"Transforming to Yoked Neural Networks to Improve ANN Structure","summary":" Most existing classical artificial neural networks (ANN) are designed as a\ntree structure to imitate neural networks. In this paper, we argue that the\nconnectivity of a tree is not sufficient to characterize a neural network. The\nnodes of the same level of a tree cannot be connected with each other, i.e.,\nthese neural unit cannot share information with each other, which is a major\ndrawback of ANN. Although ANN has been significantly improved in recent years\nto more complex structures, such as the directed acyclic graph (DAG), these\nmethods also have unidirectional and acyclic bias for ANN. In this paper, we\npropose a method to build a bidirectional complete graph for the nodes in the\nsame level of an ANN, which yokes the nodes of the same level to formulate a\nneural module. We call our model as YNN in short. YNN promotes the information\ntransfer significantly which obviously helps in improving the performance of\nthe method. Our YNN can imitate neural networks much better compared with the\ntraditional ANN. In this paper, we analyze the existing structural bias of ANN\nand propose a model YNN to efficiently eliminate such structural bias. In our\nmodel, nodes also carry out aggregation and transformation of features, and\nedges determine the flow of information. We further impose auxiliary sparsity\nconstraint to the distribution of connectedness, which promotes the learned\nstructure to focus on critical connections. Finally, based on the optimized\nstructure, we also design small neural module structure based on the minimum\ncut technique to reduce the computational burden of the YNN model. This\nlearning process is compatible with the existing networks and different tasks.\nThe obtained quantitative experimental results reflect that the learned\nconnectivity is superior to the traditional NN structure.\n","authors":["Xinshun Liu","Yizhi Fang","Yichao Jiang"],"pdf_url":"https://arxiv.org/pdf/2306.02157v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2008.08261 by other authors"},{"id":"http://arxiv.org/abs/2308.12871v1","updated":"2023-08-24T15:48:21Z","published":"2023-08-24T15:48:21Z","title":"IPA: Inference Pipeline Adaptation to Achieve High Accuracy and\n Cost-Efficiency","summary":" Efficiently optimizing multi-model inference pipelines for fast, accurate,\nand cost-effective inference is a crucial challenge in ML production systems,\ngiven their tight end-to-end latency requirements. To simplify the exploration\nof the vast and intricate trade-off space of accuracy and cost in inference\npipelines, providers frequently opt to consider one of them. However, the\nchallenge lies in reconciling accuracy and cost trade-offs. To address this\nchallenge and propose a solution to efficiently manage model variants in\ninference pipelines, we present IPA, an online deep-learning Inference Pipeline\nAdaptation system that efficiently leverages model variants for each deep\nlearning task. Model variants are different versions of pre-trained models for\nthe same deep learning task with variations in resource requirements, latency,\nand accuracy. IPA dynamically configures batch size, replication, and model\nvariants to optimize accuracy, minimize costs, and meet user-defined latency\nSLAs using Integer Programming. It supports multi-objective settings for\nachieving different trade-offs between accuracy and cost objectives while\nremaining adaptable to varying workloads and dynamic traffic patterns.\nExtensive experiments on a Kubernetes implementation with five real-world\ninference pipelines demonstrate that IPA improves normalized accuracy by up to\n35% with a minimal cost increase of less than 5%.\n","authors":["Saeid Ghafouri","Kamran Razavi","Mehran Salmani","Alireza Sanaee","Tania Lorido-Botran","Lin Wang","Joseph Doyle","Pooyan Jamshidi"],"pdf_url":"https://arxiv.org/pdf/2308.12871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12864v1","updated":"2023-08-24T15:39:01Z","published":"2023-08-24T15:39:01Z","title":"Auto-weighted Bayesian Physics-Informed Neural Networks and robust\n estimations for multitask inverse problems in pore-scale imaging of\n dissolution","summary":" In this article, we present a novel data assimilation strategy in pore-scale\nimaging and demonstrate that this makes it possible to robustly address\nreactive inverse problems incorporating Uncertainty Quantification (UQ).\nPore-scale modeling of reactive flow offers a valuable opportunity to\ninvestigate the evolution of macro-scale properties subject to dynamic\nprocesses. Yet, they suffer from imaging limitations arising from the\nassociated X-ray microtomography (X-ray microCT) process, which induces\ndiscrepancies in the properties estimates. Assessment of the kinetic parameters\nalso raises challenges, as reactive coefficients are critical parameters that\ncan cover a wide range of values. We account for these two issues and ensure\nreliable calibration of pore-scale modeling, based on dynamical microCT images,\nby integrating uncertainty quantification in the workflow.\n The present method is based on a multitasking formulation of reactive inverse\nproblems combining data-driven and physics-informed techniques in calcite\ndissolution. This allows quantifying morphological uncertainties on the\nporosity field and estimating reactive parameter ranges through prescribed PDE\nmodels with a latent concentration field and dynamical microCT. The data\nassimilation strategy relies on sequential reinforcement incorporating\nsuccessively additional PDE constraints. We guarantee robust and unbiased\nuncertainty quantification by straightforward adaptive weighting of Bayesian\nPhysics-Informed Neural Networks (BPINNs), ensuring reliable micro-porosity\nchanges during geochemical transformations. We demonstrate successful Bayesian\nInference in 1D+Time and 2D+Time calcite dissolution based on synthetic microCT\nimages with meaningful posterior distribution on the reactive parameters and\ndimensionless numbers.\n","authors":["Sarah Perez","Philippe Poncet"],"pdf_url":"https://arxiv.org/pdf/2308.12864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00642v2","updated":"2023-08-24T15:35:18Z","published":"2022-10-31T13:02:50Z","title":"Farm-wide virtual load monitoring for offshore wind structures via\n Bayesian neural networks","summary":" Offshore wind structures are subject to deterioration mechanisms throughout\ntheir operational lifetime. Even if the deterioration evolution of structural\nelements can be estimated through physics-based deterioration models, the\nuncertainties involved in the process hurdle the selection of lifecycle\nmanagement decisions. In this scenario, the collection of relevant information\nthrough an efficient monitoring system enables the reduction of uncertainties,\nultimately driving more optimal lifecycle decisions. However, a full monitoring\ninstrumentation implemented on all wind turbines in a farm might become\nunfeasible due to practical and economical constraints. Besides, certain load\nmonitoring systems often become defective after a few years of marine\nenvironment exposure. Addressing the aforementioned concerns, a farm-wide\nvirtual load monitoring scheme directed by a fleet-leader wind turbine offers\nan attractive solution. Fetched with data retrieved from a fully-instrumented\nwind turbine, a model can be trained and then deployed, thus yielding load\npredictions of non-fully monitored wind turbines, from which only standard data\nremains available. In this paper, we propose a virtual load monitoring\nframework formulated via Bayesian neural networks (BNNs) and we provide\nrelevant implementation details needed for the construction, training, and\ndeployment of BNN data-based virtual monitoring models. As opposed to their\ndeterministic counterparts, BNNs intrinsically announce the uncertainties\nassociated with generated load predictions and allow to detect inaccurate load\nestimations generated for non-fully monitored wind turbines. The proposed\nvirtual load monitoring is thoroughly tested through an experimental campaign\nin an operational offshore wind farm and the results demonstrate the\neffectiveness of BNN models for fleet-leader-based farm-wide virtual\nmonitoring.\n","authors":["N. Hlaing","Pablo G. Morato","F. d. N. Santos","W. Weijtjens","C. Devriendt","P. Rigo"],"pdf_url":"https://arxiv.org/pdf/2211.00642v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12859v1","updated":"2023-08-24T15:29:24Z","published":"2023-08-24T15:29:24Z","title":"Towards Automated Animal Density Estimation with Acoustic Spatial\n Capture-Recapture","summary":" Passive acoustic monitoring can be an effective way of monitoring wildlife\npopulations that are acoustically active but difficult to survey visually.\nDigital recorders allow surveyors to gather large volumes of data at low cost,\nbut identifying target species vocalisations in these data is non-trivial.\nMachine learning (ML) methods are often used to do the identification. They can\nprocess large volumes of data quickly, but they do not detect all vocalisations\nand they do generate some false positives (vocalisations that are not from the\ntarget species). Existing wildlife abundance survey methods have been designed\nspecifically to deal with the first of these mistakes, but current methods of\ndealing with false positives are not well-developed. They do not take account\nof features of individual vocalisations, some of which are more likely to be\nfalse positives than others. We propose three methods for acoustic spatial\ncapture-recapture inference that integrate individual-level measures of\nconfidence from ML vocalisation identification into the likelihood and hence\nintegrate ML uncertainty into inference. The methods include a mixture model in\nwhich species identity is a latent variable. We test the methods by simulation\nand find that in a scenario based on acoustic data from Hainan gibbons, in\nwhich ignoring false positives results in 17% positive bias, our methods give\nnegligible bias and coverage probabilities that are close to the nominal 95%\nlevel.\n","authors":["Yuheng Wang","Juan Ye","David L. Borchers"],"pdf_url":"https://arxiv.org/pdf/2308.12859v1.pdf","comment":"35 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12857v1","updated":"2023-08-24T15:28:52Z","published":"2023-08-24T15:28:52Z","title":"Fast Adversarial Training with Smooth Convergence","summary":" Fast adversarial training (FAT) is beneficial for improving the adversarial\nrobustness of neural networks. However, previous FAT work has encountered a\nsignificant issue known as catastrophic overfitting when dealing with large\nperturbation budgets, \\ie the adversarial robustness of models declines to near\nzero during training.\n To address this, we analyze the training process of prior FAT work and\nobserve that catastrophic overfitting is accompanied by the appearance of loss\nconvergence outliers.\n Therefore, we argue a moderately smooth loss convergence process will be a\nstable FAT process that solves catastrophic overfitting.\n To obtain a smooth loss convergence process, we propose a novel oscillatory\nconstraint (dubbed ConvergeSmooth) to limit the loss difference between\nadjacent epochs. The convergence stride of ConvergeSmooth is introduced to\nbalance convergence and smoothing. Likewise, we design weight centralization\nwithout introducing additional hyperparameters other than the loss balance\ncoefficient.\n Our proposed methods are attack-agnostic and thus can improve the training\nstability of various FAT techniques.\n Extensive experiments on popular datasets show that the proposed methods\nefficiently avoid catastrophic overfitting and outperform all previous FAT\nmethods. Code is available at \\url{https://github.com/FAT-CS/ConvergeSmooth}.\n","authors":["Mengnan Zhao","Lihe Zhang","Yuqiu Kong","Baocai Yin"],"pdf_url":"https://arxiv.org/pdf/2308.12857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05153v4","updated":"2023-08-24T15:15:44Z","published":"2022-12-10T00:18:05Z","title":"Algorithmic progress in computer vision","summary":" We investigate algorithmic progress in image classification on ImageNet,\nperhaps the most well-known test bed for computer vision. We estimate a model,\ninformed by work on neural scaling laws, and infer a decomposition of progress\ninto the scaling of compute, data, and algorithms. Using Shapley values to\nattribute performance improvements, we find that algorithmic improvements have\nbeen roughly as important as the scaling of compute for progress computer\nvision. Our estimates indicate that algorithmic innovations mostly take the\nform of compute-augmenting algorithmic advances (which enable researchers to\nget better performance from less compute), not data-augmenting algorithmic\nadvances. We find that compute-augmenting algorithmic advances are made at a\npace more than twice as fast as the rate usually associated with Moore's law.\nIn particular, we estimate that compute-augmenting innovations halve compute\nrequirements every nine months (95\\% confidence interval: 4 to 25 months).\n","authors":["Ege Erdil","Tamay Besiroglu"],"pdf_url":"https://arxiv.org/pdf/2212.05153v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15490v2","updated":"2023-08-24T15:08:50Z","published":"2023-05-24T18:23:25Z","title":"Symplectic model reduction of Hamiltonian systems using data-driven\n quadratic manifolds","summary":" This work presents two novel approaches for the symplectic model reduction of\nhigh-dimensional Hamiltonian systems using data-driven quadratic manifolds.\nClassical symplectic model reduction approaches employ linear symplectic\nsubspaces for representing the high-dimensional system states in a\nreduced-dimensional coordinate system. While these approximations respect the\nsymplectic nature of Hamiltonian systems, linear basis approximations can\nsuffer from slowly decaying Kolmogorov $N$-width, especially in wave-type\nproblems, which then requires a large basis size. We propose two different\nmodel reduction methods based on recently developed quadratic manifolds, each\npresenting its own advantages and limitations. The addition of quadratic terms\nto the state approximation, which sits at the heart of the proposed\nmethodologies, enables us to better represent intrinsic low-dimensionality in\nthe problem at hand. Both approaches are effective for issuing predictions in\nsettings well outside the range of their training data while providing more\naccurate solutions than the linear symplectic reduced-order models.\n","authors":["Harsh Sharma","Hongliang Mu","Patrick Buchfink","Rudy Geelen","Silke Glas","Boris Kramer"],"pdf_url":"https://arxiv.org/pdf/2305.15490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12844v1","updated":"2023-08-24T15:07:08Z","published":"2023-08-24T15:07:08Z","title":"Probabilistic load forecasting with Reservoir Computing","summary":" Some applications of deep learning require not only to provide accurate\nresults but also to quantify the amount of confidence in their prediction. The\nmanagement of an electric power grid is one of these cases: to avoid risky\nscenarios, decision-makers need both precise and reliable forecasts of, for\nexample, power loads. For this reason, point forecasts are not enough hence it\nis necessary to adopt methods that provide an uncertainty quantification.\n This work focuses on reservoir computing as the core time series forecasting\nmethod, due to its computational efficiency and effectiveness in predicting\ntime series. While the RC literature mostly focused on point forecasting, this\nwork explores the compatibility of some popular uncertainty quantification\nmethods with the reservoir setting. Both Bayesian and deterministic approaches\nto uncertainty assessment are evaluated and compared in terms of their\nprediction accuracy, computational resource efficiency and reliability of the\nestimated uncertainty, based on a set of carefully chosen performance metrics.\n","authors":["Michele Guerra","Simone Scardapane","Filippo Maria Bianchi"],"pdf_url":"https://arxiv.org/pdf/2308.12844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12843v1","updated":"2023-08-24T15:06:23Z","published":"2023-08-24T15:06:23Z","title":"Actuator Trajectory Planning for UAVs with Overhead Manipulator using\n Reinforcement Learning","summary":" In this paper, we investigate the operation of an aerial manipulator system,\nnamely an Unmanned Aerial Vehicle (UAV) equipped with a controllable arm with\ntwo degrees of freedom to carry out actuation tasks on the fly. Our solution is\nbased on employing a Q-learning method to control the trajectory of the tip of\nthe arm, also called \\textit{end-effector}. More specifically, we develop a\nmotion planning model based on Time To Collision (TTC), which enables a\nquadrotor UAV to navigate around obstacles while ensuring the manipulator's\nreachability. Additionally, we utilize a model-based Q-learning model to\nindependently track and control the desired trajectory of the manipulator's\nend-effector, given an arbitrary baseline trajectory for the UAV platform. Such\na combination enables a variety of actuation tasks such as high-altitude\nwelding, structural monitoring and repair, battery replacement, gutter\ncleaning, sky scrapper cleaning, and power line maintenance in hard-to-reach\nand risky environments while retaining compatibility with flight control\nfirmware. Our RL-based control mechanism results in a robust control strategy\nthat can handle uncertainties in the motion of the UAV, offering promising\nperformance. Specifically, our method achieves 92\\% accuracy in terms of\naverage displacement error (i.e. the mean distance between the target and\nobtained trajectory points) using Q-learning with 15,000 episodes\n","authors":["Hazim Alzorgan","Abolfazl Razi","Ata Jahangir Moshayedi"],"pdf_url":"https://arxiv.org/pdf/2308.12843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09107v2","updated":"2023-08-24T15:00:34Z","published":"2022-05-13T15:45:50Z","title":"Leveraging Global Binary Masks for Structure Segmentation in Medical\n Images","summary":" Deep learning (DL) models for medical image segmentation are highly\ninfluenced by intensity variations of input images and lack generalization due\nto primarily utilizing pixels' intensity information for inference. Acquiring\nsufficient training data is another challenge limiting models' applications. We\nproposed to leverage the consistency of organs' anatomical shape and position\ninformation in medical images. We introduced a framework leveraging recurring\nanatomical patterns through global binary masks for organ segmentation. Two\nscenarios were studied.1) Global binary masks were the only model's (i.e.\nU-Net) input, forcing exclusively encoding organs' position and shape\ninformation for segmentation/localization.2) Global binary masks were\nincorporated as an additional channel functioning as position/shape clues to\nmitigate training data scarcity. Two datasets of the brain and heart CT images\nwith their ground-truth were split into (26:10:10) and (12:3:5) for training,\nvalidation, and test respectively. Training exclusively on global binary masks\nled to Dice scores of 0.77(0.06) and 0.85(0.04), with the average Euclidian\ndistance of 3.12(1.43)mm and 2.5(0.93)mm relative to the center of mass of the\nground truth for the brain and heart structures respectively. The outcomes\nindicate that a surprising degree of position and shape information is encoded\nthrough global binary masks. Incorporating global binary masks led to\nsignificantly higher accuracy relative to the model trained on only CT images\nin small subsets of training data; the performance improved by 4.3-125.3% and\n1.3-48.1% for 1-8 training cases of the brain and heart datasets respectively.\nThe findings imply the advantages of utilizing global binary masks for building\ngeneralizable models and to compensate for training data scarcity.\n","authors":["Mahdieh Kazemimoghadam","Zi Yang","Lin Ma","Mingli Chen","Weiguo Lu","Xuejun Gu"],"pdf_url":"https://arxiv.org/pdf/2205.09107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01975v3","updated":"2023-08-24T14:50:34Z","published":"2023-05-03T08:41:37Z","title":"A Survey on Dataset Distillation: Approaches, Applications and Future\n Directions","summary":" Dataset distillation is attracting more attention in machine learning as\ntraining sets continue to grow and the cost of training state-of-the-art models\nbecomes increasingly high. By synthesizing datasets with high information\ndensity, dataset distillation offers a range of potential applications,\nincluding support for continual learning, neural architecture search, and\nprivacy protection. Despite recent advances, we lack a holistic understanding\nof the approaches and applications. Our survey aims to bridge this gap by first\nproposing a taxonomy of dataset distillation, characterizing existing\napproaches, and then systematically reviewing the data modalities, and related\napplications. In addition, we summarize the challenges and discuss future\ndirections for this field of research.\n","authors":["Jiahui Geng","Zongxiong Chen","Yuandou Wang","Herbert Woisetschlaeger","Sonja Schimmler","Ruben Mayer","Zhiming Zhao","Chunming Rong"],"pdf_url":"https://arxiv.org/pdf/2305.01975v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12828v1","updated":"2023-08-24T14:37:55Z","published":"2023-08-24T14:37:55Z","title":"Short Run Transit Route Planning Decision Support System Using a Deep\n Learning-Based Weighted Graph","summary":" Public transport routing plays a crucial role in transit network design,\nensuring a satisfactory level of service for passengers. However, current\nrouting solutions rely on traditional operational research heuristics, which\ncan be time-consuming to implement and lack the ability to provide quick\nsolutions. Here, we propose a novel deep learning-based methodology for a\ndecision support system that enables public transport (PT) planners to identify\nshort-term route improvements rapidly. By seamlessly adjusting specific\nsections of routes between two stops during specific times of the day, our\nmethod effectively reduces times and enhances PT services. Leveraging diverse\ndata sources such as GTFS and smart card data, we extract features and model\nthe transportation network as a directed graph. Using self-supervision, we\ntrain a deep learning model for predicting lateness values for road segments.\n These lateness values are then utilized as edge weights in the transportation\ngraph, enabling efficient path searching. Through evaluating the method on Tel\nAviv, we are able to reduce times on more than 9\\% of the routes. The improved\nroutes included both intraurban and suburban routes showcasing a fact\nhighlighting the model's versatility. The findings emphasize the potential of\nour data-driven decision support system to enhance public transport and city\nlogistics, promoting greater efficiency and reliability in PT services.\n","authors":["Nadav Shalit","Michael Fire","Dima Kagan","Eran Ben-Elia"],"pdf_url":"https://arxiv.org/pdf/2308.12828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12820v1","updated":"2023-08-24T14:24:04Z","published":"2023-08-24T14:24:04Z","title":"Prediction without Preclusion: Recourse Verification with Reachable Sets","summary":" Machine learning models are often used to decide who will receive a loan, a\njob interview, or a public benefit. Standard techniques to build these models\nuse features about people but overlook their actionability. In turn, models can\nassign predictions that are fixed, meaning that consumers who are denied loans,\ninterviews, or benefits may be permanently locked out from access to credit,\nemployment, or assistance. In this work, we introduce a formal testing\nprocedure to flag models that assign fixed predictions that we call recourse\nverification. We develop machinery to reliably determine if a given model can\nprovide recourse to its decision subjects from a set of user-specified\nactionability constraints. We demonstrate how our tools can ensure recourse and\nadversarial robustness in real-world datasets and use them to study the\ninfeasibility of recourse in real-world lending datasets. Our results highlight\nhow models can inadvertently assign fixed predictions that permanently bar\naccess, and we provide tools to design algorithms that account for\nactionability when developing models.\n","authors":["Avni Kothari","Bogdan Kulynych","Tsui-Wei Weng","Berk Ustun"],"pdf_url":"https://arxiv.org/pdf/2308.12820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.06228v2","updated":"2023-08-24T14:22:53Z","published":"2022-08-12T11:41:56Z","title":"Unifying Gradients to Improve Real-world Robustness for Deep Networks","summary":" The wide application of deep neural networks (DNNs) demands an increasing\namount of attention to their real-world robustness, i.e., whether a DNN resists\nblack-box adversarial attacks, among which score-based query attacks (SQAs) are\nmost threatening since they can effectively hurt a victim network with the only\naccess to model outputs. Defending against SQAs requires a slight but artful\nvariation of outputs due to the service purpose for users, who share the same\noutput information with SQAs. In this paper, we propose a real-world defense by\nUnifying Gradients (UniG) of different data so that SQAs could only probe a\nmuch weaker attack direction that is similar for different samples. Since such\nuniversal attack perturbations have been validated as less aggressive than the\ninput-specific perturbations, UniG protects real-world DNNs by indicating\nattackers a twisted and less informative attack direction. We implement UniG\nefficiently by a Hadamard product module which is plug-and-play. According to\nextensive experiments on 5 SQAs, 2 adaptive attacks and 7 defense baselines,\nUniG significantly improves real-world robustness without hurting clean\naccuracy on CIFAR10 and ImageNet. For instance, UniG maintains a model of\n77.80% accuracy under 2500-query Square attack while the state-of-the-art\nadversarially-trained model only has 67.34% on CIFAR10. Simultaneously, UniG\noutperforms all compared baselines in terms of clean accuracy and achieves the\nsmallest modification of the model output. The code is released at\nhttps://github.com/snowien/UniG-pytorch.\n","authors":["Yingwen Wu","Sizhe Chen","Kun Fang","Xiaolin Huang"],"pdf_url":"https://arxiv.org/pdf/2208.06228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06886v2","updated":"2023-08-24T14:22:44Z","published":"2023-07-13T16:39:01Z","title":"Min-Max Optimization under Delays","summary":" Delays and asynchrony are inevitable in large-scale machine-learning problems\nwhere communication plays a key role. As such, several works have extensively\nanalyzed stochastic optimization with delayed gradients. However, as far as we\nare aware, no analogous theory is available for min-max optimization, a topic\nthat has gained recent popularity due to applications in adversarial\nrobustness, game theory, and reinforcement learning. Motivated by this gap, we\nexamine the performance of standard min-max optimization algorithms with\ndelayed gradient updates. First, we show (empirically) that even small delays\ncan cause prominent algorithms like Extra-gradient (\\texttt{EG}) to diverge on\nsimple instances for which \\texttt{EG} guarantees convergence in the absence of\ndelays. Our empirical study thus suggests the need for a careful analysis of\ndelayed versions of min-max optimization algorithms. Accordingly, under\nsuitable technical assumptions, we prove that Gradient Descent-Ascent\n(\\texttt{GDA}) and \\texttt{EG} with delayed updates continue to guarantee\nconvergence to saddle points for convex-concave and strongly convex-strongly\nconcave settings. Our complexity bounds reveal, in a transparent manner, the\nslow-down in convergence caused by delays.\n","authors":["Arman Adibi","Aritra Mitra","Hamed Hassani"],"pdf_url":"https://arxiv.org/pdf/2307.06886v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12126v2","updated":"2023-08-24T14:16:35Z","published":"2023-08-23T13:32:31Z","title":"An Accelerated Block Proximal Framework with Adaptive Momentum for\n Nonconvex and Nonsmooth Optimization","summary":" We propose an accelerated block proximal linear framework with adaptive\nmomentum (ABPL$^+$) for nonconvex and nonsmooth optimization. We analyze the\npotential causes of the extrapolation step failing in some algorithms, and\nresolve this issue by enhancing the comparison process that evaluates the\ntrade-off between the proximal gradient step and the linear extrapolation step\nin our algorithm. Furthermore, we extends our algorithm to any scenario\ninvolving updating block variables with positive integers, allowing each cycle\nto randomly shuffle the update order of the variable blocks. Additionally,\nunder mild assumptions, we prove that ABPL$^+$ can monotonically decrease the\nfunction value without strictly restricting the extrapolation parameters and\nstep size, demonstrates the viability and effectiveness of updating these\nblocks in a random order, and we also more obviously and intuitively\ndemonstrate that the derivative set of the sequence generated by our algorithm\nis a critical point set. Moreover, we demonstrate the global convergence as\nwell as the linear and sublinear convergence rates of our algorithm by\nutilizing the Kurdyka-Lojasiewicz (K{\\L}) condition. To enhance the\neffectiveness and flexibility of our algorithm, we also expand the study to the\nimprecise version of our algorithm and construct an adaptive extrapolation\nparameter strategy, which improving its overall performance. We apply our\nalgorithm to multiple non-negative matrix factorization with the $\\ell_0$ norm,\nnonnegative tensor decomposition with the $\\ell_0$ norm, and perform extensive\nnumerical experiments to validate its effectiveness and efficiency.\n","authors":["Weifeng Yang","Wenwen Min"],"pdf_url":"https://arxiv.org/pdf/2308.12126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.15596v2","updated":"2023-08-24T14:00:18Z","published":"2022-09-30T17:19:40Z","title":"Individual Privacy Accounting with Gaussian Differential Privacy","summary":" Individual privacy accounting enables bounding differential privacy (DP) loss\nindividually for each participant involved in the analysis. This can be\ninformative as often the individual privacy losses are considerably smaller\nthan those indicated by the DP bounds that are based on considering worst-case\nbounds at each data access. In order to account for the individual privacy\nlosses in a principled manner, we need a privacy accountant for adaptive\ncompositions of randomised mechanisms, where the loss incurred at a given data\naccess is allowed to be smaller than the worst-case loss. This kind of analysis\nhas been carried out for the R\\'enyi differential privacy (RDP) by Feldman and\nZrnic (2021), however not yet for the so-called optimal privacy accountants. We\nmake first steps in this direction by providing a careful analysis using the\nGaussian differential privacy which gives optimal bounds for the Gaussian\nmechanism, one of the most versatile DP mechanisms. This approach is based on\ndetermining a certain supermartingale for the hockey-stick divergence and on\nextending the R\\'enyi divergence-based fully adaptive composition results by\nFeldman and Zrnic. We also consider measuring the individual\n$(\\varepsilon,\\delta)$-privacy losses using the so-called privacy loss\ndistributions. With the help of the Blackwell theorem, we can then make use of\nthe RDP analysis to construct an approximative individual\n$(\\varepsilon,\\delta)$-accountant.\n","authors":["Antti Koskela","Marlon Tobaben","Antti Honkela"],"pdf_url":"https://arxiv.org/pdf/2209.15596v2.pdf","comment":"31 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.12794v1","updated":"2023-08-24T13:49:48Z","published":"2023-08-24T13:49:48Z","title":"Job Shop Scheduling Benchmark: Environments and Instances for Learning\n and Non-learning Methods","summary":" We introduce an open-source GitHub repository containing comprehensive\nbenchmarks for a wide range of machine scheduling problems, including Job Shop\nScheduling (JSP), Flow Shop Scheduling (FSP), Flexible Job Shop Scheduling\n(FJSP), FJSP with Assembly constraints (FAJSP), FJSP with Sequence-Dependent\nSetup Times (FJSP-SDST), and the online FJSP (with online job arrivals). Our\nprimary goal is to provide a centralized hub for researchers, practitioners,\nand enthusiasts interested in tackling machine scheduling challenges.\n","authors":["Robbert Reijnen","Kjell van Straaten","Zaharah Bukhsh","Yingqian Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.12794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.10634v2","updated":"2023-08-24T13:46:05Z","published":"2022-09-21T20:03:58Z","title":"Interneurons accelerate learning dynamics in recurrent neural networks\n for statistical adaptation","summary":" Early sensory systems in the brain rapidly adapt to fluctuating input\nstatistics, which requires recurrent communication between neurons.\nMechanistically, such recurrent communication is often indirect and mediated by\nlocal interneurons. In this work, we explore the computational benefits of\nmediating recurrent communication via interneurons compared with direct\nrecurrent connections. To this end, we consider two mathematically tractable\nrecurrent linear neural networks that statistically whiten their inputs -- one\nwith direct recurrent connections and the other with interneurons that mediate\nrecurrent communication. By analyzing the corresponding continuous synaptic\ndynamics and numerically simulating the networks, we show that the network with\ninterneurons is more robust to initialization than the network with direct\nrecurrent connections in the sense that the convergence time for the synaptic\ndynamics in the network with interneurons (resp. direct recurrent connections)\nscales logarithmically (resp. linearly) with the spectrum of their\ninitialization. Our results suggest that interneurons are computationally\nuseful for rapid adaptation to changing input statistics. Interestingly, the\nnetwork with interneurons is an overparameterized solution of the whitening\nobjective for the network with direct recurrent connections, so our results can\nbe viewed as a recurrent linear neural network analogue of the implicit\nacceleration phenomenon observed in overparameterized feedforward linear neural\nnetworks.\n","authors":["David Lipshutz","Cengiz Pehlevan","Dmitri B. Chklovskii"],"pdf_url":"https://arxiv.org/pdf/2209.10634v2.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.12785v1","updated":"2023-08-24T13:40:36Z","published":"2023-08-24T13:40:36Z","title":"Single-shot Bayesian approximation for neural networks","summary":" Deep neural networks (NNs) are known for their high-prediction performances.\nHowever, NNs are prone to yield unreliable predictions when encountering\ncompletely new situations without indicating their uncertainty. Bayesian\nvariants of NNs (BNNs), such as Monte Carlo (MC) dropout BNNs, do provide\nuncertainty measures and simultaneously increase the prediction performance.\nThe only disadvantage of BNNs is their higher computation time during test time\nbecause they rely on a sampling approach. Here we present a single-shot MC\ndropout approximation that preserves the advantages of BNNs while being as fast\nas NNs. Our approach is based on moment propagation (MP) and allows to\nanalytically approximate the expected value and the variance of the MC dropout\nsignal for commonly used layers in NNs, i.e. convolution, max pooling, dense,\nsoftmax, and dropout layers. The MP approach can convert an NN into a BNN\nwithout re-training given the NN has been trained with standard dropout. We\nevaluate our approach on different benchmark datasets and a simulated toy\nexample in a classification and regression setting. We demonstrate that our\nsingle-shot MC dropout approximation resembles the point estimate and the\nuncertainty estimate of the predictive distribution that is achieved with an MC\napproach, while being fast enough for real-time deployments of BNNs. We show\nthat using part of the saved time to combine our MP approach with deep ensemble\ntechniques does further improve the uncertainty measures.\n","authors":["Kai Brach","Beate Sick","Oliver Dürr"],"pdf_url":"https://arxiv.org/pdf/2308.12785v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2007.03293"},{"id":"http://arxiv.org/abs/2306.04504v3","updated":"2023-08-24T13:39:17Z","published":"2023-06-07T15:11:26Z","title":"Evaluation of ChatGPT on Biomedical Tasks: A Zero-Shot Comparison with\n Fine-Tuned Generative Transformers","summary":" ChatGPT is a large language model developed by OpenAI. Despite its impressive\nperformance across various tasks, no prior work has investigated its capability\nin the biomedical domain yet. To this end, this paper aims to evaluate the\nperformance of ChatGPT on various benchmark biomedical tasks, such as relation\nextraction, document classification, question answering, and summarization. To\nthe best of our knowledge, this is the first work that conducts an extensive\nevaluation of ChatGPT in the biomedical domain. Interestingly, we find based on\nour evaluation that in biomedical datasets that have smaller training sets,\nzero-shot ChatGPT even outperforms the state-of-the-art fine-tuned generative\ntransformer models, such as BioGPT and BioBART. This suggests that ChatGPT's\npre-training on large text corpora makes it quite specialized even in the\nbiomedical domain. Our findings demonstrate that ChatGPT has the potential to\nbe a valuable tool for various tasks in the biomedical domain that lack large\nannotated data.\n","authors":["Israt Jahan","Md Tahmid Rahman Laskar","Chun Peng","Jimmy Huang"],"pdf_url":"https://arxiv.org/pdf/2306.04504v3.pdf","comment":"Accepted by BioNLP@ACL 2023"},{"id":"http://arxiv.org/abs/2303.00028v4","updated":"2023-08-24T13:33:50Z","published":"2023-02-28T19:10:12Z","title":"Efficient Sensor Placement from Regression with Sparse Gaussian\n Processes in Continuous and Discrete Spaces","summary":" The sensor placement problem is a common problem that arises when monitoring\ncorrelated phenomena, such as temperature and precipitation. Existing\napproaches to this problem typically use discrete optimization methods, which\nare computationally expensive and cannot scale to large problems. We address\nthe sensor placement problem in correlated environments by reducing it to a\nregression problem that can be efficiently solved using sparse Gaussian\nprocesses (SGPs). Our approach can handle both discrete sensor placement\nproblems-where sensors are limited to a subset of a given set of locations-and\ncontinuous sensor placement problems-where sensors can be placed anywhere in a\nbounded continuous region. We further generalize our approach to handle sensors\nwith a non-point field of view and integrated observations. Our experimental\nresults on three real-world datasets show that our approach generates sensor\nplacements that result in reconstruction quality that is consistently on par or\nbetter than the prior state-of-the-art approach while being significantly\nfaster. Our computationally efficient approach enables both large-scale sensor\nplacement and fast robotic sensor placement for informative path planning\nalgorithms.\n","authors":["Kalvik Jakkala","Srinivas Akella"],"pdf_url":"https://arxiv.org/pdf/2303.00028v4.pdf","comment":"10 pages, 4 figures, preprint, appendix"},{"id":"http://arxiv.org/abs/2302.00747v3","updated":"2023-08-24T13:27:08Z","published":"2023-02-01T20:47:58Z","title":"Universal Soldier: Using Universal Adversarial Perturbations for\n Detecting Backdoor Attacks","summary":" Deep learning models achieve excellent performance in numerous machine\nlearning tasks. Yet, they suffer from security-related issues such as\nadversarial examples and poisoning (backdoor) attacks. A deep learning model\nmay be poisoned by training with backdoored data or by modifying inner network\nparameters. Then, a backdoored model performs as expected when receiving a\nclean input, but it misclassifies when receiving a backdoored input stamped\nwith a pre-designed pattern called \"trigger\". Unfortunately, it is difficult to\ndistinguish between clean and backdoored models without prior knowledge of the\ntrigger. This paper proposes a backdoor detection method by utilizing a special\ntype of adversarial attack, universal adversarial perturbation (UAP), and its\nsimilarities with a backdoor trigger. We observe an intuitive phenomenon: UAPs\ngenerated from backdoored models need fewer perturbations to mislead the model\nthan UAPs from clean models. UAPs of backdoored models tend to exploit the\nshortcut from all classes to the target class, built by the backdoor trigger.\nWe propose a novel method called Universal Soldier for Backdoor detection (USB)\nand reverse engineering potential backdoor triggers via UAPs. Experiments on\n345 models trained on several datasets show that USB effectively detects the\ninjected backdoor and provides comparable or better results than\nstate-of-the-art methods.\n","authors":["Xiaoyun Xu","Oguzhan Ersoy","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2302.00747v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11787v2","updated":"2023-08-24T13:24:45Z","published":"2023-08-22T20:59:21Z","title":"HypBO: Expert-Guided Chemist-in-the-Loop Bayesian Search for New\n Materials","summary":" Robotics and automation offer massive accelerations for solving intractable,\nmultivariate scientific problems such as materials discovery, but the available\nsearch spaces can be dauntingly large. Bayesian optimization (BO) has emerged\nas a popular sample-efficient optimization engine, thriving in tasks where no\nanalytic form of the target function/property is known. Here we exploit expert\nhuman knowledge in the form of hypotheses to direct Bayesian searches more\nquickly to promising regions of chemical space. Previous methods have used\nunderlying distributions derived from existing experimental measurements, which\nis unfeasible for new, unexplored scientific tasks. Also, such distributions\ncannot capture intricate hypotheses. Our proposed method, which we call HypBO,\nuses expert human hypotheses to generate an improved seed of samples.\nUnpromising seeds are automatically discounted, while promising seeds are used\nto augment the surrogate model data, thus achieving better-informed sampling.\nThis process continues in a global versus local search fashion, organized in a\nbilevel optimization framework. We validate the performance of our method on a\nrange of synthetic functions and demonstrate its practical utility on a real\nchemical design task where the use of expert hypotheses accelerates the search\nperformance significantly.\n","authors":["Abdoulatif Cisse","Xenophon Evangelopoulos","Sam Carruthers","Vladimir V. Gusev","Andrew I. Cooper"],"pdf_url":"https://arxiv.org/pdf/2308.11787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12772v1","updated":"2023-08-24T13:21:25Z","published":"2023-08-24T13:21:25Z","title":"Intentionally-underestimated Value Function at Terminal State for\n Temporal-difference Learning with Mis-designed Reward","summary":" Robot control using reinforcement learning has become popular, but its\nlearning process generally terminates halfway through an episode for safety and\ntime-saving reasons. This study addresses the problem of the most popular\nexception handling that temporal-difference (TD) learning performs at such\ntermination. That is, by forcibly assuming zero value after termination,\nunintentionally implicit underestimation or overestimation occurs, depending on\nthe reward design in the normal states. When the episode is terminated due to\ntask failure, the failure may be highly valued with the unintentional\noverestimation, and the wrong policy may be acquired. Although this problem can\nbe avoided by paying attention to the reward design, it is essential in\npractical use of TD learning to review the exception handling at termination.\nThis paper therefore proposes a method to intentionally underestimate the value\nafter termination to avoid learning failures due to the unintentional\noverestimation. In addition, the degree of underestimation is adjusted\naccording to the degree of stationarity at termination, thereby preventing\nexcessive exploration due to the intentional underestimation. Simulations and\nreal robot experiments showed that the proposed method can stably obtain the\noptimal policies for various tasks and reward designs.\nhttps://youtu.be/AxXr8uFOe7M\n","authors":["Taisuke Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2308.12772v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.12767v1","updated":"2023-08-24T13:14:49Z","published":"2023-08-24T13:14:49Z","title":"On the Consistency of Average Embeddings for Item Recommendation","summary":" A prevalent practice in recommender systems consists of averaging item\nembeddings to represent users or higher-level concepts in the same embedding\nspace. This paper investigates the relevance of such a practice. For this\npurpose, we propose an expected precision score, designed to measure the\nconsistency of an average embedding relative to the items used for its\nconstruction. We subsequently analyze the mathematical expression of this score\nin a theoretical setting with specific assumptions, as well as its empirical\nbehavior on real-world data from music streaming services. Our results\nemphasize that real-world averages are less consistent for recommendation,\nwhich paves the way for future research to better align real-world embeddings\nwith assumptions from our theoretical setting.\n","authors":["Walid Bendada","Guillaume Salha-Galvan","Romain Hennequin","Thomas Bouabça","Tristan Cazenave"],"pdf_url":"https://arxiv.org/pdf/2308.12767v1.pdf","comment":"17th ACM Conference on Recommender Systems (RecSys 2023)"},{"id":"http://arxiv.org/abs/2308.12761v1","updated":"2023-08-24T13:08:02Z","published":"2023-08-24T13:08:02Z","title":"IP-UNet: Intensity Projection UNet Architecture for 3D Medical Volume\n Segmentation","summary":" CNNs have been widely applied for medical image analysis. However, limited\nmemory capacity is one of the most common drawbacks of processing\nhigh-resolution 3D volumetric data. 3D volumes are usually cropped or downsized\nfirst before processing, which can result in a loss of resolution, increase\nclass imbalance, and affect the performance of the segmentation algorithms. In\nthis paper, we propose an end-to-end deep learning approach called IP-UNet.\nIP-UNet is a UNet-based model that performs multi-class segmentation on\nIntensity Projection (IP) of 3D volumetric data instead of the memory-consuming\n3D volumes. IP-UNet uses limited memory capability for training without losing\nthe original 3D image resolution. We compare the performance of three models in\nterms of segmentation accuracy and computational cost: 1) Slice-by-slice 2D\nsegmentation of the CT scan images using a conventional 2D UNet model. 2)\nIP-UNet that operates on data obtained by merging the extracted Maximum\nIntensity Projection (MIP), Closest Vessel Projection (CVP), and Average\nIntensity Projection (AvgIP) representations of the source 3D volumes, then\napplying the UNet model on the output IP images. 3) 3D-UNet model directly\nreads the 3D volumes constructed from a series of CT scan images and outputs\nthe 3D volume of the predicted segmentation. We test the performance of these\nmethods on 3D volumetric images for automatic breast calcification detection.\nExperimental results show that IP-Unet can achieve similar segmentation\naccuracy with 3D-Unet but with much better performance. It reduces the training\ntime by 70\\% and memory consumption by 92\\%.\n","authors":["Nyothiri Aung","Tahar Kechadi","Liming Chen","Sahraoui Dhelim"],"pdf_url":"https://arxiv.org/pdf/2308.12761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07557v2","updated":"2023-08-24T13:07:48Z","published":"2023-02-15T09:51:56Z","title":"On the Generalization of PINNs outside the training domain and the\n Hyperparameters influencing it","summary":" Physics-Informed Neural Networks (PINNs) are Neural Network architectures\ntrained to emulate solutions of differential equations without the necessity of\nsolution data. They are currently ubiquitous in the scientific literature due\nto their flexible and promising settings. However, very little of the available\nresearch provides practical studies that aim for a better quantitative\nunderstanding of such architecture and its functioning. In this paper, we\nperform an empirical analysis of the behavior of PINN predictions outside their\ntraining domain. The primary goal is to investigate the scenarios in which a\nPINN can provide consistent predictions outside the training area.\nThereinafter, we assess whether the algorithmic setup of PINNs can influence\ntheir potential for generalization and showcase the respective effect on the\nprediction. The results obtained in this study returns insightful and at times\ncounterintuitive perspectives which can be highly relevant for architectures\nwhich combines PINNs with domain decomposition and/or adaptive training\nstrategies.\n","authors":["Andrea Bonfanti","Roberto Santana","Marco Ellero","Babak Gholami"],"pdf_url":"https://arxiv.org/pdf/2302.07557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.11925v3","updated":"2023-08-24T12:58:14Z","published":"2020-10-22T17:55:26Z","title":"The Polynomial Method is Universal for Distribution-Free Correlational\n SQ Learning","summary":" We consider the problem of distribution-free learning for Boolean function\nclasses in the PAC and agnostic models. Generalizing a beautiful work of Malach\nand Shalev-Shwartz (2022) that gave tight correlational SQ (CSQ) lower bounds\nfor learning DNF formulas, we give new proofs that lower bounds on the\nthreshold or approximate degree of any function class directly imply CSQ lower\nbounds for PAC or agnostic learning respectively. While such bounds implicitly\nfollow by combining prior results by Feldman (2008, 2012) and Sherstov (2008,\n2011), to our knowledge the precise statements we give had not appeared in this\nform before. Moreover, our proofs are simple and largely self-contained.\n These lower bounds match corresponding positive results using upper bounds on\nthe threshold or approximate degree in the SQ model for PAC or agnostic\nlearning, and in this sense these results show that the polynomial method is a\nuniversal, best-possible approach for distribution-free CSQ learning.\n","authors":["Aravind Gollakota","Sushrut Karmalkar","Adam Klivans"],"pdf_url":"https://arxiv.org/pdf/2010.11925v3.pdf","comment":"v3: Improved discussion of relation to prior work"},{"id":"http://arxiv.org/abs/2308.12751v1","updated":"2023-08-24T12:56:39Z","published":"2023-08-24T12:56:39Z","title":"Motion In-Betweening with Phase Manifolds","summary":" This paper introduces a novel data-driven motion in-betweening system to\nreach target poses of characters by making use of phases variables learned by a\nPeriodic Autoencoder. Our approach utilizes a mixture-of-experts neural network\nmodel, in which the phases cluster movements in both space and time with\ndifferent expert weights. Each generated set of weights then produces a\nsequence of poses in an autoregressive manner between the current and target\nstate of the character. In addition, to satisfy poses which are manually\nmodified by the animators or where certain end effectors serve as constraints\nto be reached by the animation, a learned bi-directional control scheme is\nimplemented to satisfy such constraints. The results demonstrate that using\nphases for motion in-betweening tasks sharpen the interpolated movements, and\nfurthermore stabilizes the learning process. Moreover, using phases for motion\nin-betweening tasks can also synthesize more challenging movements beyond\nlocomotion behaviors. Additionally, style control is enabled between given\ntarget keyframes. Our proposed framework can compete with popular\nstate-of-the-art methods for motion in-betweening in terms of motion quality\nand generalization, especially in the existence of long transition durations.\nOur framework contributes to faster prototyping workflows for creating animated\ncharacter sequences, which is of enormous interest for the game and film\nindustry.\n","authors":["Paul Starke","Sebastian Starke","Taku Komura","Frank Steinicke"],"pdf_url":"https://arxiv.org/pdf/2308.12751v1.pdf","comment":"17 pages, 11 figures, conference"},{"id":"http://arxiv.org/abs/2207.09755v2","updated":"2023-08-24T12:44:57Z","published":"2022-07-20T08:57:53Z","title":"A temporally and spatially local spike-based backpropagation algorithm\n to enable training in hardware","summary":" Spiking Neural Networks (SNNs) have emerged as a hardware efficient\narchitecture for classification tasks. The challenge of spike-based encoding\nhas been the lack of a universal training mechanism performed entirely using\nspikes. There have been several attempts to adopt the powerful backpropagation\n(BP) technique used in non-spiking artificial neural networks (ANN): (1) SNNs\ncan be trained by externally computed numerical gradients. (2) A major\nadvancement towards native spike-based learning has been the use of approximate\nBackpropagation using spike-time dependent plasticity (STDP) with phased\nforward/backward passes. However, the transfer of information between such\nphases for gradient and weight update calculation necessitates external memory\nand computational access. This is a challenge for standard neuromorphic\nhardware implementations. In this paper, we propose a stochastic SNN based\nBack-Prop (SSNN-BP) algorithm that utilizes a composite neuron to\nsimultaneously compute the forward pass activations and backward pass gradients\nexplicitly with spikes. Although signed gradient values are a challenge for\nspike-based representation, we tackle this by splitting the gradient signal\ninto positive and negative streams. We show that our method approaches BP ANN\nbaseline with sufficiently long spike-trains. Finally, we show that the\nwell-performing softmax cross-entropy loss function can be implemented through\ninhibitory lateral connections enforcing a Winner Take All (WTA) rule. Our SNN\nwith a 2-layer network shows excellent generalization through comparable\nperformance to ANNs with equivalent architecture and regularization parameters\non static image datasets like MNIST, Fashion-MNIST, Extended MNIST, and\ntemporally encoded image datasets like Neuromorphic MNIST datasets. Thus,\nSSNN-BP enables BP compatible with purely spike-based neuromorphic hardware.\n","authors":["Anmol Biswas","Vivek Saraswat","Udayan Ganguly"],"pdf_url":"https://arxiv.org/pdf/2207.09755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12740v1","updated":"2023-08-24T12:42:00Z","published":"2023-08-24T12:42:00Z","title":"Human Comprehensible Active Learning of Genome-Scale Metabolic Networks","summary":" An important application of Synthetic Biology is the engineering of the host\ncell system to yield useful products. However, an increase in the scale of the\nhost system leads to huge design space and requires a large number of\nvalidation trials with high experimental costs. A comprehensible machine\nlearning approach that efficiently explores the hypothesis space and guides\nexperimental design is urgently needed for the Design-Build-Test-Learn (DBTL)\ncycle of the host cell system. We introduce a novel machine learning framework\nILP-iML1515 based on Inductive Logic Programming (ILP) that performs abductive\nlogical reasoning and actively learns from training examples. In contrast to\nnumerical models, ILP-iML1515 is built on comprehensible logical\nrepresentations of a genome-scale metabolic model and can update the model by\nlearning new logical structures from auxotrophic mutant trials. The ILP-iML1515\nframework 1) allows high-throughput simulations and 2) actively selects\nexperiments that reduce the experimental cost of learning gene functions in\ncomparison to randomly selected experiments.\n","authors":["Lun Ai","Shi-Shun Liang","Wang-Zhou Dai","Liam Hallett","Stephen H. Muggleton","Geoff S. Baldwin"],"pdf_url":"https://arxiv.org/pdf/2308.12740v1.pdf","comment":"Invited presentation for AAAI Spring Symposium Series 2023 on\n Computational Scientific Discovery"},{"id":"http://arxiv.org/abs/2308.12734v1","updated":"2023-08-24T12:26:15Z","published":"2023-08-24T12:26:15Z","title":"Real-time Detection of AI-Generated Speech for DeepFake Voice Conversion","summary":" There are growing implications surrounding generative AI in the speech domain\nthat enable voice cloning and real-time voice conversion from one individual to\nanother. This technology poses a significant ethical threat and could lead to\nbreaches of privacy and misrepresentation, thus there is an urgent need for\nreal-time detection of AI-generated speech for DeepFake Voice Conversion. To\naddress the above emerging issues, the DEEP-VOICE dataset is generated in this\nstudy, comprised of real human speech from eight well-known figures and their\nspeech converted to one another using Retrieval-based Voice Conversion.\nPresenting as a binary classification problem of whether the speech is real or\nAI-generated, statistical analysis of temporal audio features through t-testing\nreveals that there are significantly different distributions. Hyperparameter\noptimisation is implemented for machine learning models to identify the source\nof speech. Following the training of 208 individual machine learning models\nover 10-fold cross validation, it is found that the Extreme Gradient Boosting\nmodel can achieve an average classification accuracy of 99.3% and can classify\nspeech in real-time, at around 0.004 milliseconds given one second of speech.\nAll data generated for this study is released publicly for future research on\nAI speech detection.\n","authors":["Jordan J. Bird","Ahmad Lotfi"],"pdf_url":"https://arxiv.org/pdf/2308.12734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06534v2","updated":"2023-08-24T12:26:06Z","published":"2023-08-12T11:31:01Z","title":"Dealing with Small Datasets for Deep Learning in Medical Imaging: An\n Evaluation of Self-Supervised Pre-Training on CT Scans Comparing Contrastive\n and Masked Autoencoder Methods for Convolutional Models","summary":" Deep learning in medical imaging has the potential to minimize the risk of\ndiagnostic errors, reduce radiologist workload, and accelerate diagnosis.\nTraining such deep learning models requires large and accurate datasets, with\nannotations for all training samples. However, in the medical imaging domain,\nannotated datasets for specific tasks are often small due to the high\ncomplexity of annotations, limited access, or the rarity of diseases. To\naddress this challenge, deep learning models can be pre-trained on large image\ndatasets without annotations using methods from the field of self-supervised\nlearning. After pre-training, small annotated datasets are sufficient to\nfine-tune the models for a specific task. The most popular self-supervised\npre-training approaches in medical imaging are based on contrastive learning.\nHowever, recent studies in natural image processing indicate a strong potential\nfor masked autoencoder approaches. Our work compares state-of-the-art\ncontrastive learning methods with the recently introduced masked autoencoder\napproach \"SparK\" for convolutional neural networks (CNNs) on medical images.\nTherefore we pre-train on a large unannotated CT image dataset and fine-tune on\nseveral CT classification tasks. Due to the challenge of obtaining sufficient\nannotated training data in medical imaging, it is of particular interest to\nevaluate how the self-supervised pre-training methods perform when fine-tuning\non small datasets. By experimenting with gradually reducing the training\ndataset size for fine-tuning, we find that the reduction has different effects\ndepending on the type of pre-training chosen. The SparK pre-training method is\nmore robust to the training dataset size than the contrastive methods. Based on\nour results, we propose the SparK pre-training for medical imaging tasks with\nonly small annotated datasets.\n","authors":["Daniel Wolf","Tristan Payer","Catharina Silvia Lisson","Christoph Gerhard Lisson","Meinrad Beer","Timo Ropinski","Michael Götz"],"pdf_url":"https://arxiv.org/pdf/2308.06534v2.pdf","comment":"This paper is under review. The code will be released if accepted"},{"id":"http://arxiv.org/abs/2308.12729v1","updated":"2023-08-24T12:08:07Z","published":"2023-08-24T12:08:07Z","title":"Out of the Box Thinking: Improving Customer Lifetime Value Modelling via\n Expert Routing and Game Whale Detection","summary":" Customer lifetime value (LTV) prediction is essential for mobile game\npublishers trying to optimize the advertising investment for each user\nacquisition based on the estimated worth. In mobile games, deploying\nmicrotransactions is a simple yet effective monetization strategy, which\nattracts a tiny group of game whales who splurge on in-game purchases. The\npresence of such game whales may impede the practicality of existing LTV\nprediction models, since game whales' purchase behaviours always exhibit varied\ndistribution from general users. Consequently, identifying game whales can open\nup new opportunities to improve the accuracy of LTV prediction models. However,\nlittle attention has been paid to applying game whale detection in LTV\nprediction, and existing works are mainly specialized for the long-term LTV\nprediction with the assumption that the high-quality user features are\navailable, which is not applicable in the UA stage. In this paper, we propose\nExpLTV, a novel multi-task framework to perform LTV prediction and game whale\ndetection in a unified way. In ExpLTV, we first innovatively design a deep\nneural network-based game whale detector that can not only infer the intrinsic\norder in accordance with monetary value, but also precisely identify high\nspenders (i.e., game whales) and low spenders. Then, by treating the game whale\ndetector as a gating network to decide the different mixture patterns of LTV\nexperts assembling, we can thoroughly leverage the shared information and\nscenario-specific information (i.e., game whales modelling and low spenders\nmodelling). Finally, instead of separately designing a purchase rate estimator\nfor two tasks, we design a shared estimator that can preserve the inner task\nrelationships. The superiority of ExpLTV is further validated via extensive\nexperiments on three industrial datasets.\n","authors":["Shijie Zhang","Xin Yan","Xuejiao Yang","Binfeng Jia","Shuangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12726v1","updated":"2023-08-24T12:05:46Z","published":"2023-08-24T12:05:46Z","title":"Continuous Reinforcement Learning-based Dynamic Difficulty Adjustment in\n a Visual Working Memory Game","summary":" Dynamic Difficulty Adjustment (DDA) is a viable approach to enhance a\nplayer's experience in video games. Recently, Reinforcement Learning (RL)\nmethods have been employed for DDA in non-competitive games; nevertheless, they\nrely solely on discrete state-action space with a small search space. In this\npaper, we propose a continuous RL-based DDA methodology for a visual working\nmemory (VWM) game to handle the complex search space for the difficulty of\nmemorization. The proposed RL-based DDA tailors game difficulty based on the\nplayer's score and game difficulty in the last trial. We defined a continuous\nmetric for the difficulty of memorization. Then, we consider the task\ndifficulty and the vector of difficulty-score as the RL's action and state,\nrespectively. We evaluated the proposed method through a within-subject\nexperiment involving 52 subjects. The proposed approach was compared with two\nrule-based difficulty adjustment methods in terms of player's score and game\nexperience measured by a questionnaire. The proposed RL-based approach resulted\nin a significantly better game experience in terms of competence, tension, and\nnegative and positive affect. Players also achieved higher scores and win\nrates. Furthermore, the proposed RL-based DDA led to a significantly less\ndecline in the score in a 20-trial session.\n","authors":["Masoud Rahimi","Hadi Moradi","Abdol-hossein Vahabie","Hamed Kebriaei"],"pdf_url":"https://arxiv.org/pdf/2308.12726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.16331v3","updated":"2023-08-24T11:41:57Z","published":"2022-03-28T21:13:24Z","title":"FlexFringe: Modeling Software Behavior by Learning Probabilistic\n Automata","summary":" We present the efficient implementations of probabilistic deterministic\nfinite automaton learning methods available in FlexFringe. These implement\nwell-known strategies for state-merging including several modifications to\nimprove their performance in practice. We show experimentally that these\nalgorithms obtain competitive results and significant improvements over a\ndefault implementation. We also demonstrate how to use FlexFringe to learn\ninterpretable models from software logs and use these for anomaly detection.\nAlthough less interpretable, we show that learning smaller more convoluted\nmodels improves the performance of FlexFringe on anomaly detection,\noutperforming an existing solution based on neural nets.\n","authors":["Sicco Verwer","Christian Hammerschmidt"],"pdf_url":"https://arxiv.org/pdf/2203.16331v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.11723v4","updated":"2023-08-24T11:35:01Z","published":"2022-06-23T14:16:30Z","title":"Self-Supervised Training with Autoencoders for Visual Anomaly Detection","summary":" Deep autoencoders provide an effective tool for learning non-linear\ndimensionality reduction in an unsupervised way. Recently, they have been used\nfor the task of anomaly detection in the visual domain. By optimizing for the\nreconstruction error using anomaly-free examples, the common belief is that a\ncorresponding network should fail to accurately reconstruct anomalous regions\nin the application phase. This goal is typically addressed by controlling the\ncapacity of the network, either by reducing the size of the bottleneck layer or\nby enforcing sparsity constraints on the activations. However, neither of these\ntechniques does explicitly penalize reconstruction of anomalous signals often\nresulting in poor detection. We tackle this problem by adapting a\nself-supervised learning regime that allows the use of discriminative\ninformation during training but focuses on the data manifold of normal\nexamples. We emphasize that inference with our approach is very efficient\nduring training and prediction requiring a single forward pass for each input\nimage. Our experiments on the MVTec AD dataset demonstrate high detection and\nlocalization performance. On the texture-subset, in particular, our approach\nconsistently outperforms recent anomaly detection methods by a significant\nmargin.\n","authors":["Alexander Bauer","Shinichi Nakajima","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2206.11723v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12716v1","updated":"2023-08-24T11:31:24Z","published":"2023-08-24T11:31:24Z","title":"Solving Forward and Inverse Problems of Contact Mechanics using\n Physics-Informed Neural Networks","summary":" This paper explores the ability of physics-informed neural networks (PINNs)\nto solve forward and inverse problems of contact mechanics for small\ndeformation elasticity. We deploy PINNs in a mixed-variable formulation\nenhanced by output transformation to enforce Dirichlet and Neumann boundary\nconditions as hard constraints. Inequality constraints of contact problems,\nnamely Karush-Kuhn-Tucker (KKT) type conditions, are enforced as soft\nconstraints by incorporating them into the loss function during network\ntraining. To formulate the loss function contribution of KKT constraints,\nexisting approaches applied to elastoplasticity problems are investigated and\nwe explore a nonlinear complementarity problem (NCP) function, namely\nFischer-Burmeister, which possesses advantageous characteristics in terms of\noptimization. Based on the Hertzian contact problem, we show that PINNs can\nserve as pure partial differential equation (PDE) solver, as data-enhanced\nforward model, as inverse solver for parameter identification, and as\nfast-to-evaluate surrogate model. Furthermore, we demonstrate the importance of\nchoosing proper hyperparameters, e.g. loss weights, and a combination of Adam\nand L-BFGS-B optimizers aiming for better results in terms of accuracy and\ntraining time.\n","authors":["T. Sahin","M. von Danwitz","A. Popp"],"pdf_url":"https://arxiv.org/pdf/2308.12716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12696v1","updated":"2023-08-24T10:29:25Z","published":"2023-08-24T10:29:25Z","title":"Disentanglement Learning via Topology","summary":" We propose TopDis (Topological Disentanglement), a method for learning\ndisentangled representations via adding multi-scale topological loss term.\nDisentanglement is a crucial property of data representations substantial for\nthe explainability and robustness of deep learning models and a step towards\nhigh-level cognition. The state-of-the-art method based on VAE minimizes the\ntotal correlation of the joint distribution of latent variables. We take a\ndifferent perspective on disentanglement by analyzing topological properties of\ndata manifolds. In particular, we optimize the topological similarity for data\nmanifolds traversals. To the best of our knowledge, our paper is the first one\nto propose a differentiable topological loss for disentanglement. Our\nexperiments have shown that the proposed topological loss improves\ndisentanglement scores such as MIG, FactorVAE score, SAP score and DCI\ndisentanglement score with respect to state-of-the-art results. Our method\nworks in an unsupervised manner, permitting to apply it for problems without\nlabeled factors of variation. Additionally, we show how to use the proposed\ntopological loss to find disentangled directions in a trained GAN.\n","authors":["Nikita Balabin","Daria Voronkova","Ilya Trofimov","Evgeny Burnaev","Serguei Barannikov"],"pdf_url":"https://arxiv.org/pdf/2308.12696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12691v1","updated":"2023-08-24T10:20:15Z","published":"2023-08-24T10:20:15Z","title":"An Efficient Data Analysis Method for Big Data using Multiple-Model\n Linear Regression","summary":" This paper introduces a new data analysis method for big data using a newly\ndefined regression model named multiple model linear regression(MMLR), which\nseparates input datasets into subsets and construct local linear regression\nmodels of them. The proposed data analysis method is shown to be more efficient\nand flexible than other regression based methods. This paper also proposes an\napproximate algorithm to construct MMLR models based on\n$(\\epsilon,\\delta)$-estimator, and gives mathematical proofs of the correctness\nand efficiency of MMLR algorithm, of which the time complexity is linear with\nrespect to the size of input datasets. This paper also empirically implements\nthe method on both synthetic and real-world datasets, the algorithm shows to\nhave comparable performance to existing regression methods in many cases, while\nit takes almost the shortest time to provide a high prediction accuracy.\n","authors":["Bohan Lyu","Jianzhong Li"],"pdf_url":"https://arxiv.org/pdf/2308.12691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12686v1","updated":"2023-08-24T09:57:11Z","published":"2023-08-24T09:57:11Z","title":"Match-And-Deform: Time Series Domain Adaptation through Optimal\n Transport and Temporal Alignment","summary":" While large volumes of unlabeled data are usually available, associated\nlabels are often scarce. The unsupervised domain adaptation problem aims at\nexploiting labels from a source domain to classify data from a related, yet\ndifferent, target domain. When time series are at stake, new difficulties arise\nas temporal shifts may appear in addition to the standard feature distribution\nshift. In this paper, we introduce the Match-And-Deform (MAD) approach that\naims at finding correspondences between the source and target time series while\nallowing temporal distortions. The associated optimization problem\nsimultaneously aligns the series thanks to an optimal transport loss and the\ntime stamps through dynamic time warping. When embedded into a deep neural\nnetwork, MAD helps learning new representations of time series that both align\nthe domains and maximize the discriminative power of the network. Empirical\nstudies on benchmark datasets and remote sensing data demonstrate that MAD\nmakes meaningful sample-to-sample pairing and time shift estimation, reaching\nsimilar or better classification performance than state-of-the-art deep time\nseries domain adaptation strategies.\n","authors":["François Painblanc","Laetitia Chapel","Nicolas Courty","Chloé Friguet","Charlotte Pelletier","Romain Tavenard"],"pdf_url":"https://arxiv.org/pdf/2308.12686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.11418v3","updated":"2023-08-24T09:52:55Z","published":"2022-04-25T03:32:17Z","title":"Riemannian Hamiltonian methods for min-max optimization on manifolds","summary":" In this paper, we study min-max optimization problems on Riemannian\nmanifolds. We introduce a Riemannian Hamiltonian function, minimization of\nwhich serves as a proxy for solving the original min-max problems. Under the\nRiemannian Polyak--{\\L}ojasiewicz condition on the Hamiltonian function, its\nminimizer corresponds to the desired min-max saddle point. We also provide\ncases where this condition is satisfied. For geodesic-bilinear optimization in\nparticular, solving the proxy problem leads to the correct search direction\ntowards global optimality, which becomes challenging with the min-max\nformulation. To minimize the Hamiltonian function, we propose Riemannian\nHamiltonian methods (RHM) and present their convergence analyses. We extend RHM\nto include consensus regularization and to the stochastic setting. We\nillustrate the efficacy of the proposed RHM in applications such as subspace\nrobust Wasserstein distance, robust training of neural networks, and generative\nadversarial networks.\n","authors":["Andi Han","Bamdev Mishra","Pratik Jawanpuria","Pawan Kumar","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2204.11418v3.pdf","comment":"Extended version with proofs"},{"id":"http://arxiv.org/abs/2304.09355v4","updated":"2023-08-24T09:46:45Z","published":"2023-04-19T00:33:59Z","title":"To Compress or Not to Compress- Self-Supervised Learning and Information\n Theory: A Review","summary":" \\begin{abstract} Deep neural networks excel in supervised learning tasks but\nare constrained by the need for extensive labeled data. Self-supervised\nlearning emerges as a promising alternative, allowing models to learn without\nexplicit labels. Information theory, and notably the information bottleneck\nprinciple, has been pivotal in shaping deep neural networks. This principle\nfocuses on optimizing the trade-off between compression and preserving relevant\ninformation, providing a foundation for efficient network design in supervised\ncontexts. However, its precise role and adaptation in self-supervised learning\nremain unclear. In this work, we scrutinize various self-supervised learning\napproaches from an information-theoretic perspective, introducing a unified\nframework that encapsulates the self-supervised information-theoretic learning\nproblem. We weave together existing research into a cohesive narrative, delve\ninto contemporary self-supervised methodologies, and spotlight potential\nresearch avenues and inherent challenges. Additionally, we discuss the\nempirical evaluation of information-theoretic quantities and their estimation\nmethods. Overall, this paper furnishes an exhaustive review of the intersection\nof information theory, self-supervised learning, and deep neural networks.\n","authors":["Ravid Shwartz-Ziv","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2304.09355v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.14473v2","updated":"2023-08-24T09:44:29Z","published":"2022-05-28T16:17:52Z","title":"Efficient-Adam: Communication-Efficient Distributed Adam","summary":" Distributed adaptive stochastic gradient methods have been widely used for\nlarge-scale nonconvex optimization, such as training deep learning models.\nHowever, their communication complexity on finding $\\varepsilon$-stationary\npoints has rarely been analyzed in the nonconvex setting. In this work, we\npresent a novel communication-efficient distributed Adam in the\nparameter-server model for stochastic nonconvex optimization, dubbed {\\em\nEfficient-Adam}. Specifically, we incorporate a two-way quantization scheme\ninto Efficient-Adam to reduce the communication cost between the workers and\nserver. Simultaneously, we adopt a two-way error feedback strategy to reduce\nthe biases caused by the two-way quantization on both the server and workers,\nrespectively. In addition, we establish the iteration complexity for the\nproposed Efficient-Adam with a class of quantization operators, and further\ncharacterize its communication complexity between the server and workers when\nan $\\varepsilon$-stationary point is achieved. Finally, we apply Efficient-Adam\nto solve a toy stochastic convex optimization problem and train deep learning\nmodels on real-world vision and language tasks. Extensive experiments together\nwith a theoretical guarantee justify the merits of Efficient Adam.\n","authors":["Congliang Chen","Li Shen","Wei Liu","Zhi-Quan Luo"],"pdf_url":"https://arxiv.org/pdf/2205.14473v2.pdf","comment":"IEEE Transactions on Signal Processing"},{"id":"http://arxiv.org/abs/2308.12681v1","updated":"2023-08-24T09:40:37Z","published":"2023-08-24T09:40:37Z","title":"LR-XFL: Logical Reasoning-based Explainable Federated Learning","summary":" Federated learning (FL) is an emerging approach for training machine learning\nmodels collaboratively while preserving data privacy. The need for privacy\nprotection makes it difficult for FL models to achieve global transparency and\nexplainability. To address this limitation, we incorporate logic-based\nexplanations into FL by proposing the Logical Reasoning-based eXplainable\nFederated Learning (LR-XFL) approach. Under LR-XFL, FL clients create local\nlogic rules based on their local data and send them, along with model updates,\nto the FL server. The FL server connects the local logic rules through a proper\nlogical connector that is derived based on properties of client data, without\nrequiring access to the raw data. In addition, the server also aggregates the\nlocal model updates with weight values determined by the quality of the\nclients' local data as reflected by their uploaded logic rules. The results\nshow that LR-XFL outperforms the most relevant baseline by 1.19%, 5.81% and\n5.41% in terms of classification accuracy, rule accuracy and rule fidelity,\nrespectively. The explicit rule evaluation and expression under LR-XFL enable\nhuman experts to validate and correct the rules on the server side, hence\nimproving the global FL model's robustness to errors. It has the potential to\nenhance the transparency of FL models for areas like healthcare and finance\nwhere both data privacy and explainability are important.\n","authors":["Yanci Zhang","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12680v1","updated":"2023-08-24T09:39:04Z","published":"2023-08-24T09:39:04Z","title":"Master-slave Deep Architecture for Top-K Multi-armed Bandits with\n Non-linear Bandit Feedback and Diversity Constraints","summary":" We propose a novel master-slave architecture to solve the top-$K$\ncombinatorial multi-armed bandits problem with non-linear bandit feedback and\ndiversity constraints, which, to the best of our knowledge, is the first\ncombinatorial bandits setting considering diversity constraints under bandit\nfeedback. Specifically, to efficiently explore the combinatorial and\nconstrained action space, we introduce six slave models with distinguished\nmerits to generate diversified samples well balancing rewards and constraints\nas well as efficiency. Moreover, we propose teacher learning based optimization\nand the policy co-training technique to boost the performance of the multiple\nslave models. The master model then collects the elite samples provided by the\nslave models and selects the best sample estimated by a neural contextual\nUCB-based network to make a decision with a trade-off between exploration and\nexploitation. Thanks to the elaborate design of slave models, the co-training\nmechanism among slave models, and the novel interactions between the master and\nslave models, our approach significantly surpasses existing state-of-the-art\nalgorithms in both synthetic and real datasets for recommendation tasks. The\ncode is available at:\n\\url{https://github.com/huanghanchi/Master-slave-Algorithm-for-Top-K-Bandits}.\n","authors":["Hanchi Huang","Li Shen","Deheng Ye","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12680v1.pdf","comment":"IEEE Transactions on Neural Networks and Learning Systems"},{"id":"http://arxiv.org/abs/2308.12679v1","updated":"2023-08-24T09:38:54Z","published":"2023-08-24T09:38:54Z","title":"A Continual Learning Approach for Cross-Domain White Blood Cell\n Classification","summary":" Accurate classification of white blood cells in peripheral blood is essential\nfor diagnosing hematological diseases. Due to constantly evolving clinical\nsettings, data sources, and disease classifications, it is necessary to update\nmachine learning classification models regularly for practical real-world use.\nSuch models significantly benefit from sequentially learning from incoming data\nstreams without forgetting previously acquired knowledge. However, models can\nsuffer from catastrophic forgetting, causing a drop in performance on previous\ntasks when fine-tuned on new data. Here, we propose a rehearsal-based continual\nlearning approach for class incremental and domain incremental scenarios in\nwhite blood cell classification. To choose representative samples from previous\ntasks, we employ exemplar set selection based on the model's predictions. This\ninvolves selecting the most confident samples and the most challenging samples\nidentified through uncertainty estimation of the model. We thoroughly evaluated\nour proposed approach on three white blood cell classification datasets that\ndiffer in color, resolution, and class composition, including scenarios where\nnew domains or new classes are introduced to the model with every task. We also\ntest a long class incremental experiment with both new domains and new classes.\nOur results demonstrate that our approach outperforms established baselines in\ncontinual learning, including existing iCaRL and EWC methods for classifying\nwhite blood cells in cross-domain environments.\n","authors":["Ario Sadafi","Raheleh Salehi","Armin Gruber","Sayedali Shetab Boushehri","Pascal Giehr","Nassir Navab","Carsten Marr"],"pdf_url":"https://arxiv.org/pdf/2308.12679v1.pdf","comment":"Accepted for publication at workshop on Domain Adaptation and\n Representation Transfer (DART) in International Conference on Medical Image\n Computing and Computer Assisted Intervention (MICCAI 2023)"},{"id":"http://arxiv.org/abs/2308.12673v1","updated":"2023-08-24T09:31:02Z","published":"2023-08-24T09:31:02Z","title":"Masked Feature Modelling: Feature Masking for the Unsupervised\n Pre-training of a Graph Attention Network Block for Bottom-up Video Event\n Recognition","summary":" In this paper, we introduce Masked Feature Modelling (MFM), a novel approach\nfor the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM\nutilizes a pretrained Visual Tokenizer to reconstruct masked features of\nobjects within a video, leveraging the MiniKinetics dataset. We then\nincorporate the pre-trained GAT block into a state-of-the-art bottom-up\nsupervised video-event recognition architecture, ViGAT, to improve the model's\nstarting point and overall accuracy. Experimental evaluations on the YLI-MED\ndataset demonstrate the effectiveness of MFM in improving event recognition\nperformance.\n","authors":["Dimitrios Daskalakis","Nikolaos Gkalelis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2308.12673v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.12670v1","updated":"2023-08-24T09:27:38Z","published":"2023-08-24T09:27:38Z","title":"Optimal data pooling for shared learning in maintenance operations","summary":" This paper addresses the benefits of pooling data for shared learning in\nmaintenance operations. We consider a set of systems subject to Poisson\ndegradation that are coupled through an a-priori unknown rate. Decision\nproblems involving these systems are high-dimensional Markov decision processes\n(MDPs). We present a decomposition result that reduces such an MDP to\ntwo-dimensional MDPs, enabling structural analyses and computations. We\nleverage this decomposition to demonstrate that pooling data can lead to\nsignificant cost reductions compared to not pooling.\n","authors":["Collin Drent","Melvin Drent","Geert-Jan van Houtum"],"pdf_url":"https://arxiv.org/pdf/2308.12670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12666v1","updated":"2023-08-24T09:18:43Z","published":"2023-08-24T09:18:43Z","title":"Geodesic Mode Connectivity","summary":" Mode connectivity is a phenomenon where trained models are connected by a\npath of low loss. We reframe this in the context of Information Geometry, where\nneural networks are studied as spaces of parameterized distributions with\ncurved geometry. We hypothesize that shortest paths in these spaces, known as\ngeodesics, correspond to mode-connecting paths in the loss landscape. We\npropose an algorithm to approximate geodesics and demonstrate that they achieve\nmode connectivity.\n","authors":["Charlie Tan","Theodore Long","Sarah Zhao","Rudolf Laine"],"pdf_url":"https://arxiv.org/pdf/2308.12666v1.pdf","comment":"Published as a TinyPaper at ICLR 2023"},{"id":"http://arxiv.org/abs/2308.12661v1","updated":"2023-08-24T09:10:10Z","published":"2023-08-24T09:10:10Z","title":"Don't Look into the Sun: Adversarial Solarization Attacks on Image\n Classifiers","summary":" Assessing the robustness of deep neural networks against out-of-distribution\ninputs is crucial, especially in safety-critical domains like autonomous\ndriving, but also in safety systems where malicious actors can digitally alter\ninputs to circumvent safety guards. However, designing effective\nout-of-distribution tests that encompass all possible scenarios while\npreserving accurate label information is a challenging task. Existing\nmethodologies often entail a compromise between variety and constraint levels\nfor attacks and sometimes even both. In a first step towards a more holistic\nrobustness evaluation of image classification models, we introduce an attack\nmethod based on image solarization that is conceptually straightforward yet\navoids jeopardizing the global structure of natural images independent of the\nintensity. Through comprehensive evaluations of multiple ImageNet models, we\ndemonstrate the attack's capacity to degrade accuracy significantly, provided\nit is not integrated into the training augmentations. Interestingly, even then,\nno full immunity to accuracy deterioration is achieved. In other settings, the\nattack can often be simplified into a black-box attack with model-independent\nparameters. Defenses against other corruptions do not consistently extend to be\neffective against our specific attack.\n Project website: https://github.com/paulgavrikov/adversarial_solarization\n","authors":["Paul Gavrikov","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2308.12661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03543v2","updated":"2023-08-24T08:57:54Z","published":"2023-04-07T08:48:07Z","title":"HyperTab: Hypernetwork Approach for Deep Learning on Small Tabular\n Datasets","summary":" Deep learning has achieved impressive performance in many domains, such as\ncomputer vision and natural language processing, but its advantage over\nclassical shallow methods on tabular datasets remains questionable. It is\nespecially challenging to surpass the performance of tree-like ensembles, such\nas XGBoost or Random Forests, on small-sized datasets (less than 1k samples).\nTo tackle this challenge, we introduce HyperTab, a hypernetwork-based approach\nto solving small sample problems on tabular datasets. By combining the\nadvantages of Random Forests and neural networks, HyperTab generates an\nensemble of neural networks, where each target model is specialized to process\na specific lower-dimensional view of the data. Since each view plays the role\nof data augmentation, we virtually increase the number of training samples\nwhile keeping the number of trainable parameters unchanged, which prevents\nmodel overfitting. We evaluated HyperTab on more than 40 tabular datasets of a\nvarying number of samples and domains of origin, and compared its performance\nwith shallow and deep learning models representing the current\nstate-of-the-art. We show that HyperTab consistently outranks other methods on\nsmall data (with a statistically significant difference) and scores comparable\nto them on larger datasets.\n We make a python package with the code available to download at\nhttps://pypi.org/project/hypertab/\n","authors":["Witold Wydmański","Oleksii Bulenok","Marek Śmieja"],"pdf_url":"https://arxiv.org/pdf/2304.03543v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12649v1","updated":"2023-08-24T08:46:43Z","published":"2023-08-24T08:46:43Z","title":"APART: Diverse Skill Discovery using All Pairs with Ascending Reward and\n DropouT","summary":" We study diverse skill discovery in reward-free environments, aiming to\ndiscover all possible skills in simple grid-world environments where prior\nmethods have struggled to succeed. This problem is formulated as mutual\ntraining of skills using an intrinsic reward and a discriminator trained to\npredict a skill given its trajectory. Our initial solution replaces the\nstandard one-vs-all (softmax) discriminator with a one-vs-one (all pairs)\ndiscriminator and combines it with a novel intrinsic reward function and a\ndropout regularization technique. The combined approach is named APART: Diverse\nSkill Discovery using All Pairs with Ascending Reward and Dropout. We\ndemonstrate that APART discovers all the possible skills in grid worlds with\nremarkably fewer samples than previous works. Motivated by the empirical\nsuccess of APART, we further investigate an even simpler algorithm that\nachieves maximum skills by altering VIC, rescaling its intrinsic reward, and\ntuning the temperature of its softmax discriminator. We believe our findings\nshed light on the crucial factors underlying success of skill discovery\nalgorithms in reinforcement learning.\n","authors":["Hadar Schreiber Galler","Tom Zahavy","Guillaume Desjardins","Alon Cohen"],"pdf_url":"https://arxiv.org/pdf/2308.12649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12646v1","updated":"2023-08-24T08:42:06Z","published":"2023-08-24T08:42:06Z","title":"The GENEA Challenge 2023: A large scale evaluation of gesture generation\n models in monadic and dyadic settings","summary":" This paper reports on the GENEA Challenge 2023, in which participating teams\nbuilt speech-driven gesture-generation systems using the same speech and motion\ndataset, followed by a joint evaluation. This year's challenge provided data on\nboth sides of a dyadic interaction, allowing teams to generate full-body motion\nfor an agent given its speech (text and audio) and the speech and motion of the\ninterlocutor. We evaluated 12 submissions and 2 baselines together with\nheld-out motion-capture data in several large-scale user studies. The studies\nfocused on three aspects: 1) the human-likeness of the motion, 2) the\nappropriateness of the motion for the agent's own speech whilst controlling for\nthe human-likeness of the motion, and 3) the appropriateness of the motion for\nthe behaviour of the interlocutor in the interaction, using a setup that\ncontrols for both the human-likeness of the motion and the agent's own speech.\nWe found a large span in human-likeness between challenge submissions, with a\nfew systems rated close to human mocap. Appropriateness seems far from being\nsolved, with most submissions performing in a narrow range slightly above\nchance, far behind natural motion. The effect of the interlocutor is even more\nsubtle, with submitted systems at best performing barely above chance.\nInterestingly, a dyadic system being highly appropriate for agent speech does\nnot necessarily imply high appropriateness for the interlocutor. Additional\nmaterial is available via the project website at\nhttps://svito-zar.github.io/GENEAchallenge2023/ .\n","authors":["Taras Kucherenko","Rajmund Nagy","Youngwoo Yoon","Jieyeon Woo","Teodor Nikolov","Mihail Tsakov","Gustav Eje Henter"],"pdf_url":"https://arxiv.org/pdf/2308.12646v1.pdf","comment":"The first three authors made equal contributions. Accepted for\n publication at the ACM International Conference on Multimodal Interaction\n (ICMI)"},{"id":"http://arxiv.org/abs/2302.09624v2","updated":"2023-08-24T08:40:08Z","published":"2023-02-19T16:58:53Z","title":"Breaking the Communication-Privacy-Accuracy Tradeoff with\n $f$-Differential Privacy","summary":" We consider a federated data analytics problem in which a server coordinates\nthe collaborative data analysis of multiple users with privacy concerns and\nlimited communication capability. The commonly adopted compression schemes\nintroduce information loss into local data while improving communication\nefficiency, and it remains an open problem whether such discrete-valued\nmechanisms provide any privacy protection. In this paper, we study the local\ndifferential privacy guarantees of discrete-valued mechanisms with finite\noutput space through the lens of $f$-differential privacy (DP). More\nspecifically, we advance the existing literature by deriving tight $f$-DP\nguarantees for a variety of discrete-valued mechanisms, including the binomial\nnoise and the binomial mechanisms that are proposed for privacy preservation,\nand the sign-based methods that are proposed for data compression, in\nclosed-form expressions. We further investigate the amplification in privacy by\nsparsification and propose a ternary stochastic compressor. By leveraging\ncompression for privacy amplification, we improve the existing methods by\nremoving the dependency of accuracy (in terms of mean square error) on\ncommunication cost in the popular use case of distributed mean estimation,\ntherefore breaking the three-way tradeoff between privacy, communication, and\naccuracy. Finally, we discuss the Byzantine resilience of the proposed\nmechanism and its application in federated learning.\n","authors":["Richeng Jin","Zhonggen Su","Caijun Zhong","Zhaoyang Zhang","Tony Quek","Huaiyu Dai"],"pdf_url":"https://arxiv.org/pdf/2302.09624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06965v3","updated":"2023-08-24T08:33:05Z","published":"2023-03-13T10:06:41Z","title":"Bridging the Gap between Chemical Reaction Pretraining and Conditional\n Molecule Generation with a Unified Model","summary":" Chemical reactions are the fundamental building blocks of drug design and\norganic chemistry research. In recent years, there has been a growing need for\na large-scale deep-learning framework that can efficiently capture the basic\nrules of chemical reactions. In this paper, we have proposed a unified\nframework that addresses both the reaction representation learning and molecule\ngeneration tasks, which allows for a more holistic approach. Inspired by the\norganic chemistry mechanism, we develop a novel pretraining framework that\nenables us to incorporate inductive biases into the model. Our framework\nachieves state-of-the-art results on challenging downstream tasks. By\npossessing chemical knowledge, our generative framework overcome the\nlimitations of current molecule generation models that rely on a small number\nof reaction templates. In the extensive experiments, our model generates\nsynthesizable drug-like structures of high quality. Overall, our work presents\na significant step toward a large-scale deep-learning framework for a variety\nof reaction-based applications.\n","authors":["Bo Qiang","Yiran Zhou","Yuheng Ding","Ningfeng Liu","Song Song","Liangren Zhang","Bo Huang","Zhenming Liu"],"pdf_url":"https://arxiv.org/pdf/2303.06965v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08134v4","updated":"2023-08-24T08:31:31Z","published":"2023-04-17T10:29:26Z","title":"Tackling Face Verification Edge Cases: In-Depth Analysis and\n Human-Machine Fusion Approach","summary":" Nowadays, face recognition systems surpass human performance on several\ndatasets. However, there are still edge cases that the machine can't correctly\nclassify. This paper investigates the effect of a combination of machine and\nhuman operators in the face verification task. First, we look closer at the\nedge cases for several state-of-the-art models to discover common datasets'\nchallenging settings. Then, we conduct a study with 60 participants on these\nselected tasks with humans and provide an extensive analysis. Finally, we\ndemonstrate that combining machine and human decisions can further improve the\nperformance of state-of-the-art face verification systems on various benchmark\ndatasets. Code and data are publicly available on GitHub.\n","authors":["Martin Knoche","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2304.08134v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08040v2","updated":"2023-08-24T08:24:12Z","published":"2023-03-14T16:19:44Z","title":"Equal Treatment: Measuring Fairness using Explanation Distributions","summary":" Liberalism-oriented political philosophy reasons that all individuals should\nbe treated equally independently of their protected characteristics. Related\nwork in machine learning has translated the concept of equal treatment into\nterms of equal outcome and measured it as demographic parity (also called\nstatistical parity). Our analysis reveals that the two concepts of equal\noutcome and equal treatment diverge; therefore, demographic parity does not\nfaithfully represent the notion of equal treatment. We propose a new\nformalization for equal treatment by (i) considering the influence of feature\nvalues on predictions, such as computed by Shapley values explaining\nclassifications, (ii) defining distributions of explanations, and (iii)\ncomparing explanation distributions between populations with different\nprotected characteristics. We show the theoretical properties of our notion of\nequal treatment and devise a classifier two-sample test based on the AUC of an\nequal treatment inspector. We study our formalization of equal treatment on\nsynthetic and natural data. We release explanationspace, an open-source Python\npackage with methods and tutorials.\n","authors":["Carlos Mougan","Laura State","Antonio Ferrara","Salvatore Ruggieri","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2303.08040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12634v1","updated":"2023-08-24T08:19:15Z","published":"2023-08-24T08:19:15Z","title":"Towards Hierarchical Regional Transformer-based Multiple Instance\n Learning","summary":" The classification of gigapixel histopathology images with deep multiple\ninstance learning models has become a critical task in digital pathology and\nprecision medicine. In this work, we propose a Transformer-based multiple\ninstance learning approach that replaces the traditional learned attention\nmechanism with a regional, Vision Transformer inspired self-attention\nmechanism. We present a method that fuses regional patch information to derive\nslide-level predictions and show how this regional aggregation can be stacked\nto hierarchically process features on different distance levels. To increase\npredictive accuracy, especially for datasets with small, local morphological\nfeatures, we introduce a method to focus the image processing on high attention\nregions during inference. Our approach is able to significantly improve\nperformance over the baseline on two histopathology datasets and points towards\npromising directions for further research.\n","authors":["Josef Cersovsky","Sadegh Mohammadi","Dagmar Kainmueller","Johannes Hoehne"],"pdf_url":"https://arxiv.org/pdf/2308.12634v1.pdf","comment":"To be published as ICCV 2023 workshop paper"},{"id":"http://arxiv.org/abs/2209.01566v4","updated":"2023-08-24T08:16:38Z","published":"2022-09-04T08:35:16Z","title":"Towards Top-Down Automated Development in Limited Scopes: A\n Neuro-Symbolic Framework from Expressibles to Executables","summary":" Deep code generation is a topic of deep learning for software engineering\n(DL4SE), which adopts neural models to generate code for the intended\nfunctions. Since end-to-end neural methods lack domain knowledge and software\nhierarchy awareness, they tend to perform poorly w.r.t project-level tasks. To\nsystematically explore the potential improvements of code generation, we let it\nparticipate in the whole top-down development from \\emph{expressibles} to\n\\emph{executables}, which is possible in limited scopes. In the process, it\nbenefits from massive samples, features, and knowledge. As the foundation, we\nsuggest building a taxonomy on code data, namely code taxonomy, leveraging the\ncategorization of code information. Moreover, we introduce a three-layer\nsemantic pyramid (SP) to associate text data and code data. It identifies the\ninformation of different abstraction levels, and thus introduces the domain\nknowledge on development and reveals the hierarchy of software. Furthermore, we\npropose a semantic pyramid framework (SPF) as the approach, focusing on\nsoftware of high modularity and low complexity. SPF divides the code generation\nprocess into stages and reserves spots for potential interactions. In addition,\nwe conceived preliminary applications in software development to confirm the\nneuro-symbolic framework.\n","authors":["Jian Gu","Harald C. Gall"],"pdf_url":"https://arxiv.org/pdf/2209.01566v4.pdf","comment":"5 pages, 3 figures, 2 tables, accepted by ESEC/FSE 2023, the\n camera-ready version"},{"id":"http://arxiv.org/abs/2308.12625v1","updated":"2023-08-24T08:03:15Z","published":"2023-08-24T08:03:15Z","title":"Uncertainty and Explainable Analysis of Machine Learning Model for\n Reconstruction of Sonic Slowness Logs","summary":" Logs are valuable information for oil and gas fields as they help to\ndetermine the lithology of the formations surrounding the borehole and the\nlocation and reserves of subsurface oil and gas reservoirs. However, important\nlogs are often missing in horizontal or old wells, which poses a challenge in\nfield applications. In this paper, we utilize data from the 2020 machine\nlearning competition of the SPWLA, which aims to predict the missing\ncompressional wave slowness and shear wave slowness logs using other logs in\nthe same borehole. We employ the NGBoost algorithm to construct an Ensemble\nLearning model that can predicate the results as well as their uncertainty.\nFurthermore, we combine the SHAP method to investigate the interpretability of\nthe machine learning model. We compare the performance of the NGBosst model\nwith four other commonly used Ensemble Learning methods, including Random\nForest, GBDT, XGBoost, LightGBM. The results show that the NGBoost model\nperforms well in the testing set and can provide a probability distribution for\nthe prediction results. In addition, the variance of the probability\ndistribution of the predicted log can be used to justify the quality of the\nconstructed log. Using the SHAP explainable machine learning model, we\ncalculate the importance of each input log to the predicted results as well as\nthe coupling relationship among input logs. Our findings reveal that the\nNGBoost model tends to provide greater slowness prediction results when the\nneutron porosity and gamma ray are large, which is consistent with the\ncognition of petrophysical models. Furthermore, the machine learning model can\ncapture the influence of the changing borehole caliper on slowness, where the\ninfluence of borehole caliper on slowness is complex and not easy to establish\na direct relationship. These findings are in line with the physical principle\nof borehole acoustics.\n","authors":["Hua Wang","Yuqiong Wu","Yushun Zhang","Fuqiang Lai","Zhou Feng","Bing Xie","Ailin Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.12625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12044v2","updated":"2023-08-24T07:37:06Z","published":"2023-08-23T10:08:52Z","title":"A multiobjective continuation method to compute the regularization path\n of deep neural networks","summary":" Sparsity is a highly desired feature in deep neural networks (DNNs) since it\nensures numerical efficiency, improves the interpretability of models (due to\nthe smaller number of relevant features), and robustness. In machine learning\napproaches based on linear models, it is well known that there exists a\nconnecting path between the sparsest solution in terms of the $\\ell^1$ norm\n(i.e., zero weights) and the non-regularized solution, which is called the\nregularization path. Very recently, there was a first attempt to extend the\nconcept of regularization paths to DNNs by means of treating the empirical loss\nand sparsity ($\\ell^1$ norm) as two conflicting criteria and solving the\nresulting multiobjective optimization problem. However, due to the\nnon-smoothness of the $\\ell^1$ norm and the high number of parameters, this\napproach is not very efficient from a computational perspective. To overcome\nthis limitation, we present an algorithm that allows for the approximation of\nthe entire Pareto front for the above-mentioned objectives in a very efficient\nmanner. We present numerical examples using both deterministic and stochastic\ngradients. We furthermore demonstrate that knowledge of the regularization path\nallows for a well-generalizing network parametrization.\n","authors":["Augustina C. Amakor","Konstantin Sonntag","Sebastian Peitz"],"pdf_url":"https://arxiv.org/pdf/2308.12044v2.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2210.14598v3","updated":"2023-08-24T07:30:34Z","published":"2022-10-26T10:12:31Z","title":"Exact Manifold Gaussian Variational Bayes","summary":" We propose an optimization algorithm for Variational Inference (VI) in\ncomplex models. Our approach relies on natural gradient updates where the\nvariational space is a Riemann manifold. We develop an efficient algorithm for\nGaussian Variational Inference that implicitly satisfies the positive definite\nconstraint on the variational covariance matrix. Our Exact manifold Gaussian\nVariational Bayes (EMGVB) provides exact but simple update rules and is\nstraightforward to implement. Due to its black-box nature, EMGVB stands as a\nready-to-use solution for VI in complex models. Over five datasets, we\nempirically validate our feasible approach on different statistical,\neconometric, and deep learning models, discussing its performance with respect\nto baseline methods.\n","authors":["Martin Magris","Mostafa Shabani","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2210.14598v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12612v1","updated":"2023-08-24T07:22:29Z","published":"2023-08-24T07:22:29Z","title":"Try with Simpler -- An Evaluation of Improved Principal Component\n Analysis in Log-based Anomaly Detection","summary":" The rapid growth of deep learning (DL) has spurred interest in enhancing\nlog-based anomaly detection. This approach aims to extract meaning from log\nevents (log message templates) and develop advanced DL models for anomaly\ndetection. However, these DL methods face challenges like heavy reliance on\ntraining data, labels, and computational resources due to model complexity. In\ncontrast, traditional machine learning and data mining techniques are less\ndata-dependent and more efficient but less effective than DL. To make log-based\nanomaly detection more practical, the goal is to enhance traditional techniques\nto match DL's effectiveness. Previous research in a different domain (linking\nquestions on Stack Overflow) suggests that optimized traditional techniques can\nrival state-of-the-art DL methods. Drawing inspiration from this concept, we\nconducted an empirical study. We optimized the unsupervised PCA (Principal\nComponent Analysis), a traditional technique, by incorporating lightweight\nsemantic-based log representation. This addresses the issue of unseen log\nevents in training data, enhancing log representation. Our study compared seven\nlog-based anomaly detection methods, including four DL-based, two traditional,\nand the optimized PCA technique, using public and industrial datasets. Results\nindicate that the optimized unsupervised PCA technique achieves similar\neffectiveness to advanced supervised/semi-supervised DL methods while being\nmore stable with limited training data and resource-efficient. This\ndemonstrates the adaptability and strength of traditional techniques through\nsmall yet impactful adaptations.\n","authors":["Lin Yang","Junjie Chen","Zhihao Gong","Shutao Gao","Hongyu Zhang","Yue Kang","Huaan Li"],"pdf_url":"https://arxiv.org/pdf/2308.12612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12606v1","updated":"2023-08-24T07:11:51Z","published":"2023-08-24T07:11:51Z","title":"A Greedy Approach for Offering to Telecom Subscribers","summary":" Customer retention or churn prevention is a challenging task of a telecom\noperator. One of the effective approaches is to offer some attractive incentive\nor additional services or money to the subscribers for keeping them engaged and\nmake sure they stay in the operator's network for longer time. Often, operators\nallocate certain amount of monetary budget to carry out the offer campaign. The\ndifficult part of this campaign is the selection of a set of customers from a\nlarge subscriber-base and deciding the amount that should be offered to an\nindividual so that operator's objective is achieved. There may be multiple\nobjectives (e.g., maximizing revenue, minimizing number of churns) for\nselection of subscriber and selection of an offer to the selected subscriber.\nApart from monetary benefit, offers may include additional data, SMS, hots-spot\ntethering, and many more. This problem is known as offer optimization. In this\npaper, we propose a novel combinatorial algorithm for solving offer\noptimization under heterogeneous offers by maximizing expected revenue under\nthe scenario of subscriber churn, which is, in general, seen in telecom domain.\nThe proposed algorithm is efficient and accurate even for a very large\nsubscriber-base.\n","authors":["Piyush Kanti Bhunre","Tanmay Sen","Arijit Sarkar"],"pdf_url":"https://arxiv.org/pdf/2308.12606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04528v3","updated":"2023-08-24T07:09:25Z","published":"2023-06-07T15:37:00Z","title":"PromptBench: Towards Evaluating the Robustness of Large Language Models\n on Adversarial Prompts","summary":" The increasing reliance on Large Language Models (LLMs) across academia and\nindustry necessitates a comprehensive understanding of their robustness to\nprompts. In response to this vital need, we introduce PromptBench, a robustness\nbenchmark designed to measure LLMs' resilience to adversarial prompts. This\nstudy uses a plethora of adversarial textual attacks targeting prompts across\nmultiple levels: character, word, sentence, and semantic. These prompts are\nthen employed in diverse tasks, such as sentiment analysis, natural language\ninference, reading comprehension, machine translation, and math\nproblem-solving. Our study generates 4,032 adversarial prompts, meticulously\nevaluated over 8 tasks and 13 datasets, with 567,084 test samples in total. Our\nfindings demonstrate that contemporary LLMs are vulnerable to adversarial\nprompts. Furthermore, we present comprehensive analysis to understand the\nmystery behind prompt robustness and its transferability. We then offer\ninsightful robustness analysis and pragmatic recommendations for prompt\ncomposition, beneficial to both researchers and everyday users. We make our\ncode, prompts, and methodologies to generate adversarial prompts publicly\naccessible, thereby enabling and encouraging collaborative exploration in this\npivotal field: https://github.com/microsoft/promptbench.\n","authors":["Kaijie Zhu","Jindong Wang","Jiaheng Zhou","Zichen Wang","Hao Chen","Yidong Wang","Linyi Yang","Wei Ye","Neil Zhenqiang Gong","Yue Zhang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2306.04528v3.pdf","comment":"Technical report; updated with new experiments and related work; 27\n pages; code is at: https://github.com/microsoft/promptbench"},{"id":"http://arxiv.org/abs/2306.05838v2","updated":"2023-08-24T06:59:44Z","published":"2023-06-09T12:12:07Z","title":"Expectation-Complete Graph Representations with Homomorphisms","summary":" We investigate novel random graph embeddings that can be computed in expected\npolynomial time and that are able to distinguish all non-isomorphic graphs in\nexpectation. Previous graph embeddings have limited expressiveness and either\ncannot distinguish all graphs or cannot be computed efficiently for every\ngraph. To be able to approximate arbitrary functions on graphs, we are\ninterested in efficient alternatives that become arbitrarily expressive with\nincreasing resources. Our approach is based on Lov\\'asz' characterisation of\ngraph isomorphism through an infinite dimensional vector of homomorphism\ncounts. Our empirical evaluation shows competitive results on several benchmark\ngraph learning tasks.\n","authors":["Pascal Welke","Maximilian Thiessen","Fabian Jogl","Thomas Gärtner"],"pdf_url":"https://arxiv.org/pdf/2306.05838v2.pdf","comment":"accepted for publication at ICML 2023"},{"id":"http://arxiv.org/abs/2308.12599v1","updated":"2023-08-24T06:56:54Z","published":"2023-08-24T06:56:54Z","title":"Exploiting Time-Frequency Conformers for Music Audio Enhancement","summary":" With the proliferation of video platforms on the internet, recording musical\nperformances by mobile devices has become commonplace. However, these\nrecordings often suffer from degradation such as noise and reverberation, which\nnegatively impact the listening experience. Consequently, the necessity for\nmusic audio enhancement (referred to as music enhancement from this point\nonward), involving the transformation of degraded audio recordings into\npristine high-quality music, has surged to augment the auditory experience. To\naddress this issue, we propose a music enhancement system based on the\nConformer architecture that has demonstrated outstanding performance in speech\nenhancement tasks. Our approach explores the attention mechanisms of the\nConformer and examines their performance to discover the best approach for the\nmusic enhancement task. Our experimental results show that our proposed model\nachieves state-of-the-art performance on single-stem music enhancement.\nFurthermore, our system can perform general music enhancement with multi-track\nmixtures, which has not been examined in previous work.\n","authors":["Yunkee Chae","Junghyun Koo","Sungho Lee","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2308.12599v1.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2208.12263v2","updated":"2023-08-24T06:53:50Z","published":"2022-08-24T08:05:18Z","title":"Augmenting Reinforcement Learning with Transformer-based Scene\n Representation Learning for Decision-making of Autonomous Driving","summary":" Decision-making for urban autonomous driving is challenging due to the\nstochastic nature of interactive traffic participants and the complexity of\nroad structures. Although reinforcement learning (RL)-based decision-making\nscheme is promising to handle urban driving scenarios, it suffers from low\nsample efficiency and poor adaptability. In this paper, we propose Scene-Rep\nTransformer to improve the RL decision-making capabilities with better scene\nrepresentation encoding and sequential predictive latent distillation.\nSpecifically, a multi-stage Transformer (MST) encoder is constructed to model\nnot only the interaction awareness between the ego vehicle and its neighbors\nbut also intention awareness between the agents and their candidate routes. A\nsequential latent Transformer (SLT) with self-supervised learning objectives is\nemployed to distill the future predictive information into the latent scene\nrepresentation, in order to reduce the exploration space and speed up training.\nThe final decision-making module based on soft actor-critic (SAC) takes as\ninput the refined latent scene representation from the Scene-Rep Transformer\nand outputs driving actions. The framework is validated in five challenging\nsimulated urban scenarios with dense traffic, and its performance is manifested\nquantitatively by the substantial improvements in data efficiency and\nperformance in terms of success rate, safety, and efficiency. The qualitative\nresults reveal that our framework is able to extract the intentions of neighbor\nagents to help make decisions and deliver more diversified driving behaviors.\n","authors":["Haochen Liu","Zhiyu Huang","Xiaoyu Mo","Chen Lv"],"pdf_url":"https://arxiv.org/pdf/2208.12263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05699v2","updated":"2023-08-24T06:46:13Z","published":"2023-03-10T04:49:01Z","title":"Feature Unlearning for Pre-trained GANs and VAEs","summary":" We tackle the problem of feature unlearning from a pre-trained image\ngenerative model: GANs and VAEs. Unlike a common unlearning task where an\nunlearning target is a subset of the training set, we aim to unlearn a specific\nfeature, such as hairstyle from facial images, from the pre-trained generative\nmodels. As the target feature is only presented in a local region of an image,\nunlearning the entire image from the pre-trained model may result in losing\nother details in the remaining region of the image. To specify which features\nto unlearn, we collect randomly generated images that contain the target\nfeatures. We then identify a latent representation corresponding to the target\nfeature and then use the representation to fine-tune the pre-trained model.\nThrough experiments on MNIST and CelebA datasets, we show that target features\nare successfully removed while keeping the fidelity of the original models.\nFurther experiments with an adversarial attack show that the unlearned model is\nmore robust under the presence of malicious parties.\n","authors":["Saemi Moon","Seunghyuk Cho","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2303.05699v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01805v2","updated":"2023-08-24T06:28:02Z","published":"2023-01-04T20:08:23Z","title":"Unsupervised Manifold Linearizing and Clustering","summary":" We consider the problem of simultaneously clustering and learning a linear\nrepresentation of data lying close to a union of low-dimensional manifolds, a\nfundamental task in machine learning and computer vision. When the manifolds\nare assumed to be linear subspaces, this reduces to the classical problem of\nsubspace clustering, which has been studied extensively over the past two\ndecades. Unfortunately, many real-world datasets such as natural images can not\nbe well approximated by linear subspaces. On the other hand, numerous works\nhave attempted to learn an appropriate transformation of the data, such that\ndata is mapped from a union of general non-linear manifolds to a union of\nlinear subspaces (with points from the same manifold being mapped to the same\nsubspace). However, many existing works have limitations such as assuming\nknowledge of the membership of samples to clusters, requiring high sampling\ndensity, or being shown theoretically to learn trivial representations. In this\npaper, we propose to optimize the Maximal Coding Rate Reduction metric with\nrespect to both the data representation and a novel doubly stochastic cluster\nmembership, inspired by state-of-the-art subspace clustering results. We give a\nparameterization of such a representation and membership, allowing efficient\nmini-batching and one-shot initialization. Experiments on CIFAR-10, -20, -100,\nand TinyImageNet-200 datasets show that the proposed method is much more\naccurate and scalable than state-of-the-art deep clustering methods, and\nfurther learns a latent linear representation of the data.\n","authors":["Tianjiao Ding","Shengbang Tong","Kwan Ho Ryan Chan","Xili Dai","Yi Ma","Benjamin D. Haeffele"],"pdf_url":"https://arxiv.org/pdf/2301.01805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11217v3","updated":"2023-08-24T06:24:13Z","published":"2023-08-22T06:05:11Z","title":"Federated Learning in Big Model Era: Domain-Specific Multimodal Large\n Models","summary":" Multimodal data, which can comprehensively perceive and recognize the\nphysical world, has become an essential path towards general artificial\nintelligence. However, multimodal large models trained on public datasets often\nunderperform in specific industrial domains. This paper proposes a multimodal\nfederated learning framework that enables multiple enterprises to utilize\nprivate domain data to collaboratively train large models for vertical domains,\nachieving intelligent services across scenarios. The authors discuss in-depth\nthe strategic transformation of federated learning in terms of intelligence\nfoundation and objectives in the era of big model, as well as the new\nchallenges faced in heterogeneous data, model aggregation, performance and cost\ntrade-off, data privacy, and incentive mechanism. The paper elaborates a case\nstudy of leading enterprises contributing multimodal data and expert knowledge\nto city safety operation management , including distributed deployment and\nefficient coordination of the federated learning platform, technical\ninnovations on data quality improvement based on large model capabilities and\nefficient joint fine-tuning approaches. Preliminary experiments show that\nenterprises can enhance and accumulate intelligent capabilities through\nmultimodal model federated learning, thereby jointly creating an smart city\nmodel that provides high-quality intelligent services covering energy\ninfrastructure safety, residential community security, and urban operation\nmanagement. The established federated learning cooperation ecosystem is\nexpected to further aggregate industry, academia, and research resources,\nrealize large models in multiple vertical domains, and promote the large-scale\nindustrial application of artificial intelligence and cutting-edge research on\nmultimodal federated learning.\n","authors":["Zengxiang Li","Zhaoxiang Hou","Hui Liu","Ying Wang","Tongzhi Li","Longfei Xie","Chao Shi","Chengyi Yang","Weishan Zhang","Zelei Liu","Liang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.11217v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.09091v3","updated":"2023-08-24T06:18:42Z","published":"2023-01-22T10:17:02Z","title":"BallGAN: 3D-aware Image Synthesis with a Spherical Background","summary":" 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be\nrendered in arbitrary perspectives to produce images. Although previous methods\nproduce realistic images, they suffer from unstable training or degenerate\nsolutions where the 3D geometry is unnatural. We hypothesize that the 3D\ngeometry is underdetermined due to the insufficient constraint, i.e., being\nclassified as real image to the discriminator is not enough. To solve this\nproblem, we propose to approximate the background as a spherical surface and\nrepresent a scene as a union of the foreground placed in the sphere and the\nthin spherical background. It reduces the degree of freedom in the background\nfield. Accordingly, we modify the volume rendering equation and incorporate\ndedicated constraints to design a novel 3D-aware GAN framework named BallGAN.\nBallGAN has multiple advantages as follows. 1) It produces more reasonable 3D\ngeometry; the images of a scene across different viewpoints have better\nphotometric consistency and fidelity than the state-of-the-art methods. 2) The\ntraining becomes much more stable. 3) The foreground can be separately rendered\non top of different arbitrary backgrounds.\n","authors":["Minjung Shin","Yunji Seo","Jeongmin Bae","Young Sun Choi","Hyunsu Kim","Hyeran Byun","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2301.09091v3.pdf","comment":"ICCV 2023, Project Page: https://minjung-s.github.io/ballgan"},{"id":"http://arxiv.org/abs/2308.12584v1","updated":"2023-08-24T06:12:41Z","published":"2023-08-24T06:12:41Z","title":"LORD: Leveraging Open-Set Recognition with Unknown Data","summary":" Handling entirely unknown data is a challenge for any deployed classifier.\nClassification models are typically trained on a static pre-defined dataset and\nare kept in the dark for the open unassigned feature space. As a result, they\nstruggle to deal with out-of-distribution data during inference. Addressing\nthis task on the class-level is termed open-set recognition (OSR). However,\nmost OSR methods are inherently limited, as they train closed-set classifiers\nand only adapt the downstream predictions to OSR. This work presents LORD, a\nframework to Leverage Open-set Recognition by exploiting unknown Data. LORD\nexplicitly models open space during classifier training and provides a\nsystematic evaluation for such approaches. We identify three model-agnostic\ntraining strategies that exploit background data and applied them to\nwell-established classifiers. Due to LORD's extensive evaluation protocol, we\nconsistently demonstrate improved recognition of unknown data. The benchmarks\nfacilitate in-depth analysis across various requirement levels. To mitigate\ndependency on extensive and costly background datasets, we explore mixup as an\noff-the-shelf data generation technique. Our experiments highlight mixup's\neffectiveness as a substitute for background datasets. Lightweight constraints\non mixup synthesis further improve OSR performance.\n","authors":["Tobias Koch","Christian Riess","Thomas Köhler"],"pdf_url":"https://arxiv.org/pdf/2308.12584v1.pdf","comment":"Accepted at ICCV 2023 Workshop (Out-Of-Distribution Generalization in\n Computer Vision)"},{"id":"http://arxiv.org/abs/2308.12585v1","updated":"2023-08-24T06:12:41Z","published":"2023-08-24T06:12:41Z","title":"Persistent learning signals and working memory without continuous\n attractors","summary":" Neural dynamical systems with stable attractor structures, such as point\nattractors and continuous attractors, are hypothesized to underlie meaningful\ntemporal behavior that requires working memory. However, working memory may not\nsupport useful learning signals necessary to adapt to changes in the temporal\nstructure of the environment. We show that in addition to the continuous\nattractors that are widely implicated, periodic and quasi-periodic attractors\ncan also support learning arbitrarily long temporal relationships. Unlike the\ncontinuous attractors that suffer from the fine-tuning problem, the less\nexplored quasi-periodic attractors are uniquely qualified for learning to\nproduce temporally structured behavior. Our theory has broad implications for\nthe design of artificial learning systems and makes predictions about\nobservable signatures of biological neural dynamics that can support temporal\ndependence learning and working memory. Based on our theory, we developed a new\ninitialization scheme for artificial recurrent neural networks that outperforms\nstandard methods for tasks that require learning temporal dynamics. Moreover,\nwe propose a robust recurrent memory mechanism for integrating and maintaining\nhead direction without a ring attractor.\n","authors":["Il Memming Park","Ábel Ságodi","Piotr Aleksander Sokół"],"pdf_url":"https://arxiv.org/pdf/2308.12585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12581v1","updated":"2023-08-24T05:49:58Z","published":"2023-08-24T05:49:58Z","title":"A Huber Loss Minimization Approach to Byzantine Robust Federated\n Learning","summary":" Federated learning systems are susceptible to adversarial attacks. To combat\nthis, we introduce a novel aggregator based on Huber loss minimization, and\nprovide a comprehensive theoretical analysis. Under independent and identically\ndistributed (i.i.d) assumption, our approach has several advantages compared to\nexisting methods. Firstly, it has optimal dependence on $\\epsilon$, which\nstands for the ratio of attacked clients. Secondly, our approach does not need\nprecise knowledge of $\\epsilon$. Thirdly, it allows different clients to have\nunequal data sizes. We then broaden our analysis to include non-i.i.d data,\nsuch that clients have slightly different distributions.\n","authors":["Puning Zhao","Fei Yu","Zhiguo Wan"],"pdf_url":"https://arxiv.org/pdf/2308.12581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12000v2","updated":"2023-08-24T05:46:22Z","published":"2023-08-23T08:38:53Z","title":"On Uniformly Optimal Algorithms for Best Arm Identification in Two-Armed\n Bandits with Fixed Budget","summary":" We study the problem of best-arm identification with fixed budget in\nstochastic two-arm bandits with Bernoulli rewards. We prove that surprisingly,\nthere is no algorithm that (i) performs as well as the algorithm sampling each\narm equally (this algorithm is referred to as the {\\it uniform sampling}\nalgorithm) on all instances, and that (ii) strictly outperforms this algorithm\non at least one instance. In short, there is no algorithm better than the\nuniform sampling algorithm. Towards this result, we introduce the natural class\nof {\\it consistent} and {\\it stable} algorithms, and show that any algorithm\nthat performs as well as the uniform sampling algorithm on all instances\nbelongs to this class. The proof is completed by deriving a lower bound on the\nerror rate satisfied by any consistent and stable algorithm, and by showing\nthat the uniform sampling algorithm matches this lower bound. Our results\nprovide a solution to the two open problems presented in \\cite{qin2022open}.\n","authors":["Po-An Wang","Kaito Ariu","Alexandre Proutiere"],"pdf_url":"https://arxiv.org/pdf/2308.12000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12575v1","updated":"2023-08-24T05:26:56Z","published":"2023-08-24T05:26:56Z","title":"Hypergraph Convolutional Networks for Fine-grained ICU Patient\n Similarity Analysis and Risk Prediction","summary":" The Intensive Care Unit (ICU) is one of the most important parts of a\nhospital, which admits critically ill patients and provides continuous\nmonitoring and treatment. Various patient outcome prediction methods have been\nattempted to assist healthcare professionals in clinical decision-making.\nExisting methods focus on measuring the similarity between patients using deep\nneural networks to capture the hidden feature structures. However, the\nhigher-order relationships are ignored, such as patient characteristics (e.g.,\ndiagnosis codes) and their causal effects on downstream clinical predictions.\n In this paper, we propose a novel Hypergraph Convolutional Network that\nallows the representation of non-pairwise relationships among diagnosis codes\nin a hypergraph to capture the hidden feature structures so that fine-grained\npatient similarity can be calculated for personalized mortality risk\nprediction. Evaluation using a publicly available eICU Collaborative Research\nDatabase indicates that our method achieves superior performance over the\nstate-of-the-art models on mortality risk prediction. Moreover, the results of\nseveral case studies demonstrated the effectiveness of constructing graph\nnetworks in providing good transparency and robustness in decision-making.\n","authors":["Yuxi Liu","Zhenhao Zhang","Shaowen Qin","Flora D. Salim","Antonio Jimeno Yepes","Jun Shen"],"pdf_url":"https://arxiv.org/pdf/2308.12575v1.pdf","comment":"7 pages, 2 figures, submitted to IEEE BIBM 2023"},{"id":"http://arxiv.org/abs/2308.12573v1","updated":"2023-08-24T05:26:42Z","published":"2023-08-24T05:26:42Z","title":"Conditional Kernel Imitation Learning for Continuous State Environments","summary":" Imitation Learning (IL) is an important paradigm within the broader\nreinforcement learning (RL) methodology. Unlike most of RL, it does not assume\navailability of reward-feedback. Reward inference and shaping are known to be\ndifficult and error-prone methods particularly when the demonstration data\ncomes from human experts. Classical methods such as behavioral cloning and\ninverse reinforcement learning are highly sensitive to estimation errors, a\nproblem that is particularly acute in continuous state space problems.\nMeanwhile, state-of-the-art IL algorithms convert behavioral policy learning\nproblems into distribution-matching problems which often require additional\nonline interaction data to be effective. In this paper, we consider the problem\nof imitation learning in continuous state space environments based solely on\nobserved behavior, without access to transition dynamics information, reward\nstructure, or, most importantly, any additional interactions with the\nenvironment. Our approach is based on the Markov balance equation and\nintroduces a novel conditional kernel density estimation-based imitation\nlearning framework. It involves estimating the environment's transition\ndynamics using conditional kernel density estimators and seeks to satisfy the\nprobabilistic balance equations for the environment. We establish that our\nestimators satisfy basic asymptotic consistency requirements. Through a series\nof numerical experiments on continuous state benchmark environments, we show\nconsistently superior empirical performance over many state-of-the-art IL\nalgorithms.\n","authors":["Rishabh Agrawal","Nathan Dahlin","Rahul Jain","Ashutosh Nayyar"],"pdf_url":"https://arxiv.org/pdf/2308.12573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12563v1","updated":"2023-08-24T05:10:18Z","published":"2023-08-24T05:10:18Z","title":"Multivariate Time-Series Anomaly Detection with Contaminated Data:\n Application to Physiological Signals","summary":" Mainstream unsupervised anomaly detection algorithms often excel in academic\ndatasets, yet their real-world performance is restricted due to the controlled\nexperimental conditions involving clean training data. Addressing the challenge\nof training with noise, a prevalent issue in practical anomaly detection, is\nfrequently overlooked. In a pioneering endeavor, this study delves into the\nrealm of label-level noise within sensory time-series anomaly detection (TSAD).\nThis paper presents a novel and practical end-to-end unsupervised TSAD when the\ntraining data are contaminated with anomalies. The introduced approach, called\nTSAD-C, is devoid of access to abnormality labels during the training phase.\nTSAD-C encompasses three modules: a Decontaminator to rectify the abnormalities\n(aka noise) present in the training data, a Variable Dependency Modeling module\nto capture both long-term intra- and inter-variable dependencies within the\ndecontaminated data that can be considered as a surrogate of the pure normal\ndata, and an Anomaly Scoring module to detect anomalies. Our extensive\nexperiments conducted on three widely used physiological datasets conclusively\ndemonstrate that our approach surpasses existing methodologies, thus\nestablishing a new state-of-the-art performance in the field.\n","authors":["Thi Kieu Khanh Ho","Narges Armanfard"],"pdf_url":"https://arxiv.org/pdf/2308.12563v1.pdf","comment":"9 pages, 2 tables, 3 figures"},{"id":"http://arxiv.org/abs/2308.12562v1","updated":"2023-08-24T05:04:10Z","published":"2023-08-24T05:04:10Z","title":"Variational Information Pursuit with Large Language and Multimodal\n Models for Interpretable Predictions","summary":" Variational Information Pursuit (V-IP) is a framework for making\ninterpretable predictions by design by sequentially selecting a short chain of\ntask-relevant, user-defined and interpretable queries about the data that are\nmost informative for the task. While this allows for built-in interpretability\nin predictive models, applying V-IP to any task requires data samples with\ndense concept-labeling by domain experts, limiting the application of V-IP to\nsmall-scale tasks where manual data annotation is feasible. In this work, we\nextend the V-IP framework with Foundational Models (FMs) to address this\nlimitation. More specifically, we use a two-step process, by first leveraging\nLarge Language Models (LLMs) to generate a sufficiently large candidate set of\ntask-relevant interpretable concepts, then using Large Multimodal Models to\nannotate each data sample by semantic similarity with each concept in the\ngenerated concept set. While other interpretable-by-design frameworks such as\nConcept Bottleneck Models (CBMs) require an additional step of removing\nrepetitive and non-discriminative concepts to have good interpretability and\ntest performance, we mathematically and empirically justify that, with a\nsufficiently informative and task-relevant query (concept) set, the proposed\nFM+V-IP method does not require any type of concept filtering. In addition, we\nshow that FM+V-IP with LLM generated concepts can achieve better test\nperformance than V-IP with human annotated concepts, demonstrating the\neffectiveness of LLMs at generating efficient query sets. Finally, when\ncompared to other interpretable-by-design frameworks such as CBMs, FM+V-IP can\nachieve competitive test performance using fewer number of concepts/queries in\nboth cases with filtered or unfiltered concept sets.\n","authors":["Kwan Ho Ryan Chan","Aditya Chattopadhyay","Benjamin David Haeffele","Rene Vidal"],"pdf_url":"https://arxiv.org/pdf/2308.12562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12554v1","updated":"2023-08-24T04:42:18Z","published":"2023-08-24T04:42:18Z","title":"Deep Reinforcement Learning-driven Cross-Community Energy Interaction\n Optimal Scheduling","summary":" In order to coordinate energy interactions among various communities and\nenergy conversions among multi-energy subsystems within the multi-community\nintegrated energy system under uncertain conditions, and achieve overall\noptimization and scheduling of the comprehensive energy system, this paper\nproposes a comprehensive scheduling model that utilizes a multi-agent deep\nreinforcement learning algorithm to learn load characteristics of different\ncommunities and make decisions based on this knowledge. In this model, the\nscheduling problem of the integrated energy system is transformed into a Markov\ndecision process and solved using a data-driven deep reinforcement learning\nalgorithm, which avoids the need for modeling complex energy coupling\nrelationships between multi-communities and multi-energy subsystems. The\nsimulation results show that the proposed method effectively captures the load\ncharacteristics of different communities and utilizes their complementary\nfeatures to coordinate reasonable energy interactions among them. This leads to\na reduction in wind curtailment rate from 16.3% to 0% and lowers the overall\noperating cost by 5445.6 Yuan, demonstrating significant economic and\nenvironmental benefits.\n","authors":["Yang Li","Fanjin Bu","Zhen Yang","Bin Wang","Meng Han"],"pdf_url":"https://arxiv.org/pdf/2308.12554v1.pdf","comment":"in Chinese language, Accepted by Electric Power Construction"},{"id":"http://arxiv.org/abs/2308.12553v1","updated":"2023-08-24T04:39:25Z","published":"2023-08-24T04:39:25Z","title":"Don't blame Dataset Shift! Shortcut Learning due to Gradients and Cross\n Entropy","summary":" Common explanations for shortcut learning assume that the shortcut improves\nprediction under the training distribution but not in the test distribution.\nThus, models trained via the typical gradient-based optimization of\ncross-entropy, which we call default-ERM, utilize the shortcut. However, even\nwhen the stable feature determines the label in the training distribution and\nthe shortcut does not provide any additional information, like in perception\ntasks, default-ERM still exhibits shortcut learning. Why are such solutions\npreferred when the loss for default-ERM can be driven to zero using the stable\nfeature alone? By studying a linear perception task, we show that default-ERM's\npreference for maximizing the margin leads to models that depend more on the\nshortcut than the stable feature, even without overparameterization. This\ninsight suggests that default-ERM's implicit inductive bias towards max-margin\nis unsuitable for perception tasks. Instead, we develop an inductive bias\ntoward uniform margins and show that this bias guarantees dependence only on\nthe perfect stable feature in the linear perception task. We develop loss\nfunctions that encourage uniform-margin solutions, called margin control\n(MARG-CTRL). MARG-CTRL mitigates shortcut learning on a variety of vision and\nlanguage tasks, showing that better inductive biases can remove the need for\nexpensive two-stage shortcut-mitigating methods in perception tasks.\n","authors":["Aahlad Puli","Lily Zhang","Yoav Wald","Rajesh Ranganath"],"pdf_url":"https://arxiv.org/pdf/2308.12553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09725v2","updated":"2023-08-24T04:38:45Z","published":"2023-08-17T10:49:48Z","title":"MoCLIM: Towards Accurate Cancer Subtyping via Multi-Omics Contrastive\n Learning with Omics-Inference Modeling","summary":" Precision medicine fundamentally aims to establish causality between\ndysregulated biochemical mechanisms and cancer subtypes. Omics-based cancer\nsubtyping has emerged as a revolutionary approach, as different level of omics\nrecords the biochemical products of multistep processes in cancers. This paper\nfocuses on fully exploiting the potential of multi-omics data to improve cancer\nsubtyping outcomes, and hence developed MoCLIM, a representation learning\nframework. MoCLIM independently extracts the informative features from distinct\nomics modalities. Using a unified representation informed by contrastive\nlearning of different omics modalities, we can well-cluster the subtypes, given\ncancer, into a lower latent space. This contrast can be interpreted as a\nprojection of inter-omics inference observed in biological networks.\nExperimental results on six cancer datasets demonstrate that our approach\nsignificantly improves data fit and subtyping performance in fewer\nhigh-dimensional cancer instances. Moreover, our framework incorporates various\nmedical evaluations as the final component, providing high interpretability in\nmedical analysis.\n","authors":["Ziwei Yang","Zheng Chen","Yasuko Matsubara","Yasushi Sakurai"],"pdf_url":"https://arxiv.org/pdf/2308.09725v2.pdf","comment":"CIKM'23 Long/Full Papers"},{"id":"http://arxiv.org/abs/2308.12551v1","updated":"2023-08-24T04:33:30Z","published":"2023-08-24T04:33:30Z","title":"A Co-training Approach for Noisy Time Series Learning","summary":" In this work, we focus on robust time series representation learning. Our\nassumption is that real-world time series is noisy and complementary\ninformation from different views of the same time series plays an important\nrole while analyzing noisy input. Based on this, we create two views for the\ninput time series through two different encoders. We conduct co-training based\ncontrastive learning iteratively to learn the encoders. Our experiments\ndemonstrate that this co-training approach leads to a significant improvement\nin performance. Especially, by leveraging the complementary information from\ndifferent views, our proposed TS-CoT method can mitigate the impact of data\nnoise and corruption. Empirical evaluations on four time series benchmarks in\nunsupervised and semi-supervised settings reveal that TS-CoT outperforms\nexisting methods. Furthermore, the representations learned by TS-CoT can\ntransfer well to downstream tasks through fine-tuning.\n","authors":["Weiqi Zhang","Jianfeng Zhang","Jia Li","Fugee Tsung"],"pdf_url":"https://arxiv.org/pdf/2308.12551v1.pdf","comment":"Accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2304.02169v2","updated":"2023-08-24T04:25:36Z","published":"2023-04-04T23:53:34Z","title":"Synthesize High-dimensional Longitudinal Electronic Health Records via\n Hierarchical Autoregressive Language Model","summary":" Synthetic electronic health records (EHRs) that are both realistic and\npreserve privacy can serve as an alternative to real EHRs for machine learning\n(ML) modeling and statistical analysis. However, generating high-fidelity and\ngranular electronic health record (EHR) data in its original,\nhighly-dimensional form poses challenges for existing methods due to the\ncomplexities inherent in high-dimensional data. In this paper, we propose\nHierarchical Autoregressive Language mOdel (HALO) for generating longitudinal\nhigh-dimensional EHR, which preserve the statistical properties of real EHR and\ncan be used to train accurate ML models without privacy concerns. Our HALO\nmethod, designed as a hierarchical autoregressive model, generates a\nprobability density function of medical codes, clinical visits, and patient\nrecords, allowing for the generation of realistic EHR data in its original,\nunaggregated form without the need for variable selection or aggregation.\nAdditionally, our model also produces high-quality continuous variables in a\nlongitudinal and probabilistic manner. We conducted extensive experiments and\ndemonstrate that HALO can generate high-fidelity EHR data with high-dimensional\ndisease code probabilities (d > 10,000), disease co-occurrence probabilities\nwithin visits (d > 1,000,000), and conditional probabilities across consecutive\nvisits (d > 5,000,000) and achieve above 0.9 R2 correlation in comparison to\nreal EHR data. This performance then enables downstream ML models trained on\nits synthetic data to achieve comparable accuracy to models trained on real\ndata (0.938 AUROC with HALO data vs. 0.943 with real data). Finally, using a\ncombination of real and synthetic data enhances the accuracy of ML models\nbeyond that achieved by using only real EHR data.\n","authors":["Brandon Theodorou","Cao Xiao","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2304.02169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07134v3","updated":"2023-08-24T03:54:45Z","published":"2023-08-14T13:41:09Z","title":"Natural Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models, such as ChatGPT,\nhas revolutionized various research fields in artificial intelligence.\nTransformers-based large language models (LLMs) have gradually replaced CNNs\nand RNNs to unify fields of computer vision and natural language processing.\nCompared with the data that exists relatively independently such as images,\nvideos or texts, graph is a type of data that contains rich structural and\nrelational information. Meanwhile, natural language, as one of the most\nexpressive mediums, excels in describing complex structures. However, existing\nwork on incorporating graph learning problems into the generative language\nmodeling framework remains very limited. As the importance of large language\nmodels continues to grow, it becomes essential to explore whether LLMs can also\nreplace GNNs as the foundation model for graphs. In this paper, we propose\nInstructGLM (Instruction-finetuned Graph Language Model), systematically design\nhighly scalable prompts based on natural language instructions, and use natural\nlanguage to describe the geometric structure and node features of the graph for\ninstruction tuning an LLM to perform learning and inference on graphs in a\ngenerative manner. Our method exceeds all competitive GNN baselines on\nogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of\nour method and sheds light on generative large language models as the\nfoundation model for graph machine learning.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v3.pdf","comment":"21 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.12539v1","updated":"2023-08-24T03:53:55Z","published":"2023-08-24T03:53:55Z","title":"CALM : A Multi-task Benchmark for Comprehensive Assessment of Language\n Model Bias","summary":" As language models (LMs) become increasingly powerful, it is important to\nquantify and compare them for sociodemographic bias with potential for harm.\nPrior bias measurement datasets are sensitive to perturbations in their\nmanually designed templates, therefore unreliable. To achieve reliability, we\nintroduce the Comprehensive Assessment of Language Model bias (CALM), a\nbenchmark dataset to quantify bias in LMs across three tasks. We integrate 16\nexisting datasets across different domains, such as Wikipedia and news\narticles, to filter 224 templates from which we construct a dataset of 78,400\nexamples. We compare the diversity of CALM with prior datasets on metrics such\nas average semantic similarity, and variation in template length, and test the\nsensitivity to small perturbations. We show that our dataset is more diverse\nand reliable than previous datasets, thus better capture the breadth of\nlinguistic variation required to reliably evaluate model bias. We evaluate 20\nlarge language models including six prominent families of LMs such as Llama-2.\nIn two LM series, OPT and Bloom, we found that larger parameter models are more\nbiased than lower parameter models. We found the T0 series of models to be the\nleast biased. Furthermore, we noticed a tradeoff between gender and racial bias\nwith increasing model size in some model series. The code is available at\nhttps://github.com/vipulgupta1011/CALM.\n","authors":["Vipul Gupta","Pranav Narayanan Venkit","Hugo Laurençon","Shomir Wilson","Rebecca J. Passonneau"],"pdf_url":"https://arxiv.org/pdf/2308.12539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12532v1","updated":"2023-08-24T03:43:02Z","published":"2023-08-24T03:43:02Z","title":"FedSoL: Bridging Global Alignment and Local Generality in Federated\n Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclient data distributions are heterogeneous. Many previous FL algorithms have\naddressed this issue by introducing various proximal restrictions. These\nrestrictions aim to encourage global alignment by constraining the deviation of\nlocal learning from the global objective. However, they inherently limit local\nlearning by interfering with the original local objectives. Recently, an\nalternative approach has emerged to improve local learning generality. By\nobtaining local models within a smooth loss landscape, this approach mitigates\nconflicts among different local objectives of the clients. Yet, it does not\nensure stable global alignment, as local learning does not take the global\nobjective into account. In this study, we propose Federated Stability on\nLearning (FedSoL), which combines both the concepts of global alignment and\nlocal generality. In FedSoL, the local learning seeks a parameter region robust\nagainst proximal perturbations. This strategy introduces an implicit proximal\nrestriction effect in local learning while maintaining the original local\nobjective for parameter update. Our experiments show that FedSoL consistently\nachieves state-of-the-art performance on various setups.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12530v1","updated":"2023-08-24T03:40:16Z","published":"2023-08-24T03:40:16Z","title":"SieveNet: Selecting Point-Based Features for Mesh Networks","summary":" Meshes are widely used in 3D computer vision and graphics, but their\nirregular topology poses challenges in applying them to existing neural network\narchitectures. Recent advances in mesh neural networks turn to remeshing and\npush the boundary of pioneer methods that solely take the raw meshes as input.\nAlthough the remeshing offers a regular topology that significantly facilitates\nthe design of mesh network architectures, features extracted from such remeshed\nproxies may struggle to retain the underlying geometry faithfully, limiting the\nsubsequent neural network's capacity. To address this issue, we propose\nSieveNet, a novel paradigm that takes into account both the regular topology\nand the exact geometry. Specifically, this method utilizes structured mesh\ntopology from remeshing and accurate geometric information from\ndistortion-aware point sampling on the surface of the original mesh.\nFurthermore, our method eliminates the need for hand-crafted feature\nengineering and can leverage off-the-shelf network architectures such as the\nvision transformer. Comprehensive experimental results on classification and\nsegmentation tasks well demonstrate the effectiveness and superiority of our\nmethod.\n","authors":["Shengchao Yuan","Yishun Dou","Rui Shi","Bingbing Ni","Zhong Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.12530v1.pdf","comment":"The project homepage is https://sievenet.github.io/"},{"id":"http://arxiv.org/abs/2308.12526v1","updated":"2023-08-24T03:30:38Z","published":"2023-08-24T03:30:38Z","title":"UNISOUND System for VoxCeleb Speaker Recognition Challenge 2023","summary":" This report describes the UNISOUND submission for Track1 and Track2 of\nVoxCeleb Speaker Recognition Challenge 2023 (VoxSRC 2023). We submit the same\nsystem on Track 1 and Track 2, which is trained with only VoxCeleb2-dev.\nLarge-scale ResNet and RepVGG architectures are developed for the challenge. We\npropose a consistency-aware score calibration method, which leverages the\nstability of audio voiceprints in similarity score by a Consistency Measure\nFactor (CMF). CMF brings a huge performance boost in this challenge. Our final\nsystem is a fusion of six models and achieves the first place in Track 1 and\nsecond place in Track 2 of VoxSRC 2023. The minDCF of our submission is 0.0855\nand the EER is 1.5880%.\n","authors":["Yu Zheng","Yajun Zhang","Chuanying Niu","Yibin Zhan","Yanhua Long","Dongxing Xu"],"pdf_url":"https://arxiv.org/pdf/2308.12526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11881v2","updated":"2023-08-24T03:16:55Z","published":"2023-08-23T02:58:02Z","title":"Adversarial Training Using Feedback Loops","summary":" Deep neural networks (DNN) have found wide applicability in numerous fields\ndue to their ability to accurately learn very complex input-output relations.\nDespite their accuracy and extensive use, DNNs are highly susceptible to\nadversarial attacks due to limited generalizability. For future progress in the\nfield, it is essential to build DNNs that are robust to any kind of\nperturbations to the data points. In the past, many techniques have been\nproposed to robustify DNNs using first-order derivative information of the\nnetwork.\n This paper proposes a new robustification approach based on control theory. A\nneural network architecture that incorporates feedback control, named Feedback\nNeural Networks, is proposed. The controller is itself a neural network, which\nis trained using regular and adversarial data such as to stabilize the system\noutputs. The novel adversarial training approach based on the feedback control\narchitecture is called Feedback Looped Adversarial Training (FLAT). Numerical\nresults on standard test problems empirically show that our FLAT method is more\neffective than the state-of-the-art to guard against adversarial attacks.\n","authors":["Ali Haisam Muhammad Rafid","Adrian Sandu"],"pdf_url":"https://arxiv.org/pdf/2308.11881v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12517v1","updated":"2023-08-24T03:06:20Z","published":"2023-08-24T03:06:20Z","title":"Not Only Rewards But Also Constraints: Applications on Legged Robot\n Locomotion","summary":" Several earlier studies have shown impressive control performance in complex\nrobotic systems by designing the controller using a neural network and training\nit with model-free reinforcement learning. However, these outstanding\ncontrollers with natural motion style and high task performance are developed\nthrough extensive reward engineering, which is a highly laborious and\ntime-consuming process of designing numerous reward terms and determining\nsuitable reward coefficients. In this work, we propose a novel reinforcement\nlearning framework for training neural network controllers for complex robotic\nsystems consisting of both rewards and constraints. To let the engineers\nappropriately reflect their intent to constraints and handle them with minimal\ncomputation overhead, two constraint types and an efficient policy optimization\nalgorithm are suggested. The learning framework is applied to train locomotion\ncontrollers for several legged robots with different morphology and physical\nattributes to traverse challenging terrains. Extensive simulation and\nreal-world experiments demonstrate that performant controllers can be trained\nwith significantly less reward engineering, by tuning only a single reward\ncoefficient. Furthermore, a more straightforward and intuitive engineering\nprocess can be utilized, thanks to the interpretability and generalizability of\nconstraints. The summary video is available at https://youtu.be/KAlm3yskhvM.\n","authors":["Yunho Kim","Hyunsik Oh","Jeonghyun Lee","Jinhyeok Choi","Gwanghyeon Ji","Moonkyu Jung","Donghoon Youm","Jemin Hwangbo"],"pdf_url":"https://arxiv.org/pdf/2308.12517v1.pdf","comment":"Submitted to Transactions on Robotics (T-RO)"},{"id":"http://arxiv.org/abs/2308.12510v1","updated":"2023-08-24T02:49:30Z","published":"2023-08-24T02:49:30Z","title":"Masked Autoencoders are Efficient Class Incremental Learners","summary":" Class Incremental Learning (CIL) aims to sequentially learn new classes while\navoiding catastrophic forgetting of previous knowledge. We propose to use\nMasked Autoencoders (MAEs) as efficient learners for CIL. MAEs were originally\ndesigned to learn useful representations through reconstructive unsupervised\nlearning, and they can be easily integrated with a supervised loss for\nclassification. Moreover, MAEs can reliably reconstruct original input images\nfrom randomly selected patches, which we use to store exemplars from past tasks\nmore efficiently for CIL. We also propose a bilateral MAE framework to learn\nfrom image-level and embedding-level fusion, which produces better-quality\nreconstructed images and more stable representations. Our experiments confirm\nthat our approach performs better than the state-of-the-art on CIFAR-100,\nImageNet-Subset, and ImageNet-Full. The code is available at\nhttps://github.com/scok30/MAE-CIL .\n","authors":["Jiang-Tian Zhai","Xialei Liu","Andrew D. Bagdanov","Ke Li","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.12510v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2208.11945v3","updated":"2023-08-24T01:53:24Z","published":"2022-08-25T09:02:32Z","title":"Efficient Adaptive Activation Rounding for Post-Training Quantization","summary":" Post-training quantization attracts increasing attention due to its\nconvenience in deploying quantized neural networks. Although\nrounding-to-nearest remains the prevailing method for DNN quantization, prior\nresearch has demonstrated its suboptimal nature when applied to weight\nquantization. They propose optimizing weight rounding schemes by leveraging\noutput error rather than the traditional weight quantization error. Our study\nreveals that similar rounding challenges also extend to activation\nquantization. Despite the easy generalization, the challenges lie in the\ndynamic nature of activation. Adaptive rounding is expected for varying\nactivations and the method is subjected to runtime overhead. To tackle this, we\npropose the AQuant quantization framework with a novel perspective to reduce\noutput error by adjusting rounding schemes of activations. Instead of using the\nconstant rounding border 0.5 of the rounding-to-nearest operation, we make the\nborder become a function w.r.t. the activation value to change the activation\nrounding by the adaptive border. To deal with the runtime overhead, we use a\ncoarse-grained version of the border function. Finally, we introduce our\nframework to optimize the border function. Extensive experiments show that\nAQuant achieves notable improvements compared to state-of-the-art works and\npushes the accuracy of ResNet-18 up to 60.31% under the 2-bit weight and\nactivation quantization.\n","authors":["Zhengyi Li","Cong Guo","Zhanda Zhu","Yangjie Zhou","Yuxian Qiu","Xiaotian Gao","Jingwen Leng","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2208.11945v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12497v1","updated":"2023-08-24T01:37:33Z","published":"2023-08-24T01:37:33Z","title":"False Information, Bots and Malicious Campaigns: Demystifying Elements\n of Social Media Manipulations","summary":" The rapid spread of false information and persistent manipulation attacks on\nonline social networks (OSNs), often for political, ideological, or financial\ngain, has affected the openness of OSNs. While researchers from various\ndisciplines have investigated different manipulation-triggering elements of\nOSNs (such as understanding information diffusion on OSNs or detecting\nautomated behavior of accounts), these works have not been consolidated to\npresent a comprehensive overview of the interconnections among these elements.\nNotably, user psychology, the prevalence of bots, and their tactics in relation\nto false information detection have been overlooked in previous research. To\naddress this research gap, this paper synthesizes insights from various\ndisciplines to provide a comprehensive analysis of the manipulation landscape.\nBy integrating the primary elements of social media manipulation (SMM),\nincluding false information, bots, and malicious campaigns, we extensively\nexamine each SMM element. Through a systematic investigation of prior research,\nwe identify commonalities, highlight existing gaps, and extract valuable\ninsights in the field. Our findings underscore the urgent need for\ninterdisciplinary research to effectively combat social media manipulations,\nand our systematization can guide future research efforts and assist OSN\nproviders in ensuring the safety and integrity of their platforms.\n","authors":["Mohammad Majid Akhtar","Rahat Masood","Muhammad Ikram","Salil S. Kanhere"],"pdf_url":"https://arxiv.org/pdf/2308.12497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12492v1","updated":"2023-08-24T01:26:31Z","published":"2023-08-24T01:26:31Z","title":"Optimizing Neural Network Scale for ECG Classification","summary":" We study scaling convolutional neural networks (CNNs), specifically targeting\nResidual neural networks (ResNet), for analyzing electrocardiograms (ECGs).\nAlthough ECG signals are time-series data, CNN-based models have been shown to\noutperform other neural networks with different architectures in ECG analysis.\nHowever, most previous studies in ECG analysis have overlooked the importance\nof network scaling optimization, which significantly improves performance. We\nexplored and demonstrated an efficient approach to scale ResNet by examining\nthe effects of crucial parameters, including layer depth, the number of\nchannels, and the convolution kernel size. Through extensive experiments, we\nfound that a shallower network, a larger number of channels, and smaller kernel\nsizes result in better performance for ECG classifications. The optimal network\nscale might differ depending on the target task, but our findings provide\ninsight into obtaining more efficient and accurate models with fewer computing\nresources or less time. In practice, we demonstrate that a narrower search\nspace based on our findings leads to higher performance.\n","authors":["Byeong Tak Lee","Yong-Yeon Jo","Joon-Myoung Kwon"],"pdf_url":"https://arxiv.org/pdf/2308.12492v1.pdf","comment":"30pages"},{"id":"http://arxiv.org/abs/2308.12481v1","updated":"2023-08-24T00:49:07Z","published":"2023-08-24T00:49:07Z","title":"Fall Detection using Knowledge Distillation Based Long short-term memory\n for Offline Embedded and Low Power Devices","summary":" This paper presents a cost-effective, low-power approach to unintentional\nfall detection using knowledge distillation-based LSTM (Long Short-Term Memory)\nmodels to significantly improve accuracy. With a primary focus on analyzing\ntime-series data collected from various sensors, the solution offers real-time\ndetection capabilities, ensuring prompt and reliable identification of falls.\nThe authors investigate fall detection models that are based on different\nsensors, comparing their accuracy rates and performance. Furthermore, they\nemploy the technique of knowledge distillation to enhance the models'\nprecision, resulting in refined accurate configurations that consume lower\npower. As a result, this proposed solution presents a compelling avenue for the\ndevelopment of energy-efficient fall detection systems for future advancements\nin this critical domain.\n","authors":["Hannah Zhou","Allison Chen","Celine Buer","Emily Chen","Kayleen Tang","Lauryn Gong","Zhiqi Liu","Jianbin Tang"],"pdf_url":"https://arxiv.org/pdf/2308.12481v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2306.10466v2","updated":"2023-08-24T00:28:53Z","published":"2023-06-18T03:33:46Z","title":"Graph Ladling: Shockingly Simple Parallel GNN Training without\n Intermediate Communication","summary":" Graphs are omnipresent and GNNs are a powerful family of neural networks for\nlearning over graphs. Despite their popularity, scaling GNNs either by\ndeepening or widening suffers from prevalent issues of unhealthy gradients,\nover-smoothening, information squashing, which often lead to sub-standard\nperformance. In this work, we are interested in exploring a principled way to\nscale GNNs capacity without deepening or widening, which can improve its\nperformance across multiple small and large graphs. Motivated by the recent\nintriguing phenomenon of model soups, which suggest that fine-tuned weights of\nmultiple large-language pre-trained models can be merged to a better minima, we\nargue to exploit the fundamentals of model soups to mitigate the aforementioned\nissues of memory bottleneck and trainability during GNNs scaling. More\nspecifically, we propose not to deepen or widen current GNNs, but instead\npresent a data-centric perspective of model soups tailored for GNNs, i.e., to\nbuild powerful GNNs. By dividing giant graph data, we build multiple\nindependently and parallelly trained weaker GNNs (soup ingredient) without any\nintermediate communication, and combine their strength using a greedy\ninterpolation soup procedure to achieve state-of-the-art performance. Compared\nto concurrent distributed GNN training works such as Jiong et. al. 2023, we\ntrain each soup ingredient by sampling different subgraphs per epoch and their\nrespective sub-models are merged only after being fully trained (rather than\nintermediately so). Moreover, we provide a wide variety of model soup\npreparation techniques by leveraging state-of-the-art graph sampling and graph\npartitioning approaches that can handle large graphs. Codes are available at:\n\\url{https://github.com/VITA-Group/graph_ladling}.\n","authors":["Ajay Jaiswal","Shiwei Liu","Tianlong Chen","Ying Ding","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2306.10466v2.pdf","comment":"Accepted in ICML 2023. Included comparison with a concurrent work\n (Jiong et. al. 2023) which independently presents similar ideas, among other\n SOTA distributed GNN training works"},{"id":"http://arxiv.org/abs/2308.13118v1","updated":"2023-08-24T23:49:27Z","published":"2023-08-24T23:49:27Z","title":"Business Metric-Aware Forecasting for Inventory Management","summary":" Time-series forecasts play a critical role in business planning. However,\nforecasters typically optimize objectives that are agnostic to downstream\nbusiness goals and thus can produce forecasts misaligned with business\npreferences. In this work, we demonstrate that optimization of conventional\nforecasting metrics can often lead to sub-optimal downstream business\nperformance. Focusing on the inventory management setting, we derive an\nefficient procedure for computing and optimizing proxies of common downstream\nbusiness metrics in an end-to-end differentiable manner. We explore a wide\nrange of plausible cost trade-off scenarios, and empirically demonstrate that\nend-to-end optimization often outperforms optimization of standard\nbusiness-agnostic forecasting metrics (by up to 45.7% for a simple scaling\nmodel, and up to 54.0% for an LSTM encoder-decoder model). Finally, we discuss\nhow our findings could benefit other business contexts.\n","authors":["Helen Zhou","Sercan O. Arik","Jingtao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13111v1","updated":"2023-08-24T23:06:21Z","published":"2023-08-24T23:06:21Z","title":"Bayesian low-rank adaptation for large language models","summary":" Parameter-efficient fine-tuning (PEFT) has emerged as a new paradigm for\ncost-efficient fine-tuning of large language models (LLMs), with low-rank\nadaptation (LoRA) being a widely adopted choice. However, fine-tuned LLMs often\nbecome overconfident especially on when fine-tuned on smaller datasets.\nBayesian methods, with their inherent ability to estimate uncertainty, serve as\npotent tools to mitigate overconfidence and enhance calibration. In this work,\nwe introduce Laplace-LoRA, a straightforward yet effective Bayesian method,\nwhich applies the Laplace approximation to the LoRA parameters and,\nconsiderably boosts the calibration of fine-tuned LLMs.\n","authors":["Adam X. Yang","Maxime Robeyns","Xi Wang","Laurence Aitchison"],"pdf_url":"https://arxiv.org/pdf/2308.13111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05527v3","updated":"2023-08-24T23:01:23Z","published":"2023-04-11T22:45:18Z","title":"Black Box Variational Inference with a Deterministic Objective: Faster,\n More Accurate, and Even More Black Box","summary":" Automatic differentiation variational inference (ADVI) offers fast and\neasy-to-use posterior approximation in multiple modern probabilistic\nprogramming languages. However, its stochastic optimizer lacks clear\nconvergence criteria and requires tuning parameters. Moreover, ADVI inherits\nthe poor posterior uncertainty estimates of mean-field variational Bayes\n(MFVB). We introduce \"deterministic ADVI\" (DADVI) to address these issues.\nDADVI replaces the intractable MFVB objective with a fixed Monte Carlo\napproximation, a technique known in the stochastic optimization literature as\nthe \"sample average approximation\" (SAA). By optimizing an approximate but\ndeterministic objective, DADVI can use off-the-shelf second-order optimization,\nand, unlike standard mean-field ADVI, is amenable to more accurate posterior\ncovariances via linear response (LR). In contrast to existing worst-case\ntheory, we show that, on certain classes of common statistical problems, DADVI\nand the SAA can perform well with relatively few samples even in very high\ndimensions, though we also show that such favorable results cannot extend to\nvariational approximations that are too expressive relative to mean-field ADVI.\nWe show on a variety of real-world problems that DADVI reliably finds good\nsolutions with default settings (unlike ADVI) and, together with LR\ncovariances, is typically faster and more accurate than standard ADVI.\n","authors":["Ryan Giordano","Martin Ingram","Tamara Broderick"],"pdf_url":"https://arxiv.org/pdf/2304.05527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13104v1","updated":"2023-08-24T22:36:22Z","published":"2023-08-24T22:36:22Z","title":"Contrastive Learning of Temporal Distinctiveness for Survival Analysis\n in Electronic Health Records","summary":" Survival analysis plays a crucial role in many healthcare decisions, where\nthe risk prediction for the events of interest can support an informative\noutlook for a patient's medical journey. Given the existence of data censoring,\nan effective way of survival analysis is to enforce the pairwise temporal\nconcordance between censored and observed data, aiming to utilize the time\ninterval before censoring as partially observed time-to-event labels for\nsupervised learning. Although existing studies mostly employed ranking methods\nto pursue an ordering objective, contrastive methods which learn a\ndiscriminative embedding by having data contrast against each other, have not\nbeen explored thoroughly for survival analysis. Therefore, in this paper, we\npropose a novel Ontology-aware Temporality-based Contrastive Survival (OTCSurv)\nanalysis framework that utilizes survival durations from both censored and\nobserved data to define temporal distinctiveness and construct negative sample\npairs with adjustable hardness for contrastive learning. Specifically, we first\nuse an ontological encoder and a sequential self-attention encoder to represent\nthe longitudinal EHR data with rich contexts. Second, we design a temporal\ncontrastive loss to capture varying survival durations in a supervised setting\nthrough a hardness-aware negative sampling mechanism. Last, we incorporate the\ncontrastive task into the time-to-event predictive task with multiple loss\ncomponents. We conduct extensive experiments using a large EHR dataset to\nforecast the risk of hospitalized patients who are in danger of developing\nacute kidney injury (AKI), a critical and urgent medical condition. The\neffectiveness and explainability of the proposed model are validated through\ncomprehensive quantitative and qualitative studies.\n","authors":["Mohsen Nayebi Kerdabadi","Arya Hadizadeh Moghaddam","Bin Liu","Mei Liu","Zijun Yao"],"pdf_url":"https://arxiv.org/pdf/2308.13104v1.pdf","comment":"This paper has been accepted for publication at the CIKM 2023\n conference"},{"id":"http://arxiv.org/abs/2201.08865v2","updated":"2023-08-24T21:58:18Z","published":"2022-01-21T19:18:42Z","title":"On the in vivo recognition of kidney stones using machine learning","summary":" Determining the type of kidney stones allows urologists to prescribe a\ntreatment to avoid recurrence of renal lithiasis. An automated in-vivo\nimage-based classification method would be an important step towards an\nimmediate identification of the kidney stone type required as a first phase of\nthe diagnosis. In the literature it was shown on ex-vivo data (i.e., in very\ncontrolled scene and image acquisition conditions) that an automated kidney\nstone classification is indeed feasible. This pilot study compares the kidney\nstone recognition performances of six shallow machine learning methods and\nthree deep-learning architectures which were tested with in-vivo images of the\nfour most frequent urinary calculi types acquired with an endoscope during\nstandard ureteroscopies. This contribution details the database construction\nand the design of the tested kidney stones classifiers. Even if the best\nresults were obtained by the Inception v3 architecture (weighted precision,\nrecall and F1-score of 0.97, 0.98 and 0.97, respectively), it is also shown\nthat choosing an appropriate colour space and texture features allows a shallow\nmachine learning method to approach closely the performances of the most\npromising deep-learning methods (the XGBoost classifier led to weighted\nprecision, recall and F1-score values of 0.96). This paper is the first one\nthat explores the most discriminant features to be extracted from images\nacquired during ureteroscopies.\n","authors":["Francisco Lopez-Tiro","Vincent Estrade","Jacques Hubert","Daniel Flores-Araiza","Miguel Gonzalez-Mendoza","Gilberto Ochoa-Ruiz","Christian Daul"],"pdf_url":"https://arxiv.org/pdf/2201.08865v2.pdf","comment":"Paper submitted to IEEE Access"},{"id":"http://arxiv.org/abs/2306.12589v2","updated":"2023-08-24T21:26:05Z","published":"2023-06-21T22:01:12Z","title":"Rapid building damage assessment workflow: An implementation for the\n 2023 Rolling Fork, Mississippi tornado event","summary":" Rapid and accurate building damage assessments from high-resolution satellite\nimagery following a natural disaster is essential to inform and optimize first\nresponder efforts. However, performing such building damage assessments in an\nautomated manner is non-trivial due to the challenges posed by variations in\ndisaster-specific damage, diversity in satellite imagery, and the dearth of\nextensive, labeled datasets. To circumvent these issues, this paper introduces\na human-in-the-loop workflow for rapidly training building damage assessment\nmodels after a natural disaster. This article details a case study using this\nworkflow, executed in partnership with the American Red Cross during a tornado\nevent in Rolling Fork, Mississippi in March, 2023. The output from our\nhuman-in-the-loop modeling process achieved a precision of 0.86 and recall of\n0.80 for damaged buildings when compared to ground truth data collected\npost-disaster. This workflow was implemented end-to-end in under 2 hours per\nsatellite imagery scene, highlighting its potential for real-time deployment.\n","authors":["Caleb Robinson","Simone Fobi Nsutezo","Anthony Ortiz","Tina Sederholm","Rahul Dodhia","Cameron Birge","Kasie Richards","Kris Pitcher","Paulo Duarte","Juan M. Lavista Ferres"],"pdf_url":"https://arxiv.org/pdf/2306.12589v2.pdf","comment":"Accepted at the 2023 ICCV Humanitarian Assistance and Disaster\n Response workshop"},{"id":"http://arxiv.org/abs/2308.13088v1","updated":"2023-08-24T21:16:03Z","published":"2023-08-24T21:16:03Z","title":"Racing Towards Reinforcement Learning based control of an Autonomous\n Formula SAE Car","summary":" With the rising popularity of autonomous navigation research, Formula Student\n(FS) events are introducing a Driverless Vehicle (DV) category to their event\nlist. This paper presents the initial investigation into utilising Deep\nReinforcement Learning (RL) for end-to-end control of an autonomous FS race car\nfor these competitions. We train two state-of-the-art RL algorithms in\nsimulation on tracks analogous to the full-scale design on a Turtlebot2\nplatform. The results demonstrate that our approach can successfully learn to\nrace in simulation and then transfer to a real-world racetrack on the physical\nplatform. Finally, we provide insights into the limitations of the presented\napproach and guidance into the future directions for applying RL toward\nfull-scale autonomous FS racing.\n","authors":["Aakaash Salvaji","Harry Taylor","David Valencia","Trevor Gee","Henry Williams"],"pdf_url":"https://arxiv.org/pdf/2308.13088v1.pdf","comment":"Accepted at the Australasian Conference on Robotics and Automation\n (ACRA 2022)"},{"id":"http://arxiv.org/abs/2308.13086v1","updated":"2023-08-24T21:11:55Z","published":"2023-08-24T21:11:55Z","title":"SHIELD: Sustainable Hybrid Evolutionary Learning Framework for Carbon,\n Wastewater, and Energy-Aware Data Center Management","summary":" Today's cloud data centers are often distributed geographically to provide\nrobust data services. But these geo-distributed data centers (GDDCs) have a\nsignificant associated environmental impact due to their increasing carbon\nemissions and water usage, which needs to be curtailed. Moreover, the energy\ncosts of operating these data centers continue to rise. This paper proposes a\nnovel framework to co-optimize carbon emissions, water footprint, and energy\ncosts of GDDCs, using a hybrid workload management framework called SHIELD that\nintegrates machine learning guided local search with a decomposition-based\nevolutionary algorithm. Our framework considers geographical factors and\ntime-based differences in power generation/use, costs, and environmental\nimpacts to intelligently manage workload distribution across GDDCs and data\ncenter operation. Experimental results show that SHIELD can realize 34.4x\nspeedup and 2.1x improvement in Pareto Hypervolume while reducing the carbon\nfootprint by up to 3.7x, water footprint by up to 1.8x, energy costs by up to\n1.3x, and a cumulative improvement across all objectives (carbon, water, cost)\nof up to 4.8x compared to the state-of-the-art.\n","authors":["Sirui Qi","Dejan Milojicic","Cullen Bash","Sudeep Pasricha"],"pdf_url":"https://arxiv.org/pdf/2308.13086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11490v2","updated":"2023-08-24T20:52:01Z","published":"2023-08-22T15:10:45Z","title":"Can Authorship Representation Learning Capture Stylistic Features?","summary":" Automatically disentangling an author's style from the content of their\nwriting is a longstanding and possibly insurmountable problem in computational\nlinguistics. At the same time, the availability of large text corpora furnished\nwith author labels has recently enabled learning authorship representations in\na purely data-driven manner for authorship attribution, a task that ostensibly\ndepends to a greater extent on encoding writing style than encoding content.\nHowever, success on this surrogate task does not ensure that such\nrepresentations capture writing style since authorship could also be correlated\nwith other latent variables, such as topic. In an effort to better understand\nthe nature of the information these representations convey, and specifically to\nvalidate the hypothesis that they chiefly encode writing style, we\nsystematically probe these representations through a series of targeted\nexperiments. The results of these experiments suggest that representations\nlearned for the surrogate authorship prediction task are indeed sensitive to\nwriting style. As a consequence, authorship representations may be expected to\nbe robust to certain kinds of data shift, such as topic drift over time.\nAdditionally, our findings may open the door to downstream applications that\nrequire stylistic representations, such as style transfer.\n","authors":["Andrew Wang","Cristina Aggazzotti","Rebecca Kotula","Rafael Rivera Soto","Marcus Bishop","Nicholas Andrews"],"pdf_url":"https://arxiv.org/pdf/2308.11490v2.pdf","comment":"appearing at TACL 2023"},{"id":"http://arxiv.org/abs/2308.13068v1","updated":"2023-08-24T20:24:12Z","published":"2023-08-24T20:24:12Z","title":"Multivariate Time Series Anomaly Detection: Fancy Algorithms and Flawed\n Evaluation Methodology","summary":" Multivariate Time Series (MVTS) anomaly detection is a long-standing and\nchallenging research topic that has attracted tremendous research effort from\nboth industry and academia recently. However, a careful study of the literature\nmakes us realize that 1) the community is active but not as organized as other\nsibling machine learning communities such as Computer Vision (CV) and Natural\nLanguage Processing (NLP), and 2) most proposed solutions are evaluated using\neither inappropriate or highly flawed protocols, with an apparent lack of\nscientific foundation. So flawed is one very popular protocol, the so-called\n\\pa protocol, that a random guess can be shown to systematically outperform\n\\emph{all} algorithms developed so far. In this paper, we review and evaluate\nmany recent algorithms using more robust protocols and discuss how a normally\ngood protocol may have weaknesses in the context of MVTS anomaly detection and\nhow to mitigate them. We also share our concerns about benchmark datasets,\nexperiment design and evaluation methodology we observe in many works.\nFurthermore, we propose a simple, yet challenging, baseline algorithm based on\nPrincipal Components Analysis (PCA) that surprisingly outperforms many recent\nDeep Learning (DL) based approaches on popular benchmark datasets. The main\nobjective of this work is to stimulate more effort towards important aspects of\nthe research such as data, experiment design, evaluation methodology and result\ninterpretability, instead of putting the highest weight on the design of\nincreasingly more complex and \"fancier\" algorithms.\n","authors":["Mohamed El Amine Sehili","Zonghua Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.13068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13066v1","updated":"2023-08-24T20:22:22Z","published":"2023-08-24T20:22:22Z","title":"Objective-Agnostic Enhancement of Molecule Properties via Multi-Stage\n VAE","summary":" Variational autoencoder (VAE) is a popular method for drug discovery and\nvarious architectures and pipelines have been proposed to improve its\nperformance. However, VAE approaches are known to suffer from poor manifold\nrecovery when the data lie on a low-dimensional manifold embedded in a higher\ndimensional ambient space [Dai and Wipf, 2019]. The consequences of it in drug\ndiscovery are somewhat under-explored. In this paper, we explore applying a\nmulti-stage VAE approach, that can improve manifold recovery on a synthetic\ndataset, to the field of drug discovery. We experimentally evaluate our\nmulti-stage VAE approach using the ChEMBL dataset and demonstrate its ability\nto improve the property statistics of generated molecules substantially from\npre-existing methods without incorporating property predictors into the\ntraining pipeline. We further fine-tune our models on two curated and much\nsmaller molecule datasets that target different proteins. Our experiments show\nan increase in the number of active molecules generated by the multi-stage VAE\nin comparison to their one-stage equivalent. For each of the two tasks, our\nbaselines include methods that use learned property predictors to incorporate\ntarget metrics directly into the training objective and we discuss\ncomplications that arise with this methodology.\n","authors":["Chenghui Zhou","Barnabas Poczos"],"pdf_url":"https://arxiv.org/pdf/2308.13066v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.02750"},{"id":"http://arxiv.org/abs/2308.13062v1","updated":"2023-08-24T20:04:36Z","published":"2023-08-24T20:04:36Z","title":"ZeroLeak: Using LLMs for Scalable and Cost Effective Side-Channel\n Patching","summary":" Security critical software, e.g., OpenSSL, comes with numerous side-channel\nleakages left unpatched due to a lack of resources or experts. The situation\nwill only worsen as the pace of code development accelerates, with developers\nrelying on Large Language Models (LLMs) to automatically generate code. In this\nwork, we explore the use of LLMs in generating patches for vulnerable code with\nmicroarchitectural side-channel leakages. For this, we investigate the\ngenerative abilities of powerful LLMs by carefully crafting prompts following a\nzero-shot learning approach. All generated code is dynamically analyzed by\nleakage detection tools, which are capable of pinpointing information leakage\nat the instruction level leaked either from secret dependent accesses or\nbranches or vulnerable Spectre gadgets, respectively. Carefully crafted prompts\nare used to generate candidate replacements for vulnerable code, which are then\nanalyzed for correctness and for leakage resilience. From a cost/performance\nperspective, the GPT4-based configuration costs in API calls a mere few cents\nper vulnerability fixed. Our results show that LLM-based patching is far more\ncost-effective and thus provides a scalable solution. Finally, the framework we\npropose will improve in time, especially as vulnerability detection tools and\nLLMs mature.\n","authors":["M. Caner Tol","Berk Sunar"],"pdf_url":"https://arxiv.org/pdf/2308.13062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09270v2","updated":"2023-08-24T20:01:16Z","published":"2023-04-18T20:17:08Z","title":"Coarse race data conceals disparities in clinical risk score performance","summary":" Healthcare data in the United States often records only a patient's coarse\nrace group: for example, both Indian and Chinese patients are typically coded\nas \"Asian.\" It is unknown, however, whether this coarse coding conceals\nmeaningful disparities in the performance of clinical risk scores across\ngranular race groups. Here we show that it does. Using data from 418K emergency\ndepartment visits, we assess clinical risk score performance disparities across\n26 granular groups for three outcomes, five risk scores, and four performance\nmetrics. Across outcomes and metrics, we show that the risk scores exhibit\nsignificant granular performance disparities within coarse race groups. In\nfact, variation in performance within coarse groups often *exceeds* the\nvariation between coarse groups. We explore why these disparities arise,\nfinding that outcome rates, feature distributions, and the relationships\nbetween features and outcomes all vary significantly across granular groups.\nOur results suggest that healthcare providers, hospital systems, and machine\nlearning researchers should strive to collect, release, and use granular race\ndata in place of coarse race data, and that existing analyses may significantly\nunderestimate racial disparities in performance.\n","authors":["Rajiv Movva","Divya Shanmugam","Kaihua Hou","Priya Pathak","John Guttag","Nikhil Garg","Emma Pierson"],"pdf_url":"https://arxiv.org/pdf/2304.09270v2.pdf","comment":"Published at MLHC 2023. v2 includes minor changes from the\n camera-ready, such as a link to code. Code is available at\n https://github.com/rmovva/granular-race-disparities_MLHC23"},{"id":"http://arxiv.org/abs/2302.00270v3","updated":"2023-08-24T19:43:02Z","published":"2023-02-01T06:25:46Z","title":"Internally Rewarded Reinforcement Learning","summary":" We study a class of reinforcement learning problems where the reward signals\nfor policy learning are generated by an internal reward model that is dependent\non and jointly optimized with the policy. This interdependence between the\npolicy and the reward model leads to an unstable learning process because\nreward signals from an immature reward model are noisy and impede policy\nlearning, and conversely, an under-optimized policy impedes reward estimation\nlearning. We call this learning setting $\\textit{Internally Rewarded\nReinforcement Learning}$ (IRRL) as the reward is not provided directly by the\nenvironment but $\\textit{internally}$ by a reward model. In this paper, we\nformally formulate IRRL and present a class of problems that belong to IRRL. We\ntheoretically derive and empirically analyze the effect of the reward function\nin IRRL and based on these analyses propose the clipped linear reward function.\nExperimental results show that the proposed reward function can consistently\nstabilize the training process by reducing the impact of reward noise, which\nleads to faster convergence and higher performance compared with baselines in\ndiverse tasks.\n","authors":["Mengdi Li","Xufeng Zhao","Jae Hee Lee","Cornelius Weber","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2302.00270v3.pdf","comment":"Accepted at ICML 2023. Update: adopt the term \"reward model\" instead\n of using \"critic\" to prevent confusion with the term \"critic\" in actor-critic\n algorithms. Project webpage at https://ir-rl.github.io"},{"id":"http://arxiv.org/abs/2308.13049v1","updated":"2023-08-24T19:35:58Z","published":"2023-08-24T19:35:58Z","title":"Bayesian Exploration Networks","summary":" Bayesian reinforcement learning (RL) offers a principled and elegant approach\nfor sequential decision making under uncertainty. Most notably, Bayesian agents\ndo not face an exploration/exploitation dilemma, a major pathology of\nfrequentist methods. A key challenge for Bayesian RL is the computational\ncomplexity of learning Bayes-optimal policies, which is only tractable in toy\ndomains. In this paper we propose a novel model-free approach to address this\nchallenge. Rather than modelling uncertainty in high-dimensional state\ntransition distributions as model-based approaches do, we model uncertainty in\na one-dimensional Bellman operator. Our theoretical analysis reveals that\nexisting model-free approaches either do not propagate epistemic uncertainty\nthrough the MDP or optimise over a set of contextual policies instead of all\nhistory-conditioned policies. Both approximations yield policies that can be\narbitrarily Bayes-suboptimal. To overcome these issues, we introduce the\nBayesian exploration network (BEN) which uses normalising flows to model both\nthe aleatoric uncertainty (via density estimation) and epistemic uncertainty\n(via variational inference) in the Bellman operator. In the limit of complete\noptimisation, BEN learns true Bayes-optimal policies, but like in variational\nexpectation-maximisation, partial optimisation renders our approach tractable.\nEmpirical results demonstrate that BEN can learn true Bayes-optimal policies in\ntasks where existing model-free approaches fail.\n","authors":["Mattie Fellows","Brandon Kaplowitz","Christian Schroeder de Witt","Shimon Whiteson"],"pdf_url":"https://arxiv.org/pdf/2308.13049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13047v1","updated":"2023-08-24T19:27:59Z","published":"2023-08-24T19:27:59Z","title":"Federated Learning of Causal Effects from Incomplete Observational Data","summary":" Decentralized and incomplete data sources are prevalent in real-world\napplications, posing a formidable challenge for causal inference. These sources\ncannot be consolidated into a single entity owing to privacy constraints, and\nthe presence of missing values within them can potentially introduce bias to\nthe causal estimands. We introduce a new approach for federated causal\ninference from incomplete data, enabling the estimation of causal effects from\nmultiple decentralized and incomplete data sources. Our approach disentangles\nthe loss function into multiple components, each corresponding to a specific\ndata source with missing values. Our approach accounts for the missing data\nunder the missing at random assumption, while also estimating higher-order\nstatistics of the causal estimands. Our method recovers the conditional\ndistribution of missing confounders given the observed confounders from the\ndecentralized data sources to identify causal effects. Our framework estimates\nheterogeneous causal effects without the sharing of raw training data among\nsources, which helps to mitigate privacy risks. The efficacy of our approach is\ndemonstrated through a collection of simulated and real-world instances,\nillustrating its potential and practicality.\n","authors":["Thanh Vinh Vo","Young lee","Tze-Yun Leong"],"pdf_url":"https://arxiv.org/pdf/2308.13047v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2212.09844v4","updated":"2023-08-24T19:05:08Z","published":"2022-12-19T20:41:44Z","title":"Robust Design and Evaluation of Predictive Algorithms under Unobserved\n Confounding","summary":" Predictive algorithms inform consequential decisions in settings where the\noutcome is selectively observed given some choices made by human decision\nmakers. There often exists unobserved confounders that affected the decision\nmaker's choice and the outcome. We propose a unified methodology for the robust\ndesign and evaluation of predictive algorithms in selectively observed data\nunder such unobserved confounding. Our approach imposes general assumptions on\nhow much the outcome may vary on average between unselected and selected units\nconditional on observed covariates and identified nuisance parameters,\nformalizing popular empirical strategies for imputing missing data such as\nproxy outcomes and instrumental variables. We develop debiased machine learning\nestimators for the bounds on a large class of predictive performance estimands,\nsuch as the conditional likelihood of the outcome, a predictive algorithm's\nmean square error, true/false positive rate, and many others, under these\nassumptions. In an administrative dataset from a large Australian financial\ninstitution, we illustrate how varying assumptions on unobserved confounding\nleads to meaningful changes in default risk predictions and evaluations of\ncredit scores across sensitive groups.\n","authors":["Ashesh Rambachan","Amanda Coston","Edward Kennedy"],"pdf_url":"https://arxiv.org/pdf/2212.09844v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13035v1","updated":"2023-08-24T19:00:26Z","published":"2023-08-24T19:00:26Z","title":"The intersection of video capsule endoscopy and artificial intelligence:\n addressing unique challenges using machine learning","summary":" Introduction: Technical burdens and time-intensive review processes limit the\npractical utility of video capsule endoscopy (VCE). Artificial intelligence\n(AI) is poised to address these limitations, but the intersection of AI and VCE\nreveals challenges that must first be overcome. We identified five challenges\nto address. Challenge #1: VCE data are stochastic and contains significant\nartifact. Challenge #2: VCE interpretation is cost-intensive. Challenge #3: VCE\ndata are inherently imbalanced. Challenge #4: Existing VCE AIMLT are\ncomputationally cumbersome. Challenge #5: Clinicians are hesitant to accept\nAIMLT that cannot explain their process.\n Methods: An anatomic landmark detection model was used to test the\napplication of convolutional neural networks (CNNs) to the task of classifying\nVCE data. We also created a tool that assists in expert annotation of VCE data.\nWe then created more elaborate models using different approaches including a\nmulti-frame approach, a CNN based on graph representation, and a few-shot\napproach based on meta-learning.\n Results: When used on full-length VCE footage, CNNs accurately identified\nanatomic landmarks (99.1%), with gradient weighted-class activation mapping\nshowing the parts of each frame that the CNN used to make its decision. The\ngraph CNN with weakly supervised learning (accuracy 89.9%, sensitivity of\n91.1%), the few-shot model (accuracy 90.8%, precision 91.4%, sensitivity\n90.9%), and the multi-frame model (accuracy 97.5%, precision 91.5%, sensitivity\n94.8%) performed well. Discussion: Each of these five challenges is addressed,\nin part, by one of our AI-based models. Our goal of producing high performance\nusing lightweight models that aim to improve clinician confidence was achieved.\n","authors":["Shan Guleria","Benjamin Schwartz","Yash Sharma","Philip Fernandes","James Jablonski","Sodiq Adewole","Sanjana Srivastava","Fisher Rhoads","Michael Porter","Michelle Yeghyayan","Dylan Hyatt","Andrew Copland","Lubaina Ehsan","Donald Brown","Sana Syed"],"pdf_url":"https://arxiv.org/pdf/2308.13035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09929v2","updated":"2023-08-24T18:58:47Z","published":"2022-10-18T15:20:47Z","title":"Differentially Private Diffusion Models","summary":" While modern machine learning models rely on increasingly large training\ndatasets, data is often limited in privacy-sensitive domains. Generative models\ntrained with differential privacy (DP) on sensitive data can sidestep this\nchallenge, providing access to synthetic data instead. We build on the recent\nsuccess of diffusion models (DMs) and introduce Differentially Private\nDiffusion Models (DPDMs), which enforce privacy using differentially private\nstochastic gradient descent (DP-SGD). We investigate the DM parameterization\nand the sampling algorithm, which turn out to be crucial ingredients in DPDMs,\nand propose noise multiplicity, a powerful modification of DP-SGD tailored to\nthe training of DMs. We validate our novel DPDMs on image generation benchmarks\nand achieve state-of-the-art performance in all experiments. Moreover, on\nstandard benchmarks, classifiers trained on DPDM-generated synthetic data\nperform on par with task-specific DP-SGD-trained classifiers, which has not\nbeen demonstrated before for DP generative models. Project page and code:\nhttps://nv-tlabs.github.io/DPDM.\n","authors":["Tim Dockhorn","Tianshi Cao","Arash Vahdat","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2210.09929v2.pdf","comment":"Accepted at TMLR (https://openreview.net/forum?id=ZPpQk7FJXF)"},{"id":"http://arxiv.org/abs/2308.13032v1","updated":"2023-08-24T18:58:10Z","published":"2023-08-24T18:58:10Z","title":"Financial News Analytics Using Fine-Tuned Llama 2 GPT Model","summary":" The paper considers the possibility to fine-tune Llama 2 Large Language Model\n(LLM) for the multitask analysis of financial news. For fine-tuning, the\nPEFT/LoRA based approach was used. In the study, the model was fine-tuned for\nthe following tasks: analysing a text from financial market perspectives,\nhighlighting main points of a text, summarizing a text and extracting named\nentities with appropriate sentiments. The obtained results show that the\nfine-tuned Llama 2 model can perform a multitask financial news analysis with a\nspecified structure of response, part of response can be a structured text and\nanother part of data can have JSON format for further processing. Extracted\nsentiments for named entities can be considered as predictive features in\nsupervised machine learning models with quantitative target variables.\n","authors":["Bohdan M. Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2308.13032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13028v1","updated":"2023-08-24T18:51:50Z","published":"2023-08-24T18:51:50Z","title":"Training Neural Networks with Universal Adiabatic Quantum Computing","summary":" The training of neural networks (NNs) is a computationally intensive task\nrequiring significant time and resources. This paper presents a novel approach\nto NN training using Adiabatic Quantum Computing (AQC), a paradigm that\nleverages the principles of adiabatic evolution to solve optimisation problems.\nWe propose a universal AQC method that can be implemented on gate quantum\ncomputers, allowing for a broad range of Hamiltonians and thus enabling the\ntraining of expressive neural networks. We apply this approach to various\nneural networks with continuous, discrete, and binary weights. Our results\nindicate that AQC can very efficiently find the global minimum of the loss\nfunction, offering a promising alternative to classical training methods.\n","authors":["Steve Abel","Juan Carlos Criado","Michael Spannowsky"],"pdf_url":"https://arxiv.org/pdf/2308.13028v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2111.01166v3","updated":"2023-08-24T18:42:27Z","published":"2021-11-01T18:00:14Z","title":"Dynamics of Local Elasticity During Training of Neural Nets","summary":" In the recent past, a property of neural training trajectories in\nweight-space had been isolated, that of \"local elasticity\" (denoted as $S_{\\rm\nrel}$). Local elasticity attempts to quantify the propagation of the influence\nof a sampled data point on the prediction at another data. In this work, we\nembark on a comprehensive study of the existing notion of $S_{\\rm rel}$ and\nalso propose a new definition that addresses the limitations that we point out\nfor the original definition in the classification setting. On various\nstate-of-the-art neural network training on SVHN, CIFAR-10 and CIFAR-100 we\ndemonstrate how our new proposal of $S_{\\rm rel}$, as opposed to the original\ndefinition, much more sharply detects the property of the weight updates\npreferring to make prediction changes within the same class as the sampled\ndata.\n In neural regression experiments we demonstrate that the original $S_{\\rm\nrel}$ reveals a $2-$phase behavior -- that the training proceeds via an initial\nelastic phase when $S_{\\rm rel}$ changes rapidly and an eventual inelastic\nphase when $S_{\\rm rel}$ remains large. We show that some of these properties\ncan be analytically reproduced in various instances of doing regression via\ngradient flows on model predictor classes.\n","authors":["Soham Dan","Anirbit Mukherjee","Avirup Das","Phanideep Gampa"],"pdf_url":"https://arxiv.org/pdf/2111.01166v3.pdf","comment":"40 pages (single column), the experiments have been significantly\n improved than the previous version"},{"id":"http://arxiv.org/abs/2308.13011v1","updated":"2023-08-24T18:23:59Z","published":"2023-08-24T18:23:59Z","title":"Extreme Risk Mitigation in Reinforcement Learning using Extreme Value\n Theory","summary":" Risk-sensitive reinforcement learning (RL) has garnered significant attention\nin recent years due to the growing interest in deploying RL agents in\nreal-world scenarios. A critical aspect of risk awareness involves modeling\nhighly rare risk events (rewards) that could potentially lead to catastrophic\noutcomes. These infrequent occurrences present a formidable challenge for\ndata-driven methods aiming to capture such risky events accurately. While\nrisk-aware RL techniques do exist, their level of risk aversion heavily relies\non the precision of the state-action value function estimation when modeling\nthese rare occurrences. Our work proposes to enhance the resilience of RL\nagents when faced with very rare and risky events by focusing on refining the\npredictions of the extreme values predicted by the state-action value function\ndistribution. To achieve this, we formulate the extreme values of the\nstate-action value function distribution as parameterized distributions,\ndrawing inspiration from the principles of extreme value theory (EVT). This\napproach effectively addresses the issue of infrequent occurrence by leveraging\nEVT-based parameterization. Importantly, we theoretically demonstrate the\nadvantages of employing these parameterized distributions in contrast to other\nrisk-averse algorithms. Our evaluations show that the proposed method\noutperforms other risk averse RL algorithms on a diverse range of benchmark\ntasks, each encompassing distinct risk scenarios.\n","authors":["Karthik Somayaji NS","Yu Wang","Malachi Schram","Jan Drgona","Mahantesh Halappanavar","Frank Liu","Peng Li"],"pdf_url":"https://arxiv.org/pdf/2308.13011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05501v2","updated":"2023-08-24T17:48:05Z","published":"2023-03-09T18:54:12Z","title":"PDSketch: Integrated Planning Domain Programming and Learning","summary":" This paper studies a model learning and online planning approach towards\nbuilding flexible and general robots. Specifically, we investigate how to\nexploit the locality and sparsity structures in the underlying environmental\ntransition model to improve model generalization, data-efficiency, and\nruntime-efficiency. We present a new domain definition language, named\nPDSketch. It allows users to flexibly define high-level structures in the\ntransition models, such as object and feature dependencies, in a way similar to\nhow programmers use TensorFlow or PyTorch to specify kernel sizes and hidden\ndimensions of a convolutional neural network. The details of the transition\nmodel will be filled in by trainable neural networks. Based on the defined\nstructures and learned parameters, PDSketch automatically generates\ndomain-independent planning heuristics without additional training. The derived\nheuristics accelerate the performance-time planning for novel goals.\n","authors":["Jiayuan Mao","Tomás Lozano-Pérez","Joshua B. Tenenbaum","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2303.05501v2.pdf","comment":"Minor typo fixes. NeurIPS 2022. Project page:\n https://pdsketch.csail.mit.edu"},{"id":"http://arxiv.org/abs/2202.08806v2","updated":"2023-08-24T17:46:12Z","published":"2022-02-17T18:19:53Z","title":"Grammar-Based Grounded Lexicon Learning","summary":" We present Grammar-Based Grounded Lexicon Learning (G2L2), a lexicalist\napproach toward learning a compositional and grounded meaning representation of\nlanguage from grounded data, such as paired images and texts. At the core of\nG2L2 is a collection of lexicon entries, which map each word to a tuple of a\nsyntactic type and a neuro-symbolic semantic program. For example, the word\nshiny has a syntactic type of adjective; its neuro-symbolic semantic program\nhas the symbolic form {\\lambda}x. filter(x, SHINY), where the concept SHINY is\nassociated with a neural network embedding, which will be used to classify\nshiny objects. Given an input sentence, G2L2 first looks up the lexicon entries\nassociated with each token. It then derives the meaning of the sentence as an\nexecutable neuro-symbolic program by composing lexical meanings based on\nsyntax. The recovered meaning programs can be executed on grounded inputs. To\nfacilitate learning in an exponentially-growing compositional space, we\nintroduce a joint parsing and expected execution algorithm, which does local\nmarginalization over derivations to reduce the training time. We evaluate G2L2\non two domains: visual reasoning and language-driven navigation. Results show\nthat G2L2 can generalize from small amounts of data to novel compositions of\nwords.\n","authors":["Jiayuan Mao","Haoyue Shi","Jiajun Wu","Roger P. Levy","Joshua B. Tenenbaum"],"pdf_url":"https://arxiv.org/pdf/2202.08806v2.pdf","comment":"Minor typo fixes. NeurIPS 2021. Project page:\n https://g2l2.csail.mit.edu/"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.12898v1","updated":"2023-08-24T16:17:40Z","published":"2023-08-24T16:17:40Z","title":"Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language\n Pretraining?","summary":" The multimedia community has shown a significant interest in perceiving and\nrepresenting the physical world with multimodal pretrained neural network\nmodels, and among them, the visual-language pertaining (VLP) is, currently, the\nmost captivating topic. However, there have been few endeavors dedicated to the\nexploration of 1) whether essential linguistic knowledge (e.g., semantics and\nsyntax) can be extracted during VLP, and 2) how such linguistic knowledge\nimpact or enhance the multimodal alignment. In response, here we aim to\nelucidate the impact of comprehensive linguistic knowledge, including semantic\nexpression and syntactic structure, on multimodal alignment. Specifically, we\ndesign and release the SNARE, the first large-scale multimodal alignment\nprobing benchmark, to detect the vital linguistic components, e.g., lexical,\nsemantic, and syntax knowledge, containing four tasks: Semantic structure,\nNegation logic, Attribute ownership, and Relationship composition. Based on our\nproposed probing benchmarks, our holistic analyses of five advanced VLP models\nillustrate that the VLP model: i) shows insensitivity towards complex syntax\nstructures and relies on content words for sentence comprehension; ii)\ndemonstrates limited comprehension of combinations between sentences and\nnegations; iii) faces challenges in determining the presence of actions or\nspatial relationships within visual information and struggles with verifying\nthe correctness of triple combinations. We make our benchmark and code\navailable at \\url{https://github.com/WangFei-2019/SNARE/}.\n","authors":["Fei Wang","Liang Ding","Jun Rao","Ye Liu","Li Shen","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2308.12898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12673v1","updated":"2023-08-24T09:31:02Z","published":"2023-08-24T09:31:02Z","title":"Masked Feature Modelling: Feature Masking for the Unsupervised\n Pre-training of a Graph Attention Network Block for Bottom-up Video Event\n Recognition","summary":" In this paper, we introduce Masked Feature Modelling (MFM), a novel approach\nfor the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM\nutilizes a pretrained Visual Tokenizer to reconstruct masked features of\nobjects within a video, leveraging the MiniKinetics dataset. We then\nincorporate the pre-trained GAT block into a state-of-the-art bottom-up\nsupervised video-event recognition architecture, ViGAT, to improve the model's\nstarting point and overall accuracy. Experimental evaluations on the YLI-MED\ndataset demonstrate the effectiveness of MFM in improving event recognition\nperformance.\n","authors":["Dimitrios Daskalakis","Nikolaos Gkalelis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2308.12673v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2307.07540v2","updated":"2023-08-24T09:11:26Z","published":"2023-07-14T14:09:09Z","title":"Flow-Guided Controllable Line Drawing Generation","summary":" In this paper, we investigate the problem of automatically controllable\nartistic character line drawing generation from photographs by proposing a\nVector Flow Aware and Line Controllable Image-to-Image Translation\narchitecture, which can be viewed as an appealing intersection between\nArtificial Intelligence and Arts. Specifically, we first present an\nImage-to-Flow network (I2FNet) to efficiently and robustly create the vector\nflow field in a learning-based manner, which can provide a direction guide for\ndrawing lines. Then, we introduce our well-designed Double Flow Generator (DFG)\nframework to fuse features from learned vector flow and input image flow\nguaranteeing the spatial coherence of lines. Meanwhile, in order to allow for\ncontrollable character line drawing generation, we integrate a Line Control\nMatrix (LCM) into DFG and train a Line Control Regressor (LCR) to synthesize\ndrawings with different styles by elaborately controlling the level of details,\nsuch as thickness, smoothness, and continuity, of lines. Finally, we design a\nFourier Transformation Loss to further constrain the character line generation\nfrom the frequency domain view of the point. Quantitative and qualitative\nexperiments demonstrate that our approach can obtain superior performance in\nproducing high-resolution character line-drawing images with perceptually\nrealistic characteristics.\n","authors":["Chengyu Fang","Xianfeng Han"],"pdf_url":"https://arxiv.org/pdf/2307.07540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12636v1","updated":"2023-08-24T08:22:21Z","published":"2023-08-24T08:22:21Z","title":"Exploring Transferability of Multimodal Adversarial Samples for\n Vision-Language Pre-training Models with Contrastive Learning","summary":" Vision-language pre-training models (VLP) are vulnerable, especially to\nmultimodal adversarial samples, which can be crafted by adding imperceptible\nperturbations on both original images and texts. However, under the black-box\nsetting, there have been no works to explore the transferability of multimodal\nadversarial attacks against the VLP models. In this work, we take CLIP as the\nsurrogate model and propose a gradient-based multimodal attack method to\ngenerate transferable adversarial samples against the VLP models. By applying\nthe gradient to optimize the adversarial images and adversarial texts\nsimultaneously, our method can better search for and attack the vulnerable\nimages and text information pairs. To improve the transferability of the\nattack, we utilize contrastive learning including image-text contrastive\nlearning and intra-modal contrastive learning to have a more generalized\nunderstanding of the underlying data distribution and mitigate the overfitting\nof the surrogate model so that the generated multimodal adversarial samples\nhave a higher transferability for VLP models. Extensive experiments validate\nthe effectiveness of the proposed method.\n","authors":["Youze Wang","Wenbo Hu","Yinpeng Dong","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2308.12636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12610v1","updated":"2023-08-24T07:20:47Z","published":"2023-08-24T07:20:47Z","title":"Emotion-Aligned Contrastive Learning Between Images and Music","summary":" Traditional music search engines rely on retrieval methods that match natural\nlanguage queries with music metadata. There have been increasing efforts to\nexpand retrieval methods to consider the audio characteristics of music itself,\nusing queries of various modalities including text, video, and speech. Most\napproaches aim to match general music semantics to the input queries, while\nonly a few focus on affective qualities. We address the task of retrieving\nemotionally-relevant music from image queries by proposing a framework for\nlearning an affective alignment between images and music audio. Our approach\nfocuses on learning an emotion-aligned joint embedding space between images and\nmusic. This joint embedding space is learned via emotion-supervised contrastive\nlearning, using an adapted cross-modal version of the SupCon loss. We directly\nevaluate the joint embeddings with cross-modal retrieval tasks (image-to-music\nand music-to-image) based on emotion labels. In addition, we investigate the\ngeneralizability of the learned music embeddings with automatic music tagging\nas a downstream task. Our experiments show that our approach successfully\naligns images and music, and that the learned embedding space is effective for\ncross-modal retrieval applications.\n","authors":["Shanti Stewart","Tiantian Feng","Kleanthis Avramidis","Shrikanth Narayanan"],"pdf_url":"https://arxiv.org/pdf/2308.12610v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2305.06152v2","updated":"2023-08-24T04:44:39Z","published":"2023-05-06T03:57:05Z","title":"Structure-CLIP: Towards Scene Graph Knowledge to Enhance Multi-modal\n Structured Representations","summary":" Large-scale vision-language pre-training has achieved significant performance\nin multi-modal understanding and generation tasks. However, existing methods\noften perform poorly on image-text matching tasks that require structured\nrepresentations, i.e., representations of objects, attributes, and relations.\nPrevious models cannot make a distinction between ``An astronaut rides a horse\"\nand ``A horse rides an astronaut\". This is because they fail to fully leverage\nstructured knowledge when learning representations in multi-modal scenarios. In\nthis paper, we present an end-to-end framework Structure-CLIP, which integrates\nScene Graph Knowledge (SGK) to enhance multi-modal structured representations.\nFirstly, we use scene graphs to guide the construction of semantic negative\nexamples, which results in an increased emphasis on learning structured\nrepresentations. Moreover, a Knowledge-Enhance Encoder (KEE) is proposed to\nleverage SGK as input to further enhance structured representations. To verify\nthe effectiveness of the proposed framework, we pre-train our model with the\naforementioned approaches and conduct experiments on downstream tasks.\nExperimental results demonstrate that Structure-CLIP achieves state-of-the-art\n(SOTA) performance on VG-Attribution and VG-Relation datasets, with 12.5% and\n4.1% ahead of the multi-modal SOTA model respectively. Meanwhile, the results\non MSCOCO indicate that Structure-CLIP significantly enhances the structured\nrepresentations while maintaining the ability of general representations. Our\ncode will be available soon.\n","authors":["Yufeng Huang","Jiji Tang","Zhuo Chen","Rongsheng Zhang","Xinfeng Zhang","Weijie Chen","Zeng Zhao","Zhou Zhao","Tangjie Lv","Zhipeng Hu","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.06152v2.pdf","comment":"Version 2.0. Improve grammar and experiments"},{"id":"http://arxiv.org/abs/2306.15401v3","updated":"2023-08-24T00:27:48Z","published":"2023-06-27T11:54:57Z","title":"Explainable Multimodal Emotion Reasoning","summary":" Multimodal emotion recognition is an active research topic in artificial\nintelligence. Its primary objective is to integrate multi-modalities (such as\nacoustic, visual, and lexical clues) to identify human emotional states.\nCurrent works generally assume accurate emotion labels for benchmark datasets\nand focus on developing more effective architectures. But due to the inherent\nsubjectivity of emotions, existing datasets often lack high annotation\nconsistency, resulting in potentially inaccurate labels. Consequently, models\nbuilt on these datasets may struggle to meet the demands of practical\napplications. To address this issue, it is crucial to enhance the reliability\nof emotion annotations. In this paper, we propose a novel task called\n``\\textbf{Explainable Multimodal Emotion Reasoning (EMER)}''. In contrast to\nprevious works that primarily focus on predicting emotions, EMER takes a step\nfurther by providing explanations for these predictions. The prediction is\nconsidered correct as long as the reasoning process behind the predicted\nemotion is plausible. This paper presents our initial efforts on EMER, where we\nintroduce a benchmark dataset, establish baseline models, and define evaluation\nmetrics. Meanwhile, we observe the necessity of integrating multi-faceted\ncapabilities to deal with EMER. Therefore, we propose the first multimodal\nlarge language model (LLM) in affective computing, called \\textbf{AffectGPT}.\nWe aim to tackle the long-standing challenge of label ambiguity and chart a\npath toward more reliable techniques. Furthermore, EMER offers an opportunity\nto evaluate the audio-video-text understanding capabilities of recent\nmultimodal LLM. To facilitate further research, we make the code and data\navailable at: https://github.com/zeroQiaoba/AffectGPT.\n","authors":["Zheng Lian","Licai Sun","Mingyu Xu","Haiyang Sun","Ke Xu","Zhuofan Wen","Shun Chen","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2306.15401v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13004v1","updated":"2023-08-24T18:07:37Z","published":"2023-08-24T18:07:37Z","title":"Spherical Vision Transformer for 360-degree Video Saliency Prediction","summary":" The growing interest in omnidirectional videos (ODVs) that capture the full\nfield-of-view (FOV) has gained 360-degree saliency prediction importance in\ncomputer vision. However, predicting where humans look in 360-degree scenes\npresents unique challenges, including spherical distortion, high resolution,\nand limited labelled data. We propose a novel vision-transformer-based model\nfor omnidirectional videos named SalViT360 that leverages tangent image\nrepresentations. We introduce a spherical geometry-aware spatiotemporal\nself-attention mechanism that is capable of effective omnidirectional video\nunderstanding. Furthermore, we present a consistency-based unsupervised\nregularization term for projection-based 360-degree dense-prediction models to\nreduce artefacts in the predictions that occur after inverse projection. Our\napproach is the first to employ tangent images for omnidirectional saliency\nprediction, and our experimental results on three ODV saliency datasets\ndemonstrate its effectiveness compared to the state-of-the-art.\n","authors":["Mert Cokelek","Nevrez Imamoglu","Cagri Ozcinar","Erkut Erdem","Aykut Erdem"],"pdf_url":"https://arxiv.org/pdf/2308.13004v1.pdf","comment":"12 pages, 4 figures, accepted to BMVC 2023"}]},"2023-08-25T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.13517v1","updated":"2023-08-25T17:51:23Z","published":"2023-08-25T17:51:23Z","title":"ChatGPT as Data Augmentation for Compositional Generalization: A Case\n Study in Open Intent Detection","summary":" Open intent detection, a crucial aspect of natural language understanding,\ninvolves the identification of previously unseen intents in user-generated\ntext. Despite the progress made in this field, challenges persist in handling\nnew combinations of language components, which is essential for compositional\ngeneralization. In this paper, we present a case study exploring the use of\nChatGPT as a data augmentation technique to enhance compositional\ngeneralization in open intent detection tasks. We begin by discussing the\nlimitations of existing benchmarks in evaluating this problem, highlighting the\nneed for constructing datasets for addressing compositional generalization in\nopen intent detection tasks. By incorporating synthetic data generated by\nChatGPT into the training process, we demonstrate that our approach can\neffectively improve model performance. Rigorous evaluation of multiple\nbenchmarks reveals that our method outperforms existing techniques and\nsignificantly enhances open intent detection capabilities. Our findings\nunderscore the potential of large language models like ChatGPT for data\naugmentation in natural language understanding tasks.\n","authors":["Yihao Fang","Xianzhi Li","Stephen W. Thomas","Xiaodan Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.13517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13506v1","updated":"2023-08-25T17:31:46Z","published":"2023-08-25T17:31:46Z","title":"Training and Meta-Evaluating Machine Translation Evaluation Metrics at\n the Paragraph Level","summary":" As research on machine translation moves to translating text beyond the\nsentence level, it remains unclear how effective automatic evaluation metrics\nare at scoring longer translations. In this work, we first propose a method for\ncreating paragraph-level data for training and meta-evaluating metrics from\nexisting sentence-level data. Then, we use these new datasets to benchmark\nexisting sentence-level metrics as well as train learned metrics at the\nparagraph level. Interestingly, our experimental results demonstrate that using\nsentence-level metrics to score entire paragraphs is equally as effective as\nusing a metric designed to work at the paragraph level. We speculate this\nresult can be attributed to properties of the task of reference-based\nevaluation as well as limitations of our datasets with respect to capturing all\ntypes of phenomena that occur in paragraph-level translations.\n","authors":["Daniel Deutsch","Juraj Juraska","Mara Finkelstein","and Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2308.13506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13497v1","updated":"2023-08-25T17:13:20Z","published":"2023-08-25T17:13:20Z","title":"Ngambay-French Neural Machine Translation (sba-Fr)","summary":" In Africa, and the world at large, there is an increasing focus on developing\nNeural Machine Translation (NMT) systems to overcome language barriers. NMT for\nLow-resource language is particularly compelling as it involves learning with\nlimited labelled data. However, obtaining a well-aligned parallel corpus for\nlow-resource languages can be challenging. The disparity between the\ntechnological advancement of a few global languages and the lack of research on\nNMT for local languages in Chad is striking. End-to-end NMT trials on\nlow-resource Chad languages have not been attempted. Additionally, there is a\ndearth of online and well-structured data gathering for research in Natural\nLanguage Processing, unlike some African languages. However, a guided approach\nfor data gathering can produce bitext data for many Chadian language\ntranslation pairs with well-known languages that have ample data. In this\nproject, we created the first sba-Fr Dataset, which is a corpus of\nNgambay-to-French translations, and fine-tuned three pre-trained models using\nthis dataset. Our experiments show that the M2M100 model outperforms other\nmodels with high BLEU scores on both original and original+synthetic data. The\npublicly available bitext dataset can be used for research purposes.\n","authors":["Sakayo Toadoum Sari","Angela Fan","Lema Logamou Seknewna"],"pdf_url":"https://arxiv.org/pdf/2308.13497v1.pdf","comment":"Accepted at RANLP 2023 - International Workshop NLP tools and\n resources for translation and interpreting applications"},{"id":"http://arxiv.org/abs/2308.13479v1","updated":"2023-08-25T16:35:06Z","published":"2023-08-25T16:35:06Z","title":"Prompting a Large Language Model to Generate Diverse Motivational\n Messages: A Comparison with Human-Written Messages","summary":" Large language models (LLMs) are increasingly capable and prevalent, and can\nbe used to produce creative content. The quality of content is influenced by\nthe prompt used, with more specific prompts that incorporate examples generally\nproducing better results. On from this, it could be seen that using\ninstructions written for crowdsourcing tasks (that are specific and include\nexamples to guide workers) could prove effective LLM prompts. To explore this,\nwe used a previous crowdsourcing pipeline that gave examples to people to help\nthem generate a collectively diverse corpus of motivational messages. We then\nused this same pipeline to generate messages using GPT-4, and compared the\ncollective diversity of messages from: (1) crowd-writers, (2) GPT-4 using the\npipeline, and (3 & 4) two baseline GPT-4 prompts. We found that the LLM prompts\nusing the crowdsourcing pipeline caused GPT-4 to produce more diverse messages\nthan the two baseline prompts. We also discuss implications from messages\ngenerated by both human writers and LLMs.\n","authors":["Samuel Rhys Cox","Ashraf Abdul","Wei Tsang Ooi"],"pdf_url":"https://arxiv.org/pdf/2308.13479v1.pdf","comment":"3 pages, 1 figure, 1 table, to be published in Proceedings of the\n 11th International Conference on Human-Agent Interaction (ACM HAI'23)"},{"id":"http://arxiv.org/abs/2308.12219v2","updated":"2023-08-25T16:32:31Z","published":"2023-08-23T16:01:12Z","title":"Diffusion Language Models Can Perform Many Tasks with Scaling and\n Instruction-Finetuning","summary":" The recent surge of generative AI has been fueled by the generative power of\ndiffusion probabilistic models and the scalable capabilities of large language\nmodels. Despite their potential, it remains elusive whether diffusion language\nmodels can solve general language tasks comparable to their autoregressive\ncounterparts. This paper demonstrates that scaling diffusion models w.r.t.\ndata, sizes, and tasks can effectively make them strong language learners. We\nbuild competent diffusion language models at scale by first acquiring knowledge\nfrom massive data via masked language modeling pretraining thanks to their\nintrinsic connections. We then reprogram pretrained masked language models into\ndiffusion language models via diffusive adaptation, wherein task-specific\nfinetuning and instruction finetuning are explored to unlock their versatility\nin solving general language tasks. Experiments show that scaling diffusion\nlanguage models consistently improves performance across downstream language\ntasks. We further discover that instruction finetuning can elicit zero-shot and\nfew-shot in-context learning abilities that help tackle many unseen tasks by\nfollowing natural language instructions, and show promise in advanced and\nchallenging abilities such as reasoning.\n","authors":["Jiasheng Ye","Zaixiang Zheng","Yu Bao","Lihua Qian","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.12219v2.pdf","comment":"added references"},{"id":"http://arxiv.org/abs/2308.13467v1","updated":"2023-08-25T16:11:08Z","published":"2023-08-25T16:11:08Z","title":"Leveraging Knowledge and Reinforcement Learning for Enhanced Reliability\n of Language Models","summary":" The Natural Language Processing(NLP) community has been using crowd sourcing\ntechniques to create benchmark datasets such as General Language Understanding\nand Evaluation(GLUE) for training modern Language Models such as BERT. GLUE\ntasks measure the reliability scores using inter annotator metrics i.e. Cohens\nKappa. However, the reliability aspect of LMs has often been overlooked. To\ncounter this problem, we explore a knowledge-guided LM ensembling approach that\nleverages reinforcement learning to integrate knowledge from ConceptNet and\nWikipedia as knowledge graph embeddings. This approach mimics human annotators\nresorting to external knowledge to compensate for information deficits in the\ndatasets. Across nine GLUE datasets, our research shows that ensembling\nstrengthens reliability and accuracy scores, outperforming state of the art.\n","authors":["Nancy Tyagi","Surjodeep Sarkar","Manas Gaur"],"pdf_url":"https://arxiv.org/pdf/2308.13467v1.pdf","comment":"Accepted at CIKM'23"},{"id":"http://arxiv.org/abs/2306.02207v3","updated":"2023-08-25T16:10:18Z","published":"2023-06-03T22:35:27Z","title":"SpeechGen: Unlocking the Generative Power of Speech Language Models with\n Prompts","summary":" Large language models (LLMs) have gained considerable attention for\nArtificial Intelligence Generated Content (AIGC), particularly with the\nemergence of ChatGPT. However, the direct adaptation of continuous speech to\nLLMs that process discrete tokens remains an unsolved challenge, hindering the\napplication of LLMs for speech generation. The advanced speech LMs are in the\ncorner, as that speech signals encapsulate a wealth of information, including\nspeaker and emotion, beyond textual data alone. Prompt tuning has demonstrated\nnotable gains in parameter efficiency and competitive performance on some\nspeech classification tasks. However, the extent to which prompts can\neffectively elicit generation tasks from speech LMs remains an open question.\nIn this paper, we present pioneering research that explores the application of\nprompt tuning to stimulate speech LMs for various generation tasks, within a\nunified framework called SpeechGen, with around 10M trainable parameters. The\nproposed unified framework holds great promise for efficiency and\neffectiveness, particularly with the imminent arrival of advanced speech LMs,\nwhich will significantly enhance the capabilities of the framework. The code\nand demos of SpeechGen will be available on the project website:\n\\url{https://ga642381.github.io/SpeechPrompt/speechgen}\n","authors":["Haibin Wu","Kai-Wei Chang","Yuan-Kuei Wu","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2306.02207v3.pdf","comment":"Work in progress. The first three authors contributed equally"},{"id":"http://arxiv.org/abs/2308.13458v1","updated":"2023-08-25T16:06:06Z","published":"2023-08-25T16:06:06Z","title":"ARTIST: ARTificial Intelligence for Simplified Text","summary":" Complex text is a major barrier for many citizens when accessing public\ninformation and knowledge. While often done manually, Text Simplification is a\nkey Natural Language Processing task that aims for reducing the linguistic\ncomplexity of a text while preserving the original meaning. Recent advances in\nGenerative Artificial Intelligence (AI) have enabled automatic text\nsimplification both on the lexical and syntactical levels. However, as\napplications often focus on English, little is understood about the\neffectiveness of Generative AI techniques on low-resource languages such as\nDutch. For this reason, we carry out empirical studies to understand the\nbenefits and limitations of applying generative technologies for text\nsimplification and provide the following outcomes: 1) the design and\nimplementation for a configurable text simplification pipeline that\norchestrates state-of-the-art generative text simplification models, domain and\nreader adaptation, and visualisation modules; 2) insights and lessons learned,\nshowing the strengths of automatic text simplification while exposing the\nchallenges in handling cultural and commonsense knowledge. These outcomes\nrepresent a first step in the exploration of Dutch text simplification and shed\nlight on future endeavours both for research and practice.\n","authors":["Lorenzo Corti","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13458v1.pdf","comment":"6 pages, 1 figure. Presented at the 'Generative AI and HCI' workshop\n (https://generativeaiandhci.github.io/) at CHI 2023 in Hamburg, Germany"},{"id":"http://arxiv.org/abs/2308.13449v1","updated":"2023-08-25T15:51:15Z","published":"2023-08-25T15:51:15Z","title":"The Poison of Alignment","summary":" From the perspective of content safety issues, alignment has shown to limit\nlarge language models' (LLMs) harmful content generation. This intentional\nmethod of reinforcing models to not respond to certain user inputs seem to be\npresent in many modern open-source instruction tuning datasets such as\nOpenAssistant or Guanaco. We introduce a novel insight to an instruction-tuned\nmodel's performance affected by the presence of alignment in supervised\nfine-tuning dataset. To be specific, we noticed that alignment acts as if it is\npoisoning the instruction dataset. Experimentally, we demonstrate that aligned\nanswers significantly worsen the performance of the resulting fine-tuned\nmodel's on various reasoning benchmarks such as Big Bench (BBH), Massive\nMultitask Language Understanding (MMLU), Human Eval, and Discrete Reasoning\nOver Paragraphs (DROP), performing worse than the counterpart tuned without\nalignment by 4-33%.\n","authors":["Aibek Bekbayev","Sungbae Chun","Yerzat Dulat","James Yamazaki"],"pdf_url":"https://arxiv.org/pdf/2308.13449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01423v2","updated":"2023-08-25T15:13:46Z","published":"2023-08-01T02:08:13Z","title":"ChatMOF: An Autonomous AI System for Predicting and Generating\n Metal-Organic Frameworks","summary":" ChatMOF is an autonomous Artificial Intelligence (AI) system that is built to\npredict and generate metal-organic frameworks (MOFs). By leveraging a\nlarge-scale language model (GPT-4 and GPT-3.5-turbo), ChatMOF extracts key\ndetails from textual inputs and delivers appropriate responses, thus\neliminating the necessity for rigid structured queries. The system is comprised\nof three core components (i.e. an agent, a toolkit, and an evaluator) and it\nforms a robust pipeline that manages a variety of tasks, including data\nretrieval, property prediction, and structure generations. The study further\nexplores the merits and constraints of using large language models (LLMs) AI\nsystem in material sciences using and showcases its transformative potential\nfor future advancements.\n","authors":["Yeonghun Kang","Jihan Kim"],"pdf_url":"https://arxiv.org/pdf/2308.01423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13399v1","updated":"2023-08-25T14:23:40Z","published":"2023-08-25T14:23:40Z","title":"EntropyRank: Unsupervised Keyphrase Extraction via Side-Information\n Optimization for Language Model-based Text Compression","summary":" We propose an unsupervised method to extract keywords and keyphrases from\ntexts based on a pre-trained language model (LM) and Shannon's information\nmaximization. Specifically, our method extracts phrases having the highest\nconditional entropy under the LM. The resulting set of keyphrases turns out to\nsolve a relevant information-theoretic problem: if provided as side\ninformation, it leads to the expected minimal binary code length in compressing\nthe text using the LM and an entropy encoder. Alternately, the resulting set is\nan approximation via a causal LM to the set of phrases that minimize the\nentropy of the text when conditioned upon it. Empirically, the method provides\nresults comparable to the most commonly used methods in various keyphrase\nextraction benchmark challenges.\n","authors":["Alexander Tsvetkov. Alon Kipnis"],"pdf_url":"https://arxiv.org/pdf/2308.13399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08961v2","updated":"2023-08-25T14:12:43Z","published":"2023-02-17T15:49:19Z","title":"Grimm in Wonderland: Prompt Engineering with Midjourney to Illustrate\n Fairytales","summary":" The quality of text-to-image generation is continuously improving, yet the\nboundaries of its applicability are still unclear. In particular, refinement of\nthe text input with the objective of achieving better results - commonly called\nprompt engineering - so far seems to have not been geared towards work with\npre-existing texts. We investigate whether text-to-image generation and prompt\nengineering could be used to generate basic illustrations of popular\nfairytales. Using Midjourney v4, we engage in action research with a dual aim:\nto attempt to generate 5 believable illustrations for each of 5 popular\nfairytales, and to define a prompt engineering process that starts from a\npre-existing text and arrives at an illustration of it. We arrive at a\ntentative 4-stage process: i) initial prompt, ii) composition adjustment, iii)\nstyle refinement, and iv) variation selection. We also discuss three reasons\nwhy the generation model struggles with certain illustrations: difficulties\nwith counts, bias from stereotypical configurations and inability to depict\noverly fantastic situations. Our findings are not limited to the specific\ngeneration model and are intended to be generalisable to future ones.\n","authors":["Martin Ruskov"],"pdf_url":"https://arxiv.org/pdf/2302.08961v2.pdf","comment":"19th Conference on Information and Research science Connecting to\n Digital and Library Science, February 23-24, 2023, Bari, Italy"},{"id":"http://arxiv.org/abs/2304.14454v3","updated":"2023-08-25T14:08:38Z","published":"2023-04-27T18:29:05Z","title":"PMC-LLaMA: Towards Building Open-source Language Models for Medicine","summary":" Recently, Large Language Models (LLMs) have showcased remarkable capabilities\nin natural language understanding. While demonstrating proficiency in everyday\nconversations and question-answering situations, these models frequently\nstruggle in domains that require precision, such as medical applications, due\nto their lack of domain-specific knowledge. In this paper, we describe the\nprocedure for building a powerful, open-source language model specifically\ndesigned for medicine applications, termed as PMC-LLaMA. Our contributions are\nthreefold: (i) we systematically investigate the process of adapting a\ngeneral-purpose foundation language model towards medical domain, this involves\ndata-centric knowledge injection through the integration of 4.8M biomedical\nacademic papers and 30K medical textbooks, as well as comprehensive fine-tuning\nfor alignment with domain-specific instructions; (ii) we contribute a\nlarge-scale, comprehensive dataset for instruction tuning. This dataset\nencompasses medical question-answering (QA), rationale for reasoning, and\nconversational dialogues, comprising a total of 202M tokens; (iii) we conduct\nthorough ablation studies to demonstrate the effectiveness of each proposed\ncomponent. While evaluating on various public medical question-answering\nbenchmarks, our lightweight PMCLLaMA, which consists of only 13 billion\nparameters, exhibits superior performance, even surpassing ChatGPT. All models,\ncodes, datasets can be found in https://github.com/chaoyi-wu/PMC-LLaMA.\n","authors":["Chaoyi Wu","Weixiong Lin","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2304.14454v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13387v1","updated":"2023-08-25T14:02:12Z","published":"2023-08-25T14:02:12Z","title":"Do-Not-Answer: A Dataset for Evaluating Safeguards in LLMs","summary":" With the rapid evolution of large language models (LLMs), new and\nhard-to-predict harmful capabilities are emerging. This requires developers to\nbe able to identify risks through the evaluation of \"dangerous capabilities\" in\norder to responsibly deploy LLMs. In this work, we collect the first\nopen-source dataset to evaluate safeguards in LLMs, and deploy safer\nopen-source LLMs at a low cost. Our dataset is curated and filtered to consist\nonly of instructions that responsible language models should not follow. We\nannotate and assess the responses of six popular LLMs to these instructions.\nBased on our annotation, we proceed to train several BERT-like classifiers, and\nfind that these small classifiers can achieve results that are comparable with\nGPT-4 on automatic safety evaluation. Warning: this paper contains example data\nthat may be offensive, harmful, or biased.\n","authors":["Yuxia Wang","Haonan Li","Xudong Han","Preslav Nakov","Timothy Baldwin"],"pdf_url":"https://arxiv.org/pdf/2308.13387v1.pdf","comment":"18 pages, 9 figures, 11 tables"},{"id":"http://arxiv.org/abs/2307.06954v2","updated":"2023-08-25T13:58:22Z","published":"2023-07-12T20:33:30Z","title":"ACTI at EVALITA 2023: Overview of the Conspiracy Theory Identification\n Task","summary":" Conspiracy Theory Identication task is a new shared task proposed for the\nfirst time at the Evalita 2023. The ACTI challenge, based exclusively on\ncomments published on conspiratorial channels of telegram, is divided into two\nsubtasks: (i) Conspiratorial Content Classification: identifying conspiratorial\ncontent and (ii) Conspiratorial Category Classification about specific\nconspiracy theory classification. A total of fifteen teams participated in the\ntask for a total of 81 submissions. We illustrate the best performing\napproaches were based on the utilization of large language models. We finally\ndraw conclusions about the utilization of these models for counteracting the\nspreading of misinformation in online platforms.\n","authors":["Giuseppe Russo","Niklas Stoehr","Manoel Horta Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2307.06954v2.pdf","comment":"Accepted at the Evalita Workshop 2023"},{"id":"http://arxiv.org/abs/2308.13383v1","updated":"2023-08-25T13:52:57Z","published":"2023-08-25T13:52:57Z","title":"Assessing Keyness using Permutation Tests","summary":" We propose a resampling-based approach for assessing keyness in corpus\nlinguistics based on suggestions by Gries (2006, 2022). Traditional approaches\nbased on hypothesis tests (e.g. Likelihood Ratio) model the copora as\nindependent identically distributed samples of tokens. This model does not\naccount for the often observed uneven distribution of occurences of a word\nacross a corpus. When occurences of a word are concentrated in few documents,\nlarge values of LLR and similar scores are in fact much more likely than\naccounted for by the token-by-token sampling model, leading to false positives.\n We replace the token-by-token sampling model by a model where corpora are\nsamples of documents rather than tokens, which is much closer to the way\ncorpora are actually assembled. We then use a permutation approach to\napproximate the distribution of a given keyness score under the null hypothesis\nof equal frequencies and obtain p-values for assessing significance. We do not\nneed any assumption on how the tokens are organized within or across documents,\nand the approach works with basically *any* keyness score. Hence, appart from\nobtaining more accurate p-values for scores like LLR, we can also assess\nsignificance for e.g. the logratio which has been proposed as a measure of\neffect size.\n An efficient implementation of the proposed approach is provided in the `R`\npackage `keyperm` available from github.\n","authors":["Thoralf Mildenberger"],"pdf_url":"https://arxiv.org/pdf/2308.13383v1.pdf","comment":"Software available under https://github.com/thmild/keyperm"},{"id":"http://arxiv.org/abs/2306.01015v2","updated":"2023-08-25T13:50:19Z","published":"2023-06-01T04:52:26Z","title":"How to Estimate Model Transferability of Pre-Trained Speech Models?","summary":" In this work, we introduce a \"score-based assessment\" framework for\nestimating the transferability of pre-trained speech models (PSMs) for\nfine-tuning target tasks. We leverage upon two representation theories,\nBayesian likelihood estimation and optimal transport, to generate rank scores\nfor the PSM candidates using the extracted representations. Our framework\nefficiently computes transferability scores without actual fine-tuning of\ncandidate models or layers by making a temporal independent hypothesis. We\nevaluate some popular supervised speech models (e.g., Conformer RNN-Transducer)\nand self-supervised speech models (e.g., HuBERT) in cross-layer and cross-model\nsettings using public data. Experimental results show a high Spearman's rank\ncorrelation and low $p$-value between our estimation framework and fine-tuning\nground truth. Our proposed transferability framework requires less\ncomputational time and resources, making it a resource-saving and\ntime-efficient approach for tuning speech foundation models.\n","authors":["Zih-Ching Chen","Chao-Han Huck Yang","Bo Li","Yu Zhang","Nanxin Chen","Shou-Yiin Chang","Rohit Prabhavalkar","Hung-yi Lee","Tara N. Sainath"],"pdf_url":"https://arxiv.org/pdf/2306.01015v2.pdf","comment":"Accepted to Interspeech. Code is available at:\n https://github.com/virginiakm1988/LogME-CTC"},{"id":"http://arxiv.org/abs/2308.08253v2","updated":"2023-08-25T13:40:31Z","published":"2023-08-16T09:45:06Z","title":"Benchmarking Neural Network Generalization for Grammar Induction","summary":" How well do neural networks generalize? Even for grammar induction tasks,\nwhere the target generalization is fully known, previous works have left the\nquestion open, testing very limited ranges beyond the training set and using\ndifferent success criteria. We provide a measure of neural network\ngeneralization based on fully specified formal languages. Given a model and a\nformal grammar, the method assigns a generalization score representing how well\na model generalizes to unseen samples in inverse relation to the amount of data\nit was trained on. The benchmark includes languages such as $a^nb^n$,\n$a^nb^nc^n$, $a^nb^mc^{n+m}$, and Dyck-1 and 2. We evaluate selected\narchitectures using the benchmark and find that networks trained with a Minimum\nDescription Length objective (MDL) generalize better and using less data than\nnetworks trained using standard loss functions. The benchmark is available at\nhttps://github.com/taucompling/bliss.\n","authors":["Nur Lan","Emmanuel Chemla","Roni Katzir"],"pdf_url":"https://arxiv.org/pdf/2308.08253v2.pdf","comment":"10 pages, 4 figures, 2 tables. Conference: Learning with Small Data\n 2023"},{"id":"http://arxiv.org/abs/2304.05253v2","updated":"2023-08-25T12:58:41Z","published":"2023-04-11T14:45:01Z","title":"Approximating Online Human Evaluation of Social Chatbots with Prompting","summary":" As conversational models become increasingly available to the general public,\nusers are engaging with this technology in social interactions. Such\nunprecedented interaction experiences may pose considerable social and\npsychological risks to the users unless the technology is properly controlled.\nThis highlights the need for scalable and robust evaluation metrics for\nconversational chatbots. Existing evaluation metrics aim to automate offline\nuser evaluation and approximate human judgment of pre-curated dialogs. However,\nthey are limited in their ability to capture subjective perceptions of users\nwho actually interact with the bots and might not generalize to real-world\nsettings. To address this limitation, we propose an approach to approximate\nonline human evaluation leveraging large language models (LLMs) from the GPT\nfamily. We introduce a new Dialog system Evaluation framework based on\nPrompting (DEP), which enables a fully automatic evaluation pipeline that\nreplicates live user studies and achieves an impressive correlation with human\njudgment (up to Pearson r=0.95 on a system level). The DEP approach involves\ncollecting synthetic chat logs of evaluated bots with an LLM in the other-play\nsetting, where the LLM is carefully conditioned to follow a specific scenario.\nWe further explore different prompting approaches to produce evaluation scores\nwith the same LLM. The best performing prompts, which contain few-shot\ndemonstrations and instructions, show outstanding performance on the tested\ndataset and demonstrate the ability to generalize to other dialog corpora.\n","authors":["Ekaterina Svikhnushina","Pearl Pu"],"pdf_url":"https://arxiv.org/pdf/2304.05253v2.pdf","comment":"accepted to SIGDIAL 2023 (long paper)"},{"id":"http://arxiv.org/abs/2308.13354v1","updated":"2023-08-25T12:57:59Z","published":"2023-08-25T12:57:59Z","title":"On the Impact of Language Selection for Training and Evaluating\n Programming Language Models","summary":" The recent advancements in Transformer-based Language Models have\ndemonstrated significant potential in enhancing the multilingual capabilities\nof these models. The remarkable progress made in this domain not only applies\nto natural language tasks but also extends to the domain of programming\nlanguages. Despite the ability of these models to learn from multiple\nlanguages, evaluations typically focus on particular combinations of the same\nlanguages. In this study, we evaluate the similarity of programming languages\nby analyzing their representations using a CodeBERT-based model. Our\nexperiments reveal that token representation in languages such as C++, Python,\nand Java exhibit proximity to one another, whereas the same tokens in languages\nsuch as Mathematica and R display significant dissimilarity. Our findings\nsuggest that this phenomenon can potentially result in performance challenges\nwhen dealing with diverse languages. Thus, we recommend using our similarity\nmeasure to select a diverse set of programming languages when training and\nevaluating future models.\n","authors":["Jonathan Katzy","Maliheh Izadi","Arie van Deursen"],"pdf_url":"https://arxiv.org/pdf/2308.13354v1.pdf","comment":"Accepted to 2023 IEEE 23rd International Working Conference on Source\n Code Analysis and Manipulation (SCAM), NIER track"},{"id":"http://arxiv.org/abs/2305.00969v4","updated":"2023-08-25T12:54:35Z","published":"2023-05-01T17:56:32Z","title":"CryCeleb: A Speaker Verification Dataset Based on Infant Cry Sounds","summary":" This paper describes the Ubenwa CryCeleb dataset - a labeled collection of\ninfant cries, and the accompanying CryCeleb 2023 task - a public speaker\nverification challenge based on infant cry sounds. We release for academic\nusage more than 6 hours of manually segmented cry sounds from 786 newborns to\nencourage research in infant cry analysis.\n","authors":["David Budaghyan","Charles C. Onu","Arsenii Gorin","Cem Subakan","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2305.00969v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12898v2","updated":"2023-08-25T12:22:53Z","published":"2023-08-24T16:17:40Z","title":"Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language\n Pretraining?","summary":" The multimedia community has shown a significant interest in perceiving and\nrepresenting the physical world with multimodal pretrained neural network\nmodels, and among them, the visual-language pertaining (VLP) is, currently, the\nmost captivating topic. However, there have been few endeavors dedicated to the\nexploration of 1) whether essential linguistic knowledge (e.g., semantics and\nsyntax) can be extracted during VLP, and 2) how such linguistic knowledge\nimpact or enhance the multimodal alignment. In response, here we aim to\nelucidate the impact of comprehensive linguistic knowledge, including semantic\nexpression and syntactic structure, on multimodal alignment. Specifically, we\ndesign and release the SNARE, the first large-scale multimodal alignment\nprobing benchmark, to detect the vital linguistic components, e.g., lexical,\nsemantic, and syntax knowledge, containing four tasks: Semantic structure,\nNegation logic, Attribute ownership, and Relationship composition. Based on our\nproposed probing benchmarks, our holistic analyses of five advanced VLP models\nillustrate that the VLP model: i) shows insensitivity towards complex syntax\nstructures and relies on content words for sentence comprehension; ii)\ndemonstrates limited comprehension of combinations between sentences and\nnegations; iii) faces challenges in determining the presence of actions or\nspatial relationships within visual information and struggles with verifying\nthe correctness of triple combinations. We make our benchmark and code\navailable at \\url{https://github.com/WangFei-2019/SNARE/}.\n","authors":["Fei Wang","Liang Ding","Jun Rao","Ye Liu","Li Shen","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2308.12898v2.pdf","comment":"[TL;DR] we design and release the SNARE, the first large-scale\n multimodal alignment probing benchmark for current vision-language pretrained\n models"},{"id":"http://arxiv.org/abs/2308.13315v1","updated":"2023-08-25T11:37:56Z","published":"2023-08-25T11:37:56Z","title":"Construction Grammar and Language Models","summary":" Recent progress in deep learning and natural language processing has given\nrise to powerful models that are primarily trained on a cloze-like task and\nshow some evidence of having access to substantial linguistic information,\nincluding some constructional knowledge. This groundbreaking discovery presents\nan exciting opportunity for a synergistic relationship between computational\nmethods and Construction Grammar research. In this chapter, we explore three\ndistinct approaches to the interplay between computational methods and\nConstruction Grammar: (i) computational methods for text analysis, (ii)\ncomputational Construction Grammar, and (iii) deep learning models, with a\nparticular focus on language models. We touch upon the first two approaches as\na contextual foundation for the use of computational methods before providing\nan accessible, yet comprehensive overview of deep learning models, which also\naddresses reservations construction grammarians may have. Additionally, we\ndelve into experiments that explore the emergence of constructionally relevant\ninformation within these models while also examining the aspects of\nConstruction Grammar that may pose challenges for these models. This chapter\naims to foster collaboration between researchers in the fields of natural\nlanguage processing and Construction Grammar. By doing so, we hope to pave the\nway for new insights and advancements in both these fields.\n","authors":["Harish Tayyar Madabushi","Laurence Romain","Petar Milin","Dagmar Divjak"],"pdf_url":"https://arxiv.org/pdf/2308.13315v1.pdf","comment":"Accepted for publication in The Cambridge Handbook of Construction\n Grammar, edited by Mirjam Fried and Kiki Nikiforidou. To appear in 2024"},{"id":"http://arxiv.org/abs/2305.00976v2","updated":"2023-08-25T09:35:46Z","published":"2023-05-02T17:52:41Z","title":"TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion\n Synthesis","summary":" In this paper, we present TMR, a simple yet effective approach for text to 3D\nhuman motion retrieval. While previous work has only treated retrieval as a\nproxy evaluation metric, we tackle it as a standalone task. Our method extends\nthe state-of-the-art text-to-motion synthesis model TEMOS, and incorporates a\ncontrastive loss to better structure the cross-modal latent space. We show that\nmaintaining the motion generation loss, along with the contrastive training, is\ncrucial to obtain good performance. We introduce a benchmark for evaluation and\nprovide an in-depth analysis by reporting results on several protocols. Our\nextensive experiments on the KIT-ML and HumanML3D datasets show that TMR\noutperforms the prior work by a significant margin, for example reducing the\nmedian rank from 54 to 19. Finally, we showcase the potential of our approach\non moment retrieval. Our code and models are publicly available at\nhttps://mathis.petrovich.fr/tmr.\n","authors":["Mathis Petrovich","Michael J. Black","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2305.00976v2.pdf","comment":"ICCV 2023 Camera Ready, project page:\n https://mathis.petrovich.fr/tmr/"},{"id":"http://arxiv.org/abs/2308.13259v1","updated":"2023-08-25T09:23:55Z","published":"2023-08-25T09:23:55Z","title":"Knowledge-Driven CoT: Exploring Faithful Reasoning in LLMs for\n Knowledge-intensive Question Answering","summary":" Equipped with Chain-of-Thought (CoT), Large language models (LLMs) have shown\nimpressive reasoning ability in various downstream tasks. Even so, suffering\nfrom hallucinations and the inability to access external knowledge, LLMs often\ncome with incorrect or unfaithful intermediate reasoning steps, especially in\nthe context of answering knowledge-intensive tasks such as KBQA. To alleviate\nthis issue, we propose a framework called Knowledge-Driven Chain-of-Thought\n(KD-CoT) to verify and modify reasoning traces in CoT via interaction with\nexternal knowledge, and thus overcome the hallucinations and error propagation.\nConcretely, we formulate the CoT rationale process of LLMs into a structured\nmulti-round QA format. In each round, LLMs interact with a QA system that\nretrieves external knowledge and produce faithful reasoning traces based on\nretrieved precise answers. The structured CoT reasoning of LLMs is facilitated\nby our developed KBQA CoT collection, which serves as in-context learning\ndemonstrations and can also be utilized as feedback augmentation to train a\nrobust retriever. Extensive experiments on WebQSP and ComplexWebQuestion\ndatasets demonstrate the effectiveness of proposed KD-CoT in task-solving\nreasoning generation, which outperforms the vanilla CoT ICL with an absolute\nsuccess rate of 8.0% and 5.1%. Furthermore, our proposed feedback-augmented\nretriever outperforms the state-of-the-art baselines for retrieving knowledge,\nachieving significant improvement in Hit performance.\n","authors":["Keheng Wang","Feiyu Duan","Sirui Wang","Peiguang Li","Yunsen Xian","Chuantao Yin","Wenge Rong","Zhang Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.13259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14107v2","updated":"2023-08-25T09:00:08Z","published":"2023-07-26T11:10:04Z","title":"Decoding ChatGPT: A Taxonomy of Existing Research, Current Challenges,\n and Possible Future Directions","summary":" Chat Generative Pre-trained Transformer (ChatGPT) has gained significant\ninterest and attention since its launch in November 2022. It has shown\nimpressive performance in various domains, including passing exams and creative\nwriting. However, challenges and concerns related to biases and trust persist.\nIn this work, we present a comprehensive review of over 100 Scopus-indexed\npublications on ChatGPT, aiming to provide a taxonomy of ChatGPT research and\nexplore its applications. We critically analyze the existing literature,\nidentifying common approaches employed in the studies. Additionally, we\ninvestigate diverse application areas where ChatGPT has found utility, such as\nhealthcare, marketing and financial services, software engineering, academic\nand scientific writing, research and education, environmental science, and\nnatural language processing. Through examining these applications, we gain\nvaluable insights into the potential of ChatGPT in addressing real-world\nchallenges. We also discuss crucial issues related to ChatGPT, including biases\nand trustworthiness, emphasizing the need for further research and development\nin these areas. Furthermore, we identify potential future directions for\nChatGPT research, proposing solutions to current challenges and speculating on\nexpected advancements. By fully leveraging the capabilities of ChatGPT, we can\nunlock its potential across various domains, leading to advancements in\nconversational AI and transformative impacts in society.\n","authors":["Shahab Saquib Sohail","Faiza Farhat","Yassine Himeur","Mohammad Nadeem","Dag Øivind Madsen","Yashbir Singh","Shadi Atalla","Wathiq Mansoor"],"pdf_url":"https://arxiv.org/pdf/2307.14107v2.pdf","comment":"31 pages. 8 figures and 3 tables"},{"id":"http://arxiv.org/abs/2308.12950v2","updated":"2023-08-25T08:51:22Z","published":"2023-08-24T17:39:13Z","title":"Code Llama: Open Foundation Models for Code","summary":" We release Code Llama, a family of large language models for code based on\nLlama 2 providing state-of-the-art performance among open models, infilling\ncapabilities, support for large input contexts, and zero-shot instruction\nfollowing ability for programming tasks. We provide multiple flavors to cover a\nwide range of applications: foundation models (Code Llama), Python\nspecializations (Code Llama - Python), and instruction-following models (Code\nLlama - Instruct) with 7B, 13B and 34B parameters each. All models are trained\non sequences of 16k tokens and show improvements on inputs with up to 100k\ntokens. 7B and 13B Code Llama and Code Llama - Instruct variants support\ninfilling based on surrounding content. Code Llama reaches state-of-the-art\nperformance among open models on several code benchmarks, with scores of up to\n53% and 55% on HumanEval and MBPP, respectively. Notably, Code Llama - Python\n7B outperforms Llama 2 70B on HumanEval and MBPP, and all our models outperform\nevery other publicly available model on MultiPL-E. We release Code Llama under\na permissive license that allows for both research and commercial use.\n","authors":["Baptiste Rozière","Jonas Gehring","Fabian Gloeckle","Sten Sootla","Itai Gat","Xiaoqing Ellen Tan","Yossi Adi","Jingyu Liu","Tal Remez","Jérémy Rapin","Artyom Kozhevnikov","Ivan Evtimov","Joanna Bitton","Manish Bhatt","Cristian Canton Ferrer","Aaron Grattafiori","Wenhan Xiong","Alexandre Défossez","Jade Copet","Faisal Azhar","Hugo Touvron","Louis Martin","Nicolas Usunier","Thomas Scialom","Gabriel Synnaeve"],"pdf_url":"https://arxiv.org/pdf/2308.12950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.06264v3","updated":"2023-08-25T08:50:34Z","published":"2022-02-13T08:58:04Z","title":"A Simplified Variant of Gödel's Ontological Argument","summary":" A simplified variant of G\\\"odel's ontological argument is presented. The\nsimplified argument is valid already in basic modal logics K or KT, it does not\nsuffer from modal collapse, and it avoids the rather complex predicates of\nessence (Ess.) and necessary existence (NE) as used by G\\\"odel. The variant\npresented has been obtained as a side result of a series of theory\nsimplification experiments conducted in interaction with a modern proof\nassistant system. The starting point for these experiments was the computer\nencoding of G\\\"odel's argument, and then automated reasoning techniques were\nsystematically applied to arrive at the simplified variant presented. The\npresented work thus exemplifies a fruitful human-computer interaction in\ncomputational metaphysics. Whether the presented result increases or decreases\nthe attractiveness and persuasiveness of the ontological argument is a question\nI would like to pass on to philosophy and theology.\n","authors":["Christoph Benzmüller"],"pdf_url":"https://arxiv.org/pdf/2202.06264v3.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.13207v1","updated":"2023-08-25T07:04:16Z","published":"2023-08-25T07:04:16Z","title":"LLM2KB: Constructing Knowledge Bases using instruction tuned context\n aware Large Language Models","summary":" The advent of Large Language Models (LLM) has revolutionized the field of\nnatural language processing, enabling significant progress in various\napplications. One key area of interest is the construction of Knowledge Bases\n(KB) using these powerful models. Knowledge bases serve as repositories of\nstructured information, facilitating information retrieval and inference tasks.\nOur paper proposes LLM2KB, a system for constructing knowledge bases using\nlarge language models, with a focus on the Llama 2 architecture and the\nWikipedia dataset. We perform parameter efficient instruction tuning for\nLlama-2-13b-chat and StableBeluga-13B by training small injection models that\nhave only 0.05 % of the parameters of the base models using the Low Rank\nAdaptation (LoRA) technique. These injection models have been trained with\nprompts that are engineered to utilize Wikipedia page contexts of subject\nentities fetched using a Dense Passage Retrieval (DPR) algorithm, to answer\nrelevant object entities for a given subject entity and relation. Our best\nperforming model achieved an average F1 score of 0.6185 across 21 relations in\nthe LM-KBC challenge held at the ISWC 2023 conference.\n","authors":["Anmol Nayak","Hari Prasad Timmapathini"],"pdf_url":"https://arxiv.org/pdf/2308.13207v1.pdf","comment":"16 pages, 1 figure, LM-KBC 2023 Challenge at International Semantic\n Web Conference 2023"},{"id":"http://arxiv.org/abs/2308.13198v1","updated":"2023-08-25T06:26:05Z","published":"2023-08-25T06:26:05Z","title":"Journey to the Center of the Knowledge Neurons: Discoveries of\n Language-Independent Knowledge Neurons and Degenerate Knowledge Neurons","summary":" Pre-trained language models (PLMs) contain vast amounts of factual knowledge,\nbut how the knowledge is stored in the parameters remains unclear. This paper\ndelves into the complex task of understanding how factual knowledge is stored\nin multilingual PLMs, and introduces the Architecture-adapted Multilingual\nIntegrated Gradients method, which successfully localizes knowledge neurons\nmore precisely compared to current methods, and is more universal across\nvarious architectures and languages. Moreover, we conduct an in-depth\nexploration of knowledge neurons, leading to the following two important\ndiscoveries: (1) The discovery of Language-Independent Knowledge Neurons, which\nstore factual knowledge in a form that transcends language. We design\ncross-lingual knowledge editing experiments, demonstrating that the PLMs can\naccomplish this task based on language-independent neurons; (2) The discovery\nof Degenerate Knowledge Neurons, a novel type of neuron showing that different\nknowledge neurons can store the same fact. Its property of functional overlap\nendows the PLMs with a robust mastery of factual knowledge. We design\nfact-checking experiments, proving that the degenerate knowledge neurons can\nhelp the PLMs to detect wrong facts. Experiments corroborate these findings,\nshedding light on the mechanisms of factual knowledge storage in multilingual\nPLMs, and contribute valuable insights to the field. The source code will be\nmade publicly available for further research.\n","authors":["Yuheng Chen","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.13198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13192v1","updated":"2023-08-25T06:05:57Z","published":"2023-08-25T06:05:57Z","title":"Formalising Natural Language Quantifiers for Human-Robot Interactions","summary":" We present a method for formalising quantifiers in natural language in the\ncontext of human-robot interactions. The solution is based on first-order logic\nextended with capabilities to represent the cardinality of variables, operating\nsimilarly to generalised quantifiers. To demonstrate the method, we designed an\nend-to-end system able to receive input as natural language, convert it into a\nformal logical representation, evaluate it, and return a result or send a\ncommand to a simulated robot.\n","authors":["Stefan Morar","Adrian Groza","Mihai Pomarlan"],"pdf_url":"https://arxiv.org/pdf/2308.13192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13191v1","updated":"2023-08-25T05:52:05Z","published":"2023-08-25T05:52:05Z","title":"Chunk, Align, Select: A Simple Long-sequence Processing Method for\n Transformers","summary":" Although dominant in natural language processing, transformer-based models\nremain challenged by the task of long-sequence processing, because the\ncomputational cost of self-attention operations in transformers swells\nquadratically with the input sequence length. To alleviate the complexity of\nlong-sequence processing, we propose a simple framework to enable the\noffthe-shelf pre-trained transformers to process much longer sequences, while\nthe computation and memory costs remain growing linearly with the input\nsequence lengths. More specifically, our method divides each long-sequence\ninput into a batch of chunks, then aligns the interchunk information during the\nencoding steps, and finally selects the most representative hidden states from\nthe encoder for the decoding process. To extract inter-chunk semantic\ninformation, we align the start and end token embeddings among chunks in each\nencoding transformer block. To learn an effective hidden selection policy, we\ndesign a dual updating scheme inspired by reinforcement learning, which regards\nthe decoders of transformers as environments, and the downstream performance\nmetrics as the rewards to evaluate the hidden selection actions. Our empirical\nresults on real-world long-text summarization and reading comprehension tasks\ndemonstrate effective improvements compared to prior longsequence processing\nbaselines.\n","authors":["Jiawen Xie","Pengyu Cheng","Xiao Liang","Yong Dai","Nan Du"],"pdf_url":"https://arxiv.org/pdf/2308.13191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13177v1","updated":"2023-08-25T04:54:32Z","published":"2023-08-25T04:54:32Z","title":"How to Evaluate the Generalization of Detection? A Benchmark for\n Comprehensive Open-Vocabulary Detection","summary":" Object detection (OD) in computer vision has made significant progress in\nrecent years, transitioning from closed-set labels to open-vocabulary detection\n(OVD) based on large-scale vision-language pre-training (VLP). However, current\nevaluation methods and datasets are limited to testing generalization over\nobject types and referral expressions, which do not provide a systematic,\nfine-grained, and accurate benchmark of OVD models' abilities. In this paper,\nwe propose a new benchmark named OVDEval, which includes 9 sub-tasks and\nintroduces evaluations on commonsense knowledge, attribute understanding,\nposition understanding, object relation comprehension, and more. The dataset is\nmeticulously created to provide hard negatives that challenge models' true\nunderstanding of visual and linguistic input. Additionally, we identify a\nproblem with the popular Average Precision (AP) metric when benchmarking models\non these fine-grained label datasets and propose a new metric called\nNon-Maximum Suppression Average Precision (NMS-AP) to address this issue.\nExtensive experimental results show that existing top OVD models all fail on\nthe new tasks except for simple object types, demonstrating the value of the\nproposed dataset in pinpointing the weakness of current OVD models and guiding\nfuture research. Furthermore, the proposed NMS-AP metric is verified by\nexperiments to provide a much more truthful evaluation of OVD models, whereas\ntraditional AP metrics yield deceptive results. Data is available at\n\\url{https://github.com/om-ai-lab/OVDEval}\n","authors":["Yiyang Yao","Peng Liu","Tiancheng Zhao","Qianqian Zhang","Jiajia Liao","Chunxin Fang","Kyusong Lee","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13173v1","updated":"2023-08-25T04:45:37Z","published":"2023-08-25T04:45:37Z","title":"DISGO: Automatic End-to-End Evaluation for Scene Text OCR","summary":" This paper discusses the challenges of optical character recognition (OCR) on\nnatural scenes, which is harder than OCR on documents due to the wild content\nand various image backgrounds. We propose to uniformly use word error rates\n(WER) as a new measurement for evaluating scene-text OCR, both end-to-end (e2e)\nperformance and individual system component performances. Particularly for the\ne2e metric, we name it DISGO WER as it considers Deletion, Insertion,\nSubstitution, and Grouping/Ordering errors. Finally we propose to utilize the\nconcept of super blocks to automatically compute BLEU scores for e2e OCR\nmachine translation. The small SCUT public test set is used to demonstrate WER\nperformance by a modularized OCR system.\n","authors":["Mei-Yuh Hwang","Yangyang Shi","Ankit Ramchandani","Guan Pang","Praveen Krishnan","Lucas Kabela","Frank Seide","Samyak Datta","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13173v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.13170v1","updated":"2023-08-25T04:19:58Z","published":"2023-08-25T04:19:58Z","title":"Measuring Spurious Correlation in Classification: 'Clever Hans' in\n Translationese","summary":" Recent work has shown evidence of 'Clever Hans' behavior in high-performance\nneural translationese classifiers, where BERT-based classifiers capitalize on\nspurious correlations, in particular topic information, between data and target\nclassification labels, rather than genuine translationese signals.\nTranslationese signals are subtle (especially for professional translation) and\ncompete with many other signals in the data such as genre, style, author, and,\nin particular, topic. This raises the general question of how much of the\nperformance of a classifier is really due to spurious correlations in the data\nversus the signals actually targeted for by the classifier, especially for\nsubtle target signals and in challenging (low resource) data settings. We focus\non topic-based spurious correlation and approach the question from two\ndirections: (i) where we have no knowledge about spurious topic information and\nits distribution in the data, (ii) where we have some indication about the\nnature of spurious topic correlations. For (i) we develop a measure from first\nprinciples capturing alignment of unsupervised topics with target\nclassification labels as an indication of spurious topic information in the\ndata. We show that our measure is the same as purity in clustering and propose\na 'topic floor' (as in a 'noise floor') for classification. For (ii) we\ninvestigate masking of known spurious topic carriers in classification. Both\n(i) and (ii) contribute to quantifying and (ii) to mitigating spurious\ncorrelations.\n","authors":["Angana Borah","Daria Pylypenko","Cristina Espana-Bonet","Josef van Genabith"],"pdf_url":"https://arxiv.org/pdf/2308.13170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13149v1","updated":"2023-08-25T03:05:33Z","published":"2023-08-25T03:05:33Z","title":"SciEval: A Multi-Level Large Language Model Evaluation Benchmark for\n Scientific Research","summary":" Recently, there has been growing interest in using Large Language Models\n(LLMs) for scientific research. Numerous benchmarks have been proposed to\nevaluate the ability of LLMs for scientific research. However, current\nbenchmarks are mostly based on pre-collected objective questions. This design\nsuffers from data leakage problem and lacks the evaluation of subjective Q/A\nability. In this paper, we propose SciEval, a comprehensive and\nmulti-disciplinary evaluation benchmark to address these issues. Based on\nBloom's taxonomy, SciEval covers four dimensions to systematically evaluate\nscientific research ability. In particular, we design a \"dynamic\" subset based\non scientific principles to prevent evaluation from potential data leakage.\nBoth objective and subjective questions are included in SciEval. These\ncharacteristics make SciEval a more effective benchmark for scientific research\nability evaluation of LLMs. Comprehensive experiments on most advanced LLMs\nshow that, although GPT-4 achieves SOTA performance compared to other LLMs,\nthere is still substantial room for improvement, especially for dynamic\nquestions. The data and codes are now publicly available.\n","authors":["Liangtai Sun","Yang Han","Zihan Zhao","Da Ma","Zhennan Shen","Baocai Chen","Lu Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2308.13149v1.pdf","comment":"12 pages, 17 figures, 12 tables. Under Review"},{"id":"http://arxiv.org/abs/2308.13139v1","updated":"2023-08-25T02:32:36Z","published":"2023-08-25T02:32:36Z","title":"MatchXML: An Efficient Text-label Matching Framework for Extreme\n Multi-label Text Classification","summary":" The eXtreme Multi-label text Classification(XMC) refers to training a\nclassifier that assigns a text sample with relevant labels from an extremely\nlarge-scale label set (e.g., millions of labels). We propose MatchXML, an\nefficient text-label matching framework for XMC. We observe that the label\nembeddings generated from the sparse Term Frequency-Inverse Document\nFrequency(TF-IDF) features have several limitations. We thus propose label2vec\nto effectively train the semantic dense label embeddings by the Skip-gram\nmodel. The dense label embeddings are then used to build a Hierarchical Label\nTree by clustering. In fine-tuning the pre-trained encoder Transformer, we\nformulate the multi-label text classification as a text-label matching problem\nin a bipartite graph. We then extract the dense text representations from the\nfine-tuned Transformer. Besides the fine-tuned dense text embeddings, we also\nextract the static dense sentence embeddings from a pre-trained Sentence\nTransformer. Finally, a linear ranker is trained by utilizing the sparse TF-IDF\nfeatures, the fine-tuned dense text representations and static dense sentence\nfeatures. Experimental results demonstrate that MatchXML achieves\nstate-of-the-art accuracy on five out of six datasets. As for the speed,\nMatchXML outperforms the competing methods on all the six datasets. Our source\ncode is publicly available at https://github.com/huiyegit/MatchXML.\n","authors":["Hui Ye","Rajshekhar Sunderraman","Shihao Ji"],"pdf_url":"https://arxiv.org/pdf/2308.13139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13137v1","updated":"2023-08-25T02:28:35Z","published":"2023-08-25T02:28:35Z","title":"OmniQuant: Omnidirectionally Calibrated Quantization for Large Language\n Models","summary":" Large language models (LLMs) have revolutionized natural language processing\ntasks. However, their practical deployment is hindered by their immense memory\nand computation requirements. Although recent post-training quantization (PTQ)\nmethods are effective in reducing memory footprint and improving the\ncomputational efficiency of LLM, they hand-craft quantization parameters, which\nleads to low performance and fails to deal with extremely low-bit quantization.\nTo tackle this issue, we introduce an Omnidirectionally calibrated Quantization\n(OmniQuant) technique for LLMs, which achieves good performance in diverse\nquantization settings while maintaining the computational efficiency of PTQ by\nefficiently optimizing various quantization parameters. OmniQuant comprises two\ninnovative components including Learnable Weight Clipping (LWC) and Learnable\nEquivalent Transformation (LET). LWC modulates the extreme values of weights by\noptimizing the clipping threshold. Meanwhile, LET tackles activation outliers\nby shifting the challenge of quantization from activations to weights through a\nlearnable equivalent transformation. Operating within a differentiable\nframework using block-wise error minimization, OmniQuant can optimize the\nquantization process efficiently for both weight-only and weight-activation\nquantization. For instance, the LLaMA-2 model family with the size of 7-70B can\nbe processed with OmniQuant on a single A100-40G GPU within 1-16 hours using\n128 samples. Extensive experiments validate OmniQuant's superior performance\nacross diverse quantization configurations such as W4A4, W6A6, W4A16, W3A16,\nand W2A16. Additionally, OmniQuant demonstrates effectiveness in\ninstruction-tuned models and delivers notable improvements in inference speed\nand memory reduction on real devices. Codes and models are available at\n\\url{https://github.com/OpenGVLab/OmniQuant}.\n","authors":["Wenqi Shao","Mengzhao Chen","Zhaoyang Zhang","Peng Xu","Lirui Zhao","Zhiqian Li","Kaipeng Zhang","Peng Gao","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.13137v1.pdf","comment":"A differentiable quantization method for LLM"},{"id":"http://arxiv.org/abs/2308.10370v2","updated":"2023-08-25T01:41:17Z","published":"2023-08-20T21:30:34Z","title":"cantnlp@LT-EDI-2023: Homophobia/Transphobia Detection in Social Media\n Comments using Spatio-Temporally Retrained Language Models","summary":" This paper describes our multiclass classification system developed as part\nof the LTEDI@RANLP-2023 shared task. We used a BERT-based language model to\ndetect homophobic and transphobic content in social media comments across five\nlanguage conditions: English, Spanish, Hindi, Malayalam, and Tamil. We\nretrained a transformer-based crosslanguage pretrained language model,\nXLMRoBERTa, with spatially and temporally relevant social media language data.\nWe also retrained a subset of models with simulated script-mixed social media\nlanguage data with varied performance. We developed the best performing\nseven-label classification system for Malayalam based on weighted macro\naveraged F1 score (ranked first out of six) with variable performance for other\nlanguage and class-label conditions. We found the inclusion of this\nspatio-temporal data improved the classification performance for all language\nand task conditions when compared with the baseline. The results suggests that\ntransformer-based language classification systems are sensitive to\nregister-specific and language-specific retraining.\n","authors":["Sidney G. -J. Wong","Matthew Durward","Benjamin Adams","Jonathan Dunn"],"pdf_url":"https://arxiv.org/pdf/2308.10370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11521v2","updated":"2023-08-25T00:25:06Z","published":"2023-08-16T09:04:36Z","title":"Self-Deception: Reverse Penetrating the Semantic Firewall of Large\n Language Models","summary":" Large language models (LLMs), such as ChatGPT, have emerged with astonishing\ncapabilities approaching artificial general intelligence. While providing\nconvenience for various societal needs, LLMs have also lowered the cost of\ngenerating harmful content. Consequently, LLM developers have deployed\nsemantic-level defenses to recognize and reject prompts that may lead to\ninappropriate content. Unfortunately, these defenses are not foolproof, and\nsome attackers have crafted \"jailbreak\" prompts that temporarily hypnotize the\nLLM into forgetting content defense rules and answering any improper questions.\nTo date, there is no clear explanation of the principles behind these\nsemantic-level attacks and defenses in both industry and academia.\n This paper investigates the LLM jailbreak problem and proposes an automatic\njailbreak method for the first time. We propose the concept of a semantic\nfirewall and provide three technical implementation approaches. Inspired by the\nattack that penetrates traditional firewalls through reverse tunnels, we\nintroduce a \"self-deception\" attack that can bypass the semantic firewall by\ninducing LLM to generate prompts that facilitate jailbreak. We generated a\ntotal of 2,520 attack payloads in six languages (English, Russian, French,\nSpanish, Chinese, and Arabic) across seven virtual scenarios, targeting the\nthree most common types of violations: violence, hate, and pornography. The\nexperiment was conducted on two models, namely the GPT-3.5-Turbo and GPT-4. The\nsuccess rates on the two models were 86.2% and 67%, while the failure rates\nwere 4.7% and 2.2%, respectively. This highlighted the effectiveness of the\nproposed attack method. All experimental code and raw data will be released as\nopen-source to inspire future research. We believe that manipulating AI\nbehavior through carefully crafted prompts will become an important research\ndirection in the future.\n","authors":["Zhenhua Wang","Wei Xie","Kai Chen","Baosheng Wang","Zhiwen Gui","Enze Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11521v2.pdf","comment":"Serious errors were found in the experiment, which may lead to the\n overturning of the overall conclusions of the paper"},{"id":"http://arxiv.org/abs/2308.11507v2","updated":"2023-08-25T00:07:50Z","published":"2023-08-22T15:28:49Z","title":"Unsupervised Prototype Adapter for Vision-Language Models","summary":" Recently, large-scale pre-trained vision-language models (e.g. CLIP and\nALIGN) have demonstrated remarkable effectiveness in acquiring transferable\nvisual representations. To leverage the valuable knowledge encoded within these\nmodels for downstream tasks, several fine-tuning approaches, including prompt\ntuning methods and adapter-based methods, have been developed to adapt\nvision-language models effectively with supervision. However, these methods\nrely on the availability of annotated samples, which can be labor-intensive and\ntime-consuming to acquire, thus limiting scalability. To address this issue, in\nthis work, we design an unsupervised fine-tuning approach for vision-language\nmodels called Unsupervised Prototype Adapter (UP-Adapter). Specifically, for\nthe unannotated target datasets, we leverage the text-image aligning capability\nof CLIP to automatically select the most confident samples for each class.\nUtilizing these selected samples, we generate class prototypes, which serve as\nthe initialization for the learnable prototype model. After fine-tuning, the\nprototype model prediction is combined with the original CLIP's prediction by a\nresidual connection to perform downstream recognition tasks. Our extensive\nexperimental results on image recognition and domain generalization show that\nthe proposed unsupervised method outperforms 8-shot CoOp, 8-shot Tip-Adapter,\nand also the state-of-the-art UPL method by large margins.\n","authors":["Yi Zhang","Ce Zhang","Xueting Hu","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2308.11507v2.pdf","comment":"Accepted by PRCV 2023"},{"id":"http://arxiv.org/abs/2308.13710v1","updated":"2023-08-25T23:50:05Z","published":"2023-08-25T23:50:05Z","title":"WellXplain: Wellness Concept Extraction and Classification in Reddit\n Posts for Mental Health Analysis","summary":" During the current mental health crisis, the importance of identifying\npotential indicators of mental issues from social media content has surged.\nOverlooking the multifaceted nature of mental and social well-being can have\ndetrimental effects on one's mental state. In traditional therapy sessions,\nprofessionals manually pinpoint the origins and outcomes of underlying mental\nchallenges, a process both detailed and time-intensive. We introduce an\napproach to this intricate mental health analysis by framing the identification\nof wellness dimensions in Reddit content as a wellness concept extraction and\ncategorization challenge. We've curated a unique dataset named WELLXPLAIN,\ncomprising 3,092 entries and totaling 72,813 words. Drawing from Halbert L.\nDunn's well-regarded wellness theory, our team formulated an annotation\nframework along with guidelines. This dataset also includes human-marked\ntextual segments, offering clear reasoning for decisions made in the wellness\nconcept categorization process. Our aim in publishing this dataset and\nanalyzing initial benchmarks is to spearhead the creation of advanced language\nmodels tailored for healthcare-focused concept extraction and categorization.\n","authors":["Muskan Garg"],"pdf_url":"https://arxiv.org/pdf/2308.13710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06828v2","updated":"2023-08-25T23:49:44Z","published":"2023-08-13T18:14:10Z","title":"An Ensemble Approach to Question Classification: Integrating Electra\n Transformer, GloVe, and LSTM","summary":" Natural Language Processing (NLP) has emerged as a crucial technology for\nunderstanding and generating human language, playing an essential role in tasks\nsuch as machine translation, sentiment analysis, and more pertinently, question\nclassification. As a subfield within NLP, question classification focuses on\ndetermining the type of information being sought, a fundamental step for\ndownstream applications like question answering systems. This study presents an\ninnovative ensemble approach for question classification, combining the\nstrengths of Electra, GloVe, and LSTM models. Rigorously tested on the\nwell-regarded TREC dataset, the model demonstrates how the integration of these\ndisparate technologies can lead to superior results. Electra brings in its\ntransformer-based capabilities for complex language understanding, GloVe offers\nglobal vector representations for capturing word-level semantics, and LSTM\ncontributes its sequence learning abilities to model long-term dependencies. By\nfusing these elements strategically, our ensemble model delivers a robust and\nefficient solution for the complex task of question classification. Through\nrigorous comparisons with well-known models like BERT, RoBERTa, and DistilBERT,\nthe ensemble approach verifies its effectiveness by attaining an 80% accuracy\nscore on the test dataset.\n","authors":["Sanad Aburass","Osama Dorgham"],"pdf_url":"https://arxiv.org/pdf/2308.06828v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13696v1","updated":"2023-08-25T22:57:53Z","published":"2023-08-25T22:57:53Z","title":"On the Depth between Beam Search and Exhaustive Search for Text\n Generation","summary":" Beam search and exhaustive search are two extreme ends of text decoding\nalgorithms with respect to the search depth. Beam search is limited in both\nsearch width and depth, whereas exhaustive search is a global search that has\nno such limitations. Surprisingly, beam search is not only computationally\ncheaper but also performs better than exhaustive search despite its higher\nsearch error. Plenty of research has investigated a range of beam widths, from\nsmall to large, and reported that a beam width that is neither too large nor\ntoo small is desirable. However, in terms of search depth, only the two extreme\nends, beam search and exhaustive search are studied intensively. In this paper,\nwe examine a range of search depths between the two extremes to discover the\ndesirable search depth. To this end, we introduce Lookahead Beam Search (LBS),\na multi-step lookahead search that optimizes the objective considering a fixed\nnumber of future steps. Beam search and exhaustive search are special cases of\nLBS where the lookahead depth is set to $0$ and $\\infty$, respectively. We\nempirically evaluate the performance of LBS and find that it outperforms beam\nsearch overall on machine translation tasks. The result suggests there is room\nfor improvement in beam search by searching deeper. Inspired by the analysis,\nwe propose Lookbehind Heuristic Beam Search, a computationally feasible search\nalgorithm that heuristically simulates LBS with 1-step lookahead. The empirical\nresults show that the proposed method outperforms vanilla beam search on\nmachine translation and text summarization tasks.\n","authors":["Yuu Jinnai","Tetsuro Morimura","Ukyo Honda"],"pdf_url":"https://arxiv.org/pdf/2308.13696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13687v1","updated":"2023-08-25T22:00:53Z","published":"2023-08-25T22:00:53Z","title":"1.5 million materials narratives generated by chatbots","summary":" The advent of artificial intelligence (AI) has enabled a comprehensive\nexploration of materials for various applications. However, AI models often\nprioritize frequently encountered materials in the scientific literature,\nlimiting the selection of suitable candidates based on inherent physical and\nchemical properties. To address this imbalance, we have generated a dataset of\n1,494,017 natural language-material paragraphs based on combined OQMD,\nMaterials Project, JARVIS, COD and AFLOW2 databases, which are dominated by ab\ninitio calculations and tend to be much more evenly distributed on the periodic\ntable. The generated text narratives were then polled and scored by both human\nexperts and ChatGPT-4, based on three rubrics: technical accuracy, language and\nstructure, and relevance and depth of content, showing similar scores but with\nhuman-scored depth of content being the most lagging. The merger of\nmulti-modality data sources and large language model (LLM) holds immense\npotential for AI frameworks to help the exploration and discovery of\nsolid-state materials for specific applications.\n","authors":["Yang Jeong Park","Sung Eun Jerng","Jin-Sung Park","Choah Kwon","Chia-Wei Hsu","Zhichu Ren","Sungroh Yoon","Ju Li"],"pdf_url":"https://arxiv.org/pdf/2308.13687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13676v1","updated":"2023-08-25T21:25:08Z","published":"2023-08-25T21:25:08Z","title":"Rethinking Language Models as Symbolic Knowledge Graphs","summary":" Symbolic knowledge graphs (KGs) play a pivotal role in knowledge-centric\napplications such as search, question answering and recommendation. As\ncontemporary language models (LMs) trained on extensive textual data have\ngained prominence, researchers have extensively explored whether the parametric\nknowledge within these models can match up to that present in knowledge graphs.\nVarious methodologies have indicated that enhancing the size of the model or\nthe volume of training data enhances its capacity to retrieve symbolic\nknowledge, often with minimal or no human supervision. Despite these\nadvancements, there is a void in comprehensively evaluating whether LMs can\nencompass the intricate topological and semantic attributes of KGs, attributes\ncrucial for reasoning processes. In this work, we provide an exhaustive\nevaluation of language models of varying sizes and capabilities. We construct\nnine qualitative benchmarks that encompass a spectrum of attributes including\nsymmetry, asymmetry, hierarchy, bidirectionality, compositionality, paths,\nentity-centricity, bias and ambiguity. Additionally, we propose novel\nevaluation metrics tailored for each of these attributes. Our extensive\nevaluation of various LMs shows that while these models exhibit considerable\npotential in recalling factual information, their ability to capture intricate\ntopological and semantic traits of KGs remains significantly constrained. We\nnote that our proposed evaluation metrics are more reliable in evaluating these\nabilities than the existing metrics. Lastly, some of our benchmarks challenge\nthe common notion that larger LMs (e.g., GPT-4) universally outshine their\nsmaller counterparts (e.g., BERT).\n","authors":["Vishwas Mruthyunjaya","Pouya Pezeshkpour","Estevam Hruschka","Nikita Bhutani"],"pdf_url":"https://arxiv.org/pdf/2308.13676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13646v1","updated":"2023-08-25T19:34:21Z","published":"2023-08-25T19:34:21Z","title":"GRASP: A Rehearsal Policy for Efficient Online Continual Learning","summary":" Continual learning (CL) in deep neural networks (DNNs) involves incrementally\naccumulating knowledge in a DNN from a growing data stream. A major challenge\nin CL is that non-stationary data streams cause catastrophic forgetting of\npreviously learned abilities. Rehearsal is a popular and effective way to\nmitigate this problem, which is storing past observations in a buffer and\nmixing them with new observations during learning. This leads to a question:\nWhich stored samples should be selected for rehearsal? Choosing samples that\nare best for learning, rather than simply selecting them at random, could lead\nto significantly faster learning. For class incremental learning, prior work\nhas shown that a simple class balanced random selection policy outperforms more\nsophisticated methods. Here, we revisit this question by exploring a new sample\nselection policy called GRASP. GRASP selects the most prototypical (class\nrepresentative) samples first and then gradually selects less prototypical\n(harder) examples to update the DNN. GRASP has little additional compute or\nmemory overhead compared to uniform selection, enabling it to scale to large\ndatasets. We evaluate GRASP and other policies by conducting CL experiments on\nthe large-scale ImageNet-1K and Places-LT image classification datasets. GRASP\noutperforms all other rehearsal policies. Beyond vision, we also demonstrate\nthat GRASP is effective for CL on five text classification datasets.\n","authors":["Md Yousuf Harun","Jhair Gallardo","Christopher Kanan"],"pdf_url":"https://arxiv.org/pdf/2308.13646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13590v1","updated":"2023-08-25T17:23:12Z","published":"2023-08-25T17:23:12Z","title":"LSTM-based QoE Evaluation for Web Microservices' Reputation Scoring","summary":" Sentiment analysis is the task of mining the authors' opinions about specific\nentities. It allows organizations to monitor different services in real time\nand act accordingly. Reputation is what is generally said or believed about\npeople or things. Informally, reputation combines the measure of reliability\nderived from feedback, reviews, and ratings gathered from users, which reflect\ntheir quality of experience (QoE) and can either increase or harm the\nreputation of the provided services. In this study, we propose to perform\nsentiment analysis on web microservices reviews to exploit the provided\ninformation to assess and score the microservices' reputation. Our proposed\napproach uses the Long Short-Term Memory (LSTM) model to perform sentiment\nanalysis and the Net Brand Reputation (NBR) algorithm to assess reputation\nscores for microservices. This approach is tested on a set of more than 10,000\nreviews related to 15 Amazon Web microservices, and the experimental results\nhave shown that our approach is more accurate than existing approaches, with an\naccuracy and precision of 93% obtained after applying an oversampling strategy\nand a resulting reputation score of the considered microservices community of\n89%.\n","authors":["Maha Driss"],"pdf_url":"https://arxiv.org/pdf/2308.13590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13577v1","updated":"2023-08-25T13:07:33Z","published":"2023-08-25T13:07:33Z","title":"Text Style Transfer Evaluation Using Large Language Models","summary":" Text Style Transfer (TST) is challenging to evaluate because the quality of\nthe generated text manifests itself in multiple aspects, each of which is hard\nto measure individually: style transfer accuracy, content preservation, and\noverall fluency of the text. Human evaluation is the gold standard in TST\nevaluation; however, it is expensive, and the results are difficult to\nreproduce. Numerous automated metrics are employed to assess performance in\nthese aspects, serving as substitutes for human evaluation. However, the\ncorrelation between many of these automated metrics and human evaluations\nremains unclear, raising doubts about their effectiveness as reliable\nbenchmarks. Recent advancements in Large Language Models (LLMs) have\ndemonstrated their ability to not only match but also surpass the average human\nperformance across a wide range of unseen tasks. This suggests that LLMs have\nthe potential to serve as a viable alternative to human evaluation and other\nautomated metrics. We assess the performance of different LLMs on TST\nevaluation by employing multiple input prompts and comparing their results. Our\nfindings indicate that (even zero-shot) prompting correlates strongly with\nhuman evaluation and often surpasses the performance of (other) automated\nmetrics. Additionally, we propose the ensembling of prompts and show it\nincreases the robustness of TST evaluation.This work contributes to the ongoing\nefforts in evaluating LLMs on diverse tasks, which includes a discussion of\nfailure cases and limitations.\n","authors":["Phil Ostheimer","Mayank Nagda","Marius Kloft","Sophie Fellenz"],"pdf_url":"https://arxiv.org/pdf/2308.13577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13576v1","updated":"2023-08-25T12:45:46Z","published":"2023-08-25T12:45:46Z","title":"An Ensemble Approach to Personalized Real Time Predictive Writing for\n Experts","summary":" Completing a sentence, phrase or word after typing few words / characters is\nvery helpful for Intuit financial experts, while taking notes or having a live\nchat with users, since they need to write complex financial concepts more\nefficiently and accurately many times in a day. In this paper, we tie together\ndifferent approaches like large language models, traditional Markov Models and\nchar level models to create an end-to-end system to provide personalised\nsentence/word auto-complete suggestions to experts, under strict latency\nconstraints. Proposed system can auto-complete sentences, phrases or words\nwhile writing with personalisation and can be trained with very less data and\nresources with good efficiency. Our proposed system is not only efficient and\npersonalized but also robust as it leverages multiple machine learning\ntechniques along with transfer learning approach to fine tune large language\nmodel with Intuit specific data. This ensures that even in cases of rare or\nunusual phrases, the system can provide relevant auto-complete suggestions in\nnear real time. Survey has showed that this system saves expert note-taking\ntime and boosts expert confidence in their communication with teammates and\nclients. Since enabling this predictive writing feature for QBLive experts,\nmore than a million keystrokes have been saved based on these suggestions. We\nhave done comparative study for our ensemble choice. Moreover this feature can\nbe integrated with any product which has writing facility within a very short\nperiod of time.\n","authors":["Sourav Prosad","Viswa Datha Polavarapu","Shrutendra Harsola"],"pdf_url":"https://arxiv.org/pdf/2308.13576v1.pdf","comment":"ACM SIGKDD Workshop on Machine Learning in Finance, 2023"},{"id":"http://arxiv.org/abs/2308.13345v1","updated":"2023-08-25T12:31:12Z","published":"2023-08-25T12:31:12Z","title":"Decoupled Structure for Improved Adaptability of End-to-End Models","summary":" Although end-to-end (E2E) trainable automatic speech recognition (ASR) has\nshown great success by jointly learning acoustic and linguistic information, it\nstill suffers from the effect of domain shifts, thus limiting potential\napplications. The E2E ASR model implicitly learns an internal language model\n(LM) which characterises the training distribution of the source domain, and\nthe E2E trainable nature makes the internal LM difficult to adapt to the target\ndomain with text-only data To solve this problem, this paper proposes decoupled\nstructures for attention-based encoder-decoder (Decoupled-AED) and neural\ntransducer (Decoupled-Transducer) models, which can achieve flexible domain\nadaptation in both offline and online scenarios while maintaining robust\nintra-domain performance. To this end, the acoustic and linguistic parts of the\nE2E model decoder (or prediction network) are decoupled, making the linguistic\ncomponent (i.e. internal LM) replaceable. When encountering a domain shift, the\ninternal LM can be directly replaced during inference by a target-domain LM,\nwithout re-training or using domain-specific paired speech-text data.\nExperiments for E2E ASR models trained on the LibriSpeech-100h corpus showed\nthat the proposed decoupled structure gave 15.1% and 17.2% relative word error\nrate reductions on the TED-LIUM 2 and AESRC2020 corpora while still maintaining\nperformance on intra-domain data.\n","authors":["Keqi Deng","Philip C. Woodland"],"pdf_url":"https://arxiv.org/pdf/2308.13345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13317v1","updated":"2023-08-25T11:41:05Z","published":"2023-08-25T11:41:05Z","title":"Transforming the Output of Generative Pre-trained Transformer: The\n Influence of the PGI Framework on Attention Dynamics","summary":" This paper presents a novel approach named Persona-Grouping-Intelligence\n(PGI), which has been crafted to tackle the challenges posed by GPT models when\napplied to real-world business issues. PGI leverages the inherent capabilities\nof the GPT model to comprehend intricate language structures and generate\nresponses that are contextually relevant. The experiment occurred in a business\nscenario where human intelligence was being underutilized due to less optimized\nbusiness processes. The primary objective of this approach is to leverage GPT\nmodels to reduce the workload on humans in tasks that are extensive,\nmonotonous, and repetitive. Instead, the focus is redirected toward\ndecision-making activities. Remarkably, the experiment yielded an accuracy rate\nof 93.81% in validating 4,000 responses generated by the model, underscoring\nthe effectiveness of the PGI strategies. Effectively addressing the issue of\nunderutilized human intelligence, this paradigm shift aligns business\nenvironments with dynamic machine intelligence, enabling them to navigate the\nintricacies of real-world challenges. This approach facilitates the practical\nutilization of these models to tackle actual problems. The methodology offers\nan opportunity to reshape the fundamental structure of business processes by\nseamlessly integrating human decision-making with adaptable machine\nintelligence. Consequently, this optimization enhances operational efficiency\nand elevates strategic decision-making across diverse business contexts.\n","authors":["Aline Ioste"],"pdf_url":"https://arxiv.org/pdf/2308.13317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13569v1","updated":"2023-08-25T05:25:05Z","published":"2023-08-25T05:25:05Z","title":"Discovering Mental Health Research Topics with Topic Modeling","summary":" Mental health significantly influences various aspects of our daily lives,\nand its importance has been increasingly recognized by the research community\nand the general public, particularly in the wake of the COVID-19 pandemic. This\nheightened interest is evident in the growing number of publications dedicated\nto mental health in the past decade. In this study, our goal is to identify\ngeneral trends in the field and pinpoint high-impact research topics by\nanalyzing a large dataset of mental health research papers. To accomplish this,\nwe collected abstracts from various databases and trained a customized\nSentence-BERT based embedding model leveraging the BERTopic framework. Our\ndataset comprises 96,676 research papers pertaining to mental health, enabling\nus to examine the relationships between different topics using their abstracts.\nTo evaluate the effectiveness of the model, we compared it against two other\nstate-of-the-art methods: Top2Vec model and LDA-BERT model. The model\ndemonstrated superior performance in metrics that measure topic diversity and\ncoherence. To enhance our analysis, we also generated word clouds to provide a\ncomprehensive overview of the machine learning models applied in mental health\nresearch, shedding light on commonly utilized techniques and emerging trends.\nFurthermore, we provide a GitHub link* to the dataset used in this paper,\nensuring its accessibility for further research endeavors.\n","authors":["Xin Gao","Cem Sazara"],"pdf_url":"https://arxiv.org/pdf/2308.13569v1.pdf","comment":"Workshop on Interpretable ML in Healthcare at International\n Conference on Machine Learning (ICML)"},{"id":"http://arxiv.org/abs/2308.13566v1","updated":"2023-08-25T01:41:04Z","published":"2023-08-25T01:41:04Z","title":"MLLM-DataEngine: An Iterative Refinement Approach for MLLM","summary":" Despite the great advance of Multimodal Large Language Models (MLLMs) in both\ninstruction dataset building and benchmarking, the independence of training and\nevaluation makes current MLLMs hard to further improve their capability under\nthe guidance of evaluation results with a relatively low human cost. In this\npaper, we propose MLLM-DataEngine, a novel closed-loop system that bridges data\ngeneration, model training, and evaluation. Within each loop iteration, the\nMLLM-DataEngine first analyze the weakness of the model based on the evaluation\nresults, then generate a proper incremental dataset for the next training\niteration and enhance the model capability iteratively. Compared with previous\ndata collection methods which are separate from the benchmarking, the data\ngenerated by MLLM-DataEngine shows better targeting, quality, and correctness.\nFor targeting, we propose an Adaptive Bad-case Sampling module, which adjusts\nthe ratio of different types of data within each incremental dataset based on\nthe benchmarking results. For quality, we resort to GPT-4 to generate\nhigh-quality data with each given data type. For correctness, prompt design is\ncritical for the data generation results. Rather than previous hand-crafted\nprompt, we propose an Interactive Prompt Optimization strategy, which optimizes\nthe prompt with the multi-round interaction between human and GPT, and improve\nthe correctness of generated data greatly. Through extensive experiments, we\nfind our MLLM-DataEngine could boost the MLLM capability in a targeted and\nautomatic manner, with only a few human participation. The MLLM-DataEngine will\nbe released and we hope it could be a general solution for the following MLLMs\nbuilding.\n","authors":["Zhiyuan Zhao","Linke Ouyang","Bin Wang","Siyuan Huang","Pan Zhang","Xiaoyi Dong","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2308.13566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13565v1","updated":"2023-08-25T01:40:48Z","published":"2023-08-25T01:40:48Z","title":"DARWIN Series: Domain Specific Large Language Models for Natural Science","summary":" Emerging tools bring forth fresh approaches to work, and the field of natural\nscience is no different. In natural science, traditional manual, serial, and\nlabour-intensive work is being augmented by automated, parallel, and iterative\nprocesses driven by artificial intelligence-based experimental automation and\nmore. To add new capabilities in natural science, enabling the acceleration and\nenrichment of automation of the discovery process, we present DARWIN, a series\nof tailored LLMs for natural science, mainly in physics, chemistry, and\nmaterial science. This series relies on open-source LLM, incorporating\nstructured and unstructured scientific knowledge from public datasets and\nliterature. We fine-tuned the models using over 60,000 instruction data points,\nemphasizing factual correctness. During the fine-tuning, we introduce the\nScientific Instruction Generation (SIG) model, automating instruction\ngeneration from scientific texts. This eliminates the need for manual\nextraction or domain-specific knowledge graphs and efficiently injects\nscientific knowledge into the model. We also explore multi-task training\nstrategies, revealing interconnections between scientific tasks. DARWIN series\nnot only achieves state-of-the-art results on various scientific tasks but also\ndiminishes reliance on closed-source AI models. Our research showcases the\nability of LLM in the scientific domain, with the overarching goal of fostering\nprosperity within the broader AI for science community.\n","authors":["Tong Xie","Yuwei Wan","Wei Huang","Zhenyu Yin","Yixuan Liu","Shaozhou Wang","Qingyuan Linghu","Chunyu Kit","Clara Grazian","Wenjie Zhang","Imran Razzak","Bram Hoex"],"pdf_url":"https://arxiv.org/pdf/2308.13565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13563v1","updated":"2023-08-25T00:09:16Z","published":"2023-08-25T00:09:16Z","title":"Large Language Models in Analyzing Crash Narratives -- A Comparative\n Study of ChatGPT, BARD and GPT-4","summary":" In traffic safety research, extracting information from crash narratives\nusing text analysis is a common practice. With recent advancements of large\nlanguage models (LLM), it would be useful to know how the popular LLM\ninterfaces perform in classifying or extracting information from crash\nnarratives. To explore this, our study has used the three most popular publicly\navailable LLM interfaces- ChatGPT, BARD and GPT4. This study investigated their\nusefulness and boundaries in extracting information and answering queries\nrelated to accidents from 100 crash narratives from Iowa and Kansas. During the\ninvestigation, their capabilities and limitations were assessed and their\nresponses to the queries were compared. Five questions were asked related to\nthe narratives: 1) Who is at-fault? 2) What is the manner of collision? 3) Has\nthe crash occurred in a work-zone? 4) Did the crash involve pedestrians? and 5)\nWhat are the sequence of harmful events in the crash? For questions 1 through\n4, the overall similarity among the LLMs were 70%, 35%, 96% and 89%,\nrespectively. The similarities were higher while answering direct questions\nrequiring binary responses and significantly lower for complex questions. To\ncompare the responses to question 5, network diagram and centrality measures\nwere analyzed. The network diagram from the three LLMs were not always similar\nalthough they sometimes have the same influencing events with high in-degree,\nout-degree and betweenness centrality. This study suggests using multiple\nmodels to extract viable information from narratives. Also, caution must be\npracticed while using these interfaces to obtain crucial safety related\ninformation.\n","authors":["Maroa Mumtarin","Md Samiullah Chowdhury","Jonathan Wood"],"pdf_url":"https://arxiv.org/pdf/2308.13563v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2305.07618v2","updated":"2023-08-25T17:34:49Z","published":"2023-05-12T17:17:01Z","title":"Uncertainty Estimation using the Local Lipschitz for Deep Learning Image\n Reconstruction Models","summary":" The use of supervised deep neural network approaches has been investigated to\nsolve inverse problems in all domains, especially radiology where imaging\ntechnologies are at the heart of diagnostics. However, in deployment, these\nmodels are exposed to input distributions that are widely shifted from training\ndata, due in part to data biases or drifts. It becomes crucial to know whether\na given input lies outside the training data distribution before relying on the\nreconstruction for diagnosis. The goal of this work is three-fold: (i)\ndemonstrate use of the local Lipshitz value as an uncertainty estimation\nthreshold for determining suitable performance, (ii) provide method for\nidentifying out-of-distribution (OOD) images where the model may not have\ngeneralized, and (iii) use the local Lipschitz values to guide proper data\naugmentation through identifying false positives and decrease epistemic\nuncertainty. We provide results for both MRI reconstruction and CT sparse view\nto full view reconstruction using AUTOMAP and UNET architectures due to it\nbeing pertinent in the medical domain that reconstructed images remain\ndiagnostically accurate.\n","authors":["Danyal F. Bhutto","Bo Zhu","Jeremiah Z. Liu","Neha Koonjoo","Bruce R. Rosen","Matthew S. Rosen"],"pdf_url":"https://arxiv.org/pdf/2305.07618v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13505v1","updated":"2023-08-25T17:30:08Z","published":"2023-08-25T17:30:08Z","title":"Joint Modeling of Feature, Correspondence, and a Compressed Memory for\n Video Object Segmentation","summary":" Current prevailing Video Object Segmentation (VOS) methods usually perform\ndense matching between the current and reference frames after extracting their\nfeatures. One on hand, the decoupled modeling restricts the targets information\npropagation only at high-level feature space. On the other hand, the pixel-wise\nmatching leads to a lack of holistic understanding of the targets. To overcome\nthese issues, we propose a unified VOS framework, coined as JointFormer, for\njoint modeling the three elements of feature, correspondence, and a compressed\nmemory. The core design is the Joint Block, utilizing the flexibility of\nattention to simultaneously extract feature and propagate the targets\ninformation to the current tokens and the compressed memory token. This scheme\nallows to perform extensive information propagation and discriminative feature\nlearning. To incorporate the long-term temporal targets information, we also\ndevise a customized online updating mechanism for the compressed memory token,\nwhich can prompt the information flow along the temporal dimension and thus\nimprove the global modeling capability. Under the design, our method achieves a\nnew state-of-art performance on DAVIS 2017 val/test-dev (89.7% and 87.6%) and\nYouTube-VOS 2018/2019 val (87.0% and 87.0%) benchmarks, outperforming existing\nworks by a large margin.\n","authors":["Jiaming Zhang","Yutao Cui","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13505v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.13504v1","updated":"2023-08-25T17:28:58Z","published":"2023-08-25T17:28:58Z","title":"A2Q: Accumulator-Aware Quantization with Guaranteed Overflow Avoidance","summary":" We present accumulator-aware quantization (A2Q), a novel weight quantization\nmethod designed to train quantized neural networks (QNNs) to avoid overflow\nwhen using low-precision accumulators during inference. A2Q introduces a unique\nformulation inspired by weight normalization that constrains the L1-norm of\nmodel weights according to accumulator bit width bounds that we derive. Thus,\nin training QNNs for low-precision accumulation, A2Q also inherently promotes\nunstructured weight sparsity to guarantee overflow avoidance. We apply our\nmethod to deep learning-based computer vision tasks to show that A2Q can train\nQNNs for low-precision accumulators while maintaining model accuracy\ncompetitive with a floating-point baseline. In our evaluations, we consider the\nimpact of A2Q on both general-purpose platforms and programmable hardware.\nHowever, we primarily target model deployment on FPGAs because they can be\nprogrammed to fully exploit custom accumulator bit widths. Our experimentation\nshows accumulator bit width significantly impacts the resource efficiency of\nFPGA-based accelerators. On average across our benchmarks, A2Q offers up to a\n2.3x reduction in resource utilization over 32-bit accumulator counterparts\nwith 99.2% of the floating-point model accuracy.\n","authors":["Ian Colbert","Alessandro Pappalardo","Jakoba Petri-Koenig"],"pdf_url":"https://arxiv.org/pdf/2308.13504v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2301.13376"},{"id":"http://arxiv.org/abs/2308.13503v1","updated":"2023-08-25T17:28:23Z","published":"2023-08-25T17:28:23Z","title":"Attending Generalizability in Course of Deep Fake Detection by Exploring\n Multi-task Learning","summary":" This work explores various ways of exploring multi-task learning (MTL)\ntechniques aimed at classifying videos as original or manipulated in\ncross-manipulation scenario to attend generalizability in deep fake scenario.\nThe dataset used in our evaluation is FaceForensics++, which features 1000\noriginal videos manipulated by four different techniques, with a total of 5000\nvideos. We conduct extensive experiments on multi-task learning and contrastive\ntechniques, which are well studied in literature for their generalization\nbenefits. It can be concluded that the proposed detection model is quite\ngeneralized, i.e., accurately detects manipulation methods not encountered\nduring training as compared to the state-of-the-art.\n","authors":["Pranav Balaji","Abhijit Das","Srijan Das","Antitza Dantcheva"],"pdf_url":"https://arxiv.org/pdf/2308.13503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17645v2","updated":"2023-08-25T17:13:55Z","published":"2023-06-30T13:33:27Z","title":"Federated Object Detection for Quality Inspection in Shared Production","summary":" Federated learning (FL) has emerged as a promising approach for training\nmachine learning models on decentralized data without compromising data\nprivacy. In this paper, we propose a FL algorithm for object detection in\nquality inspection tasks using YOLOv5 as the object detection algorithm and\nFederated Averaging (FedAvg) as the FL algorithm. We apply this approach to a\nmanufacturing use-case where multiple factories/clients contribute data for\ntraining a global object detection model while preserving data privacy on a\nnon-IID dataset. Our experiments demonstrate that our FL approach achieves\nbetter generalization performance on the overall clients' test dataset and\ngenerates improved bounding boxes around the objects compared to models trained\nusing local clients' datasets. This work showcases the potential of FL for\nquality inspection tasks in the manufacturing industry and provides valuable\ninsights into the performance and feasibility of utilizing YOLOv5 and FedAvg\nfor federated object detection.\n","authors":["Vinit Hegiste","Tatjana Legler","Martin Ruskowski"],"pdf_url":"https://arxiv.org/pdf/2306.17645v2.pdf","comment":"Will submit it to an IEEE conference"},{"id":"http://arxiv.org/abs/2308.13495v1","updated":"2023-08-25T17:10:22Z","published":"2023-08-25T17:10:22Z","title":"Open Gaze: An Open-Source Implementation Replicating Google's Eye\n Tracking Paper","summary":" Eye tracking has been a pivotal tool in diverse fields such as vision\nresearch, language analysis, and usability assessment. The majority of prior\ninvestigations, however, have concentrated on expansive desktop displays\nemploying specialized, costly eye tracking hardware that lacks scalability.\nRemarkably little insight exists into ocular movement patterns on smartphones,\ndespite their widespread adoption and significant usage. In this manuscript, we\npresent an open-source implementation of a smartphone-based gaze tracker that\nemulates the methodology proposed by a GooglePaper (whose source code remains\nproprietary). Our focus is on attaining accuracy comparable to that attained\nthrough the GooglePaper's methodology, without the necessity for supplementary\nhardware. Through the integration of machine learning techniques, we unveil an\naccurate eye tracking solution that is native to smartphones. Our approach\ndemonstrates precision akin to the state-of-the-art mobile eye trackers, which\nare characterized by a cost that is two orders of magnitude higher. Leveraging\nthe vast MIT GazeCapture dataset, which is available through registration on\nthe dataset's website, we successfully replicate crucial findings from previous\nstudies concerning ocular motion behavior in oculomotor tasks and saliency\nanalyses during natural image observation. Furthermore, we emphasize the\napplicability of smartphone-based gaze tracking in discerning reading\ncomprehension challenges. Our findings exhibit the inherent potential to\namplify eye movement research by significant proportions, accommodating\nparticipation from thousands of subjects with explicit consent. This\nscalability not only fosters advancements in vision research, but also extends\nits benefits to domains such as accessibility enhancement and healthcare\napplications.\n","authors":["Sushmanth reddy Mereddy","Jyothi Swaroop Reddy","Somnath Sharma"],"pdf_url":"https://arxiv.org/pdf/2308.13495v1.pdf","comment":"17 pages , 15 figures"},{"id":"http://arxiv.org/abs/2308.13494v1","updated":"2023-08-25T17:10:12Z","published":"2023-08-25T17:10:12Z","title":"Eventful Transformers: Leveraging Temporal Redundancy in Vision\n Transformers","summary":" Vision Transformers achieve impressive accuracy across a range of visual\nrecognition tasks. Unfortunately, their accuracy frequently comes with high\ncomputational costs. This is a particular issue in video recognition, where\nmodels are often applied repeatedly across frames or temporal chunks. In this\nwork, we exploit temporal redundancy between subsequent inputs to reduce the\ncost of Transformers for video processing. We describe a method for identifying\nand re-processing only those tokens that have changed significantly over time.\nOur proposed family of models, Eventful Transformers, can be converted from\nexisting Transformers (often without any re-training) and give adaptive control\nover the compute cost at runtime. We evaluate our method on large-scale\ndatasets for video object detection (ImageNet VID) and action recognition\n(EPIC-Kitchens 100). Our approach leads to significant computational savings\n(on the order of 2-4x) with only minor reductions in accuracy.\n","authors":["Matthew Dutson","Yin Li","Mohit Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.13494v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2306.17829v2","updated":"2023-08-25T17:08:34Z","published":"2023-06-30T17:50:00Z","title":"Federated Ensemble YOLOv5 -- A Better Generalized Object Detection\n Algorithm","summary":" Federated learning (FL) has gained significant traction as a\nprivacy-preserving algorithm, but the underlying resemblances of federated\nlearning algorithms like Federated averaging (FedAvg) or Federated SGD (Fed\nSGD) to ensemble learning algorithms have not been fully explored. The purpose\nof this paper is to examine the application of FL to object detection as a\nmethod to enhance generalizability, and to compare its performance against a\ncentralized training approach for an object detection algorithm. Specifically,\nwe investigate the performance of a YOLOv5 model trained using FL across\nmultiple clients and employ a random sampling strategy without replacement, so\neach client holds a portion of the same dataset used for centralized training.\nOur experimental results showcase the superior efficiency of the FL object\ndetector's global model in generating accurate bounding boxes for unseen\nobjects, with the test set being a mixture of objects from two distinct clients\nnot represented in the training dataset. These findings suggest that FL can be\nviewed from an ensemble algorithm perspective, akin to a synergistic blend of\nBagging and Boosting techniques. As a result, FL can be seen not only as a\nmethod to enhance privacy, but also as a method to enhance the performance of a\nmachine learning model.\n","authors":["Vinit Hegiste","Tatjana Legler","Martin Ruskowski"],"pdf_url":"https://arxiv.org/pdf/2306.17829v2.pdf","comment":"8 pages and submitted to FLTA2023 symposium under IEEE"},{"id":"http://arxiv.org/abs/2308.13492v1","updated":"2023-08-25T17:06:30Z","published":"2023-08-25T17:06:30Z","title":"Ultrafast-and-Ultralight ConvNet-Based Intelligent Monitoring System for\n Diagnosing Early-Stage Mpox Anytime and Anywhere","summary":" Due to the lack of more efficient diagnostic tools for monkeypox, its spread\nremains unchecked, presenting a formidable challenge to global health. While\nthe high efficacy of deep learning models for monkeypox diagnosis has been\ndemonstrated in related studies, the overlook of inference speed, the parameter\nsize and diagnosis performance for early-stage monkeypox renders the models\ninapplicable in real-world settings. To address these challenges, we proposed\nan ultrafast and ultralight network named Fast-MpoxNet. Fast-MpoxNet possesses\nonly 0.27M parameters and can process input images at 68 frames per second\n(FPS) on the CPU. To counteract the diagnostic performance limitation brought\nabout by the small model capacity, it integrates the attention-based feature\nfusion module and the multiple auxiliary losses enhancement strategy for better\ndetecting subtle image changes and optimizing weights. Using transfer learning\nand five-fold cross-validation, Fast-MpoxNet achieves 94.26% Accuracy on the\nMpox dataset. Notably, its recall for early-stage monkeypox achieves 93.65%. By\nadopting data augmentation, our model's Accuracy rises to 98.40% and attains a\nPracticality Score (A new metric for measuring model practicality in real-time\ndiagnosis application) of 0.80. We also developed an application system named\nMpox-AISM V2 for both personal computers and mobile phones. Mpox-AISM V2\nfeatures ultrafast responses, offline functionality, and easy deployment,\nenabling accurate and real-time diagnosis for both the public and individuals\nin various real-world settings, especially in populous settings during the\noutbreak. Our work could potentially mitigate future monkeypox outbreak and\nilluminate a fresh paradigm for developing real-time diagnostic tools in the\nhealthcare field.\n","authors":["Yubiao Yue","Xiaoqiang Shi","Li Qin","Xinyue Zhang","Yanmei Chen","Jialong Xu","Zipei Zheng","Yujun Cao","Di Liu","Zhenzhang Li","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2308.13492v1.pdf","comment":"This paper has been submitted to Neurocomputing"},{"id":"http://arxiv.org/abs/2308.13488v1","updated":"2023-08-25T16:55:30Z","published":"2023-08-25T16:55:30Z","title":"Temporal Uncertainty Localization to Enable Human-in-the-loop Analysis\n of Dynamic Contrast-enhanced Cardiac MRI Datasets","summary":" Dynamic contrast-enhanced (DCE) cardiac magnetic resonance imaging (CMRI) is\na widely used modality for diagnosing myocardial blood flow (perfusion)\nabnormalities. During a typical free-breathing DCE-CMRI scan, close to 300\ntime-resolved images of myocardial perfusion are acquired at various contrast\n\"wash in/out\" phases. Manual segmentation of myocardial contours in each\ntime-frame of a DCE image series can be tedious and time-consuming,\nparticularly when non-rigid motion correction has failed or is unavailable.\nWhile deep neural networks (DNNs) have shown promise for analyzing DCE-CMRI\ndatasets, a \"dynamic quality control\" (dQC) technique for reliably detecting\nfailed segmentations is lacking. Here we propose a new space-time uncertainty\nmetric as a dQC tool for DNN-based segmentation of free-breathing DCE-CMRI\ndatasets by validating the proposed metric on an external dataset and\nestablishing a human-in-the-loop framework to improve the segmentation results.\nIn the proposed approach, we referred the top 10% most uncertain segmentations\nas detected by our dQC tool to the human expert for refinement. This approach\nresulted in a significant increase in the Dice score (p<0.001) and a notable\ndecrease in the number of images with failed segmentation (16.2% to 11.3%)\nwhereas the alternative approach of randomly selecting the same number of\nsegmentations for human referral did not achieve any significant improvement.\nOur results suggest that the proposed dQC framework has the potential to\naccurately identify poor-quality segmentations and may enable efficient\nDNN-based analysis of DCE-CMRI in a human-in-the-loop pipeline for clinical\ninterpretation and reporting of dynamic CMRI datasets.\n","authors":["Dilek M. Yalcinkaya","Khalid Youssef","Bobak Heydari","Orlando Simonetti","Rohan Dharmakumar","Subha Raman","Behzad Sharif"],"pdf_url":"https://arxiv.org/pdf/2308.13488v1.pdf","comment":"Accepted for publication in MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.13473v1","updated":"2023-08-25T16:20:41Z","published":"2023-08-25T16:20:41Z","title":"Unlocking the Performance of Proximity Sensors by Utilizing Transient\n Histograms","summary":" We provide methods which recover planar scene geometry by utilizing the\ntransient histograms captured by a class of close-range time-of-flight (ToF)\ndistance sensor. A transient histogram is a one dimensional temporal waveform\nwhich encodes the arrival time of photons incident on the ToF sensor.\nTypically, a sensor processes the transient histogram using a proprietary\nalgorithm to produce distance estimates, which are commonly used in several\nrobotics applications. Our methods utilize the transient histogram directly to\nenable recovery of planar geometry more accurately than is possible using only\nproprietary distance estimates, and consistent recovery of the albedo of the\nplanar surface, which is not possible with proprietary distance estimates\nalone. This is accomplished via a differentiable rendering pipeline, which\nsimulates the transient imaging process, allowing direct optimization of scene\ngeometry to match observations. To validate our methods, we capture 3,800\nmeasurements of eight planar surfaces from a wide range of viewpoints, and show\nthat our method outperforms the proprietary-distance-estimate baseline by an\norder of magnitude in most scenarios. We demonstrate a simple robotics\napplication which uses our method to sense the distance to and slope of a\nplanar surface from a sensor mounted on the end effector of a robot arm.\n","authors":["Carter Sifferman","Yeping Wang","Mohit Gupta","Michael Gleicher"],"pdf_url":"https://arxiv.org/pdf/2308.13473v1.pdf","comment":"Accepted for publication at IEEE Robotics and Automation Letters\n (RA-L)"},{"id":"http://arxiv.org/abs/2303.08084v2","updated":"2023-08-25T16:18:51Z","published":"2023-03-14T17:14:21Z","title":"Editing Implicit Assumptions in Text-to-Image Diffusion Models","summary":" Text-to-image diffusion models often make implicit assumptions about the\nworld when generating images. While some assumptions are useful (e.g., the sky\nis blue), they can also be outdated, incorrect, or reflective of social biases\npresent in the training data. Thus, there is a need to control these\nassumptions without requiring explicit user input or costly re-training. In\nthis work, we aim to edit a given implicit assumption in a pre-trained\ndiffusion model. Our Text-to-Image Model Editing method, TIME for short,\nreceives a pair of inputs: a \"source\" under-specified prompt for which the\nmodel makes an implicit assumption (e.g., \"a pack of roses\"), and a\n\"destination\" prompt that describes the same setting, but with a specified\ndesired attribute (e.g., \"a pack of blue roses\"). TIME then updates the model's\ncross-attention layers, as these layers assign visual meaning to textual\ntokens. We edit the projection matrices in these layers such that the source\nprompt is projected close to the destination prompt. Our method is highly\nefficient, as it modifies a mere 2.2% of the model's parameters in under one\nsecond. To evaluate model editing approaches, we introduce TIMED (TIME\nDataset), containing 147 source and destination prompt pairs from various\ndomains. Our experiments (using Stable Diffusion) show that TIME is successful\nin model editing, generalizes well for related prompts unseen during editing,\nand imposes minimal effect on unrelated generations.\n","authors":["Hadas Orgad","Bahjat Kawar","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2303.08084v2.pdf","comment":"Project page: https://time-diffusion.github.io/"},{"id":"http://arxiv.org/abs/2308.13471v1","updated":"2023-08-25T16:15:38Z","published":"2023-08-25T16:15:38Z","title":"A Fast Minimization Algorithm for the Euler Elastica Model Based on a\n Bilinear Decomposition","summary":" The Euler Elastica (EE) model with surface curvature can generate\nartifact-free results compared with the traditional total variation\nregularization model in image processing. However, strong nonlinearity and\nsingularity due to the curvature term in the EE model pose a great challenge\nfor one to design fast and stable algorithms for the EE model. In this paper,\nwe propose a new, fast, hybrid alternating minimization (HALM) algorithm for\nthe EE model based on a bilinear decomposition of the gradient of the\nunderlying image and prove the global convergence of the minimizing sequence\ngenerated by the algorithm under mild conditions. The HALM algorithm comprises\nthree sub-minimization problems and each is either solved in the closed form or\napproximated by fast solvers making the new algorithm highly accurate and\nefficient. We also discuss the extension of the HALM strategy to deal with\ngeneral curvature-based variational models, especially with a Lipschitz smooth\nfunctional of the curvature. A host of numerical experiments are conducted to\nshow that the new algorithm produces good results with much-improved efficiency\ncompared to other state-of-the-art algorithms for the EE model. As one of the\nbenchmarks, we show that the average running time of the HALM algorithm is at\nmost one-quarter of that of the fast operator-splitting-based\nDeng-Glowinski-Tai algorithm.\n","authors":["Zhifang Liu","Baochen Sun","Xue-Cheng Tai","Qi Wang","Huibin Chang"],"pdf_url":"https://arxiv.org/pdf/2308.13471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13469v1","updated":"2023-08-25T16:13:22Z","published":"2023-08-25T16:13:22Z","title":"RestNet: Boosting Cross-Domain Few-Shot Segmentation with Residual\n Transformation Network","summary":" Cross-domain few-shot segmentation (CD-FSS) aims to achieve semantic\nsegmentation in previously unseen domains with a limited number of annotated\nsamples. Although existing CD-FSS models focus on cross-domain feature\ntransformation, relying exclusively on inter-domain knowledge transfer may lead\nto the loss of critical intra-domain information. To this end, we propose a\nnovel residual transformation network (RestNet) that facilitates knowledge\ntransfer while retaining the intra-domain support-query feature information.\nSpecifically, we propose a Semantic Enhanced Anchor Transform (SEAT) module\nthat maps features to a stable domain-agnostic space using advanced semantics.\nAdditionally, an Intra-domain Residual Enhancement (IRE) module is designed to\nmaintain the intra-domain representation of the original discriminant space in\nthe new space. We also propose a mask prediction strategy based on prototype\nfusion to help the model gradually learn how to segment. Our RestNet can\ntransfer cross-domain knowledge from both inter-domain and intra-domain without\nrequiring additional fine-tuning. Extensive experiments on ISIC, Chest X-ray,\nand FSS-1000 show that our RestNet achieves state-of-the-art performance. Our\ncode will be available soon.\n","authors":["Xinyang Huang","Chuang Zhu","Wenkai Chen"],"pdf_url":"https://arxiv.org/pdf/2308.13469v1.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2306.05952v2","updated":"2023-08-25T15:59:15Z","published":"2023-06-09T15:09:16Z","title":"Overcoming Adversarial Attacks for Human-in-the-Loop Applications","summary":" Including human analysis has the potential to positively affect the\nrobustness of Deep Neural Networks and is relatively unexplored in the\nAdversarial Machine Learning literature. Neural network visual explanation maps\nhave been shown to be prone to adversarial attacks. Further research is needed\nin order to select robust visualizations of explanations for the image analyst\nto evaluate a given model. These factors greatly impact Human-In-The-Loop\n(HITL) evaluation tools due to their reliance on adversarial images, including\nexplanation maps and measurements of robustness. We believe models of human\nvisual attention may improve interpretability and robustness of human-machine\nimagery analysis systems. Our challenge remains, how can HITL evaluation be\nrobust in this adversarial landscape?\n","authors":["Ryan McCoppin","Marla Kennedy","Platon Lukyanenko","Sean Kennedy"],"pdf_url":"https://arxiv.org/pdf/2306.05952v2.pdf","comment":"New Frontiers in Adversarial Machine Learning, ICML 2022"},{"id":"http://arxiv.org/abs/2303.11910v3","updated":"2023-08-25T15:59:04Z","published":"2023-03-21T15:01:02Z","title":"360BEV: Panoramic Semantic Mapping for Indoor Bird's-Eye View","summary":" Seeing only a tiny part of the whole is not knowing the full circumstance.\nBird's-eye-view (BEV) perception, a process of obtaining allocentric maps from\negocentric views, is restricted when using a narrow Field of View (FoV) alone.\nIn this work, mapping from 360{\\deg} panoramas to BEV semantics, the 360BEV\ntask, is established for the first time to achieve holistic representations of\nindoor scenes in a top-down view. Instead of relying on narrow-FoV image\nsequences, a panoramic image with depth information is sufficient to generate a\nholistic BEV semantic map. To benchmark 360BEV, we present two indoor datasets,\n360BEV-Matterport and 360BEV-Stanford, both of which include egocentric\npanoramic images and semantic segmentation labels, as well as allocentric\nsemantic maps. Besides delving deep into different mapping paradigms, we\npropose a dedicated solution for panoramic semantic mapping, namely 360Mapper.\nThrough extensive experiments, our methods achieve 44.32% and 45.78% in mIoU on\nboth datasets respectively, surpassing previous counterparts with gains of\n+7.60% and +9.70% in mIoU. Code and datasets are available at the project page:\nhttps://jamycheung.github.io/360BEV.html.\n","authors":["Zhifeng Teng","Jiaming Zhang","Kailun Yang","Kunyu Peng","Hao Shi","Simon Reiß","Ke Cao","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2303.11910v3.pdf","comment":"Code and datasets are available at the project page:\n https://jamycheung.github.io/360BEV.html. Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2308.13442v1","updated":"2023-08-25T15:42:19Z","published":"2023-08-25T15:42:19Z","title":"Unlocking Fine-Grained Details with Wavelet-based High-Frequency\n Enhancement in Transformers","summary":" Medical image segmentation is a critical task that plays a vital role in\ndiagnosis, treatment planning, and disease monitoring. Accurate segmentation of\nanatomical structures and abnormalities from medical images can aid in the\nearly detection and treatment of various diseases. In this paper, we address\nthe local feature deficiency of the Transformer model by carefully re-designing\nthe self-attention map to produce accurate dense prediction in medical images.\nTo this end, we first apply the wavelet transformation to decompose the input\nfeature map into low-frequency (LF) and high-frequency (HF) subbands. The LF\nsegment is associated with coarse-grained features while the HF components\npreserve fine-grained features such as texture and edge information. Next, we\nreformulate the self-attention operation using the efficient Transformer to\nperform both spatial and context attention on top of the frequency\nrepresentation. Furthermore, to intensify the importance of the boundary\ninformation, we impose an additional attention map by creating a Gaussian\npyramid on top of the HF components. Moreover, we propose a multi-scale context\nenhancement block within skip connections to adaptively model inter-scale\ndependencies to overcome the semantic gap among stages of the encoder and\ndecoder modules. Throughout comprehensive experiments, we demonstrate the\neffectiveness of our strategy on multi-organ and skin lesion segmentation\nbenchmarks. The implementation code will be available upon acceptance.\n\\href{https://github.com/mindflow-institue/WaveFormer}{GitHub}.\n","authors":["Reza Azad","Amirhossein Kazerouni","Alaa Sulaiman","Afshin Bozorgpour","Ehsan Khodapanah Aghdam","Abin Jose","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2308.13442v1.pdf","comment":"Accepted in MICCAI 2023 workshop MLMI"},{"id":"http://arxiv.org/abs/2308.13441v1","updated":"2023-08-25T15:41:05Z","published":"2023-08-25T15:41:05Z","title":"Mesh-Wise Prediction of Demographic Composition from Satellite Images\n Using Multi-Head Convolutional Neural Network","summary":" Population aging is one of the most serious problems in certain countries. In\norder to implement its countermeasures, understanding its rapid progress is of\nurgency with a granular resolution. However, a detailed and rigorous survey\nwith high frequency is not feasible due to the constraints of financial and\nhuman resources. Nowadays, Deep Learning is prevalent for pattern recognition\nwith significant accuracy, with its application to remote sensing. This paper\nproposes a multi-head Convolutional Neural Network model with transfer learning\nfrom pre-trained ResNet50 for estimating mesh-wise demographics of Japan as one\nof the most aged countries in the world, with satellite images from\nLandsat-8/OLI and Suomi NPP/VIIRS-DNS as inputs and census demographics as\nlabels. The trained model was performed on a testing dataset with a test score\nof at least 0.8914 in $\\text{R}^2$ for all the demographic composition groups,\nand the estimated demographic composition was generated and visualised for 2022\nas a non-census year.\n","authors":["Yuta Sato"],"pdf_url":"https://arxiv.org/pdf/2308.13441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13437v1","updated":"2023-08-25T15:33:47Z","published":"2023-08-25T15:33:47Z","title":"Position-Enhanced Visual Instruction Tuning for Multimodal Large\n Language Models","summary":" Recently, Multimodal Large Language Models (MLLMs) that enable Large Language\nModels (LLMs) to interpret images through visual instruction tuning have\nachieved significant success. However, existing visual instruction tuning\nmethods only utilize image-language instruction data to align the language and\nimage modalities, lacking a more fine-grained cross-modal alignment. In this\npaper, we propose Position-enhanced Visual Instruction Tuning (PVIT), which\nextends the functionality of MLLMs by integrating an additional region-level\nvision encoder. This integration promotes a more detailed comprehension of\nimages for the MLLM. In addition, to efficiently achieve a fine-grained\nalignment between the vision modules and the LLM, we design multiple data\ngeneration strategies to construct an image-region-language instruction\ndataset. Finally, we present both quantitative experiments and qualitative\nanalysis that demonstrate the superiority of the proposed model. Code and data\nwill be released at https://github.com/THUNLP-MT/PVIT.\n","authors":["Chi Chen","Ruoyu Qin","Fuwen Luo","Xiaoyue Mi","Peng Li","Maosong Sun","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07707v2","updated":"2023-08-25T15:17:41Z","published":"2023-04-16T06:33:43Z","title":"Non-exemplar Class-incremental Learning by Random Auxiliary Classes\n Augmentation and Mixed Features","summary":" Non-exemplar class-incremental learning refers to classifying new and old\nclasses without storing samples of old classes. Since only new class samples\nare available for optimization, it often occurs catastrophic forgetting of old\nknowledge. To alleviate this problem, many new methods are proposed such as\nmodel distillation, class augmentation. In this paper, we propose an effective\nnon-exemplar method called RAMF consisting of Random Auxiliary classes\naugmentation and Mixed Feature. On the one hand, we design a novel random\nauxiliary classes augmentation method, where one augmentation is randomly\nselected from three augmentations and applied on the input to generate\naugmented samples and extra class labels. By extending data and label space, it\nallows the model to learn more diverse representations, which can prevent the\nmodel from being biased towards learning task-specific features. When learning\nnew tasks, it will reduce the change of feature space and improve model\ngeneralization. On the other hand, we employ mixed feature to replace the new\nfeatures since only using new feature to optimize the model will affect the\nrepresentation that was previously embedded in the feature space. Instead, by\nmixing new and old features, old knowledge can be retained without increasing\nthe computational complexity. Extensive experiments on three benchmarks\ndemonstrate the superiority of our approach, which outperforms the\nstate-of-the-art non-exemplar methods and is comparable to high-performance\nreplay-based methods.\n","authors":["Ke Song","Quan Xia","Guoqiang Liang","Zhaojie Chen","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.07707v2.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.13421v1","updated":"2023-08-25T15:06:14Z","published":"2023-08-25T15:06:14Z","title":"Exploiting Diverse Feature for Multimodal Sentiment Analysis","summary":" In this paper, we present our solution to the MuSe-Personalisation\nsub-challenge in the MuSe 2023 Multimodal Sentiment Analysis Challenge. The\ntask of MuSe-Personalisation aims to predict the continuous arousal and valence\nvalues of a participant based on their audio-visual, language, and\nphysiological signal modalities data. Considering different people have\npersonal characteristics, the main challenge of this task is how to build\nrobustness feature presentation for sentiment prediction. To address this\nissue, we propose exploiting diverse features. Specifically, we proposed a\nseries of feature extraction methods to build a robust representation and model\nensemble. We empirically evaluate the performance of the utilized method on the\nofficially provided dataset. \\textbf{As a result, we achieved 3rd place in the\nMuSe-Personalisation sub-challenge.} Specifically, we achieve the results of\n0.8492 and 0.8439 for MuSe-Personalisation in terms of arousal and valence CCC.\n","authors":["Jia Li","Wei Qian","Kun Li","Qi Li","Dan Guo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13418v1","updated":"2023-08-25T15:03:36Z","published":"2023-08-25T15:03:36Z","title":"Nougat: Neural Optical Understanding for Academic Documents","summary":" Scientific knowledge is predominantly stored in books and scientific\njournals, often in the form of PDFs. However, the PDF format leads to a loss of\nsemantic information, particularly for mathematical expressions. We propose\nNougat (Neural Optical Understanding for Academic Documents), a Visual\nTransformer model that performs an Optical Character Recognition (OCR) task for\nprocessing scientific documents into a markup language, and demonstrate the\neffectiveness of our model on a new dataset of scientific documents. The\nproposed approach offers a promising solution to enhance the accessibility of\nscientific knowledge in the digital age, by bridging the gap between\nhuman-readable documents and machine-readable text. We release the models and\ncode to accelerate future work on scientific text recognition.\n","authors":["Lukas Blecher","Guillem Cucurull","Thomas Scialom","Robert Stojnic"],"pdf_url":"https://arxiv.org/pdf/2308.13418v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2109.01291v2","updated":"2023-08-25T15:02:38Z","published":"2021-09-03T03:23:27Z","title":"LATFormer: Locality-Aware Point-View Fusion Transformer for 3D Shape\n Recognition","summary":" Recently, 3D shape understanding has achieved significant progress due to the\nadvances of deep learning models on various data formats like images, voxels,\nand point clouds. Among them, point clouds and multi-view images are two\ncomplementary modalities of 3D objects and learning representations by fusing\nboth of them has been proven to be fairly effective. While prior works\ntypically focus on exploiting global features of the two modalities, herein we\nargue that more discriminative features can be derived by modeling ``where to\nfuse''. To investigate this, we propose a novel Locality-Aware Point-View\nFusion Transformer (LATFormer) for 3D shape retrieval and classification. The\ncore component of LATFormer is a module named Locality-Aware Fusion (LAF) which\nintegrates the local features of correlated regions across the two modalities\nbased on the co-occurrence scores. We further propose to filter out scores with\nlow values to obtain salient local co-occurring regions, which reduces\nredundancy for the fusion process. In our LATFormer, we utilize the LAF module\nto fuse the multi-scale features of the two modalities both bidirectionally and\nhierarchically to obtain more informative features. Comprehensive experiments\non four popular 3D shape benchmarks covering 3D object retrieval and\nclassification validate its effectiveness.\n","authors":["Xinwei He","Silin Cheng","Dingkang Liang","Song Bai","Xi Wang","Yingying Zhu"],"pdf_url":"https://arxiv.org/pdf/2109.01291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.07390v3","updated":"2023-08-25T14:56:40Z","published":"2022-03-14T18:00:03Z","title":"What's the Difference? The potential for Convolutional Neural Networks\n for transient detection without template subtraction","summary":" We present a study of the potential for Convolutional Neural Networks (CNNs)\nto enable separation of astrophysical transients from image artifacts, a task\nknown as \"real-bogus\" classification without requiring a template subtracted\n(or difference) image which requires a computationally expensive process to\ngenerate, involving image matching on small spatial scales in large volumes of\ndata. Using data from the Dark Energy Survey, we explore the use of CNNs to (1)\nautomate the \"real-bogus\" classification, (2) reduce the computational costs of\ntransient discovery. We compare the efficiency of two CNNs with similar\narchitectures, one that uses \"image triplets\" (templates, search, and\ndifference image) and one that takes as input the template and search only. We\nmeasure the decrease in efficiency associated with the loss of information in\ninput finding that the testing accuracy is reduced from 96% to 91.1%. We\nfurther investigate how the latter model learns the required information from\nthe template and search by exploring the saliency maps. Our work (1) confirms\nthat CNNs are excellent models for \"real-bogus\" classification that rely\nexclusively on the imaging data and require no feature engineering task; (2)\ndemonstrates that high-accuracy (> 90%) models can be built without the need to\nconstruct difference images, but some accuracy is lost. Since once trained,\nneural networks can generate predictions at minimal computational costs, we\nargue that future implementations of this methodology could dramatically reduce\nthe computational costs in the detection of transients in synoptic surveys like\nRubin Observatory's Legacy Survey of Space and Time by bypassing the Difference\nImage Analysis entirely.\n","authors":["Tatiana Acero-Cuellar","Federica Bianco","Gregory Dobler","Masao Sako","Helen Qu","The LSST Dark Energy Science Collaboration"],"pdf_url":"https://arxiv.org/pdf/2203.07390v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13415v1","updated":"2023-08-25T14:55:38Z","published":"2023-08-25T14:55:38Z","title":"An investigation into the impact of deep learning model choice on sex\n and race bias in cardiac MR segmentation","summary":" In medical imaging, artificial intelligence (AI) is increasingly being used\nto automate routine tasks. However, these algorithms can exhibit and exacerbate\nbiases which lead to disparate performances between protected groups. We\ninvestigate the impact of model choice on how imbalances in subject sex and\nrace in training datasets affect AI-based cine cardiac magnetic resonance image\nsegmentation. We evaluate three convolutional neural network-based models and\none vision transformer model. We find significant sex bias in three of the four\nmodels and racial bias in all of the models. However, the severity and nature\nof the bias varies between the models, highlighting the importance of model\nchoice when attempting to train fair AI-based segmentation models for medical\nimaging tasks.\n","authors":["Tiarna Lee","Esther Puyol-Antón","Bram Ruijsink","Keana Aitcheson","Miaojing Shi","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2308.13415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13411v1","updated":"2023-08-25T14:38:51Z","published":"2023-08-25T14:38:51Z","title":"Harvard Glaucoma Detection and Progression: A Multimodal Multitask\n Dataset and Generalization-Reinforced Semi-Supervised Learning","summary":" Glaucoma is the number one cause of irreversible blindness globally. A major\nchallenge for accurate glaucoma detection and progression forecasting is the\nbottleneck of limited labeled patients with the state-of-the-art (SOTA) 3D\nretinal imaging data of optical coherence tomography (OCT). To address the data\nscarcity issue, this paper proposes two solutions. First, we develop a novel\ngeneralization-reinforced semi-supervised learning (SSL) model called pseudo\nsupervisor to optimally utilize unlabeled data. Compared with SOTA models, the\nproposed pseudo supervisor optimizes the policy of predicting pseudo labels\nwith unlabeled samples to improve empirical generalization. Our pseudo\nsupervisor model is evaluated with two clinical tasks consisting of glaucoma\ndetection and progression forecasting. The progression forecasting task is\nevaluated both unimodally and multimodally. Our pseudo supervisor model\ndemonstrates superior performance than SOTA SSL comparison models. Moreover,\nour model also achieves the best results on the publicly available LAG fundus\ndataset. Second, we introduce the Harvard Glaucoma Detection and Progression\n(Harvard-GDP) Dataset, a multimodal multitask dataset that includes data from\n1,000 patients with OCT imaging data, as well as labels for glaucoma detection\nand progression. This is the largest glaucoma detection dataset with 3D OCT\nimaging data and the first glaucoma progression forecasting dataset that is\npublicly available. Detailed sex and racial analysis are provided, which can be\nused by interested researchers for fairness learning studies. Our released\ndataset is benchmarked with several SOTA supervised CNN and transformer deep\nlearning models. The dataset and code are made publicly available via\n\\url{https://ophai.hms.harvard.edu/datasets/harvard-gdp1000}.\n","authors":["Yan Luo","Min Shi","Yu Tian","Tobias Elze","Mengyu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13411v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.06714v4","updated":"2023-08-25T14:36:57Z","published":"2023-04-13T17:59:01Z","title":"Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and\n Reconstruction","summary":" 3D-aware image synthesis encompasses a variety of tasks, such as scene\ngeneration and novel view synthesis from images. Despite numerous task-specific\nmethods, developing a comprehensive model remains challenging. In this paper,\nwe present SSDNeRF, a unified approach that employs an expressive diffusion\nmodel to learn a generalizable prior of neural radiance fields (NeRF) from\nmulti-view images of diverse objects. Previous studies have used two-stage\napproaches that rely on pretrained NeRFs as real data to train diffusion\nmodels. In contrast, we propose a new single-stage training paradigm with an\nend-to-end objective that jointly optimizes a NeRF auto-decoder and a latent\ndiffusion model, enabling simultaneous 3D reconstruction and prior learning,\neven from sparsely available views. At test time, we can directly sample the\ndiffusion prior for unconditional generation, or combine it with arbitrary\nobservations of unseen objects for NeRF reconstruction. SSDNeRF demonstrates\nrobust results comparable to or better than leading task-specific methods in\nunconditional generation and single/sparse-view 3D reconstruction.\n","authors":["Hansheng Chen","Jiatao Gu","Anpei Chen","Wei Tian","Zhuowen Tu","Lingjie Liu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2304.06714v4.pdf","comment":"ICCV 2023 final version. Project page:\n https://lakonik.github.io/ssdnerf"},{"id":"http://arxiv.org/abs/2308.13406v1","updated":"2023-08-25T14:33:59Z","published":"2023-08-25T14:33:59Z","title":"Using Visual and Vehicular Sensors for Driver Behavior Analysis: A\n Survey","summary":" Risky drivers account for 70% of fatal accidents in the United States. With\nrecent advances in sensors and intelligent vehicular systems, there has been\nsignificant research on assessing driver behavior to improve driving\nexperiences and road safety. This paper examines the various techniques used to\nanalyze driver behavior using visual and vehicular data, providing an overview\nof the latest research in this field. The paper also discusses the challenges\nand open problems in the field and offers potential recommendations for future\nresearch. The survey concludes that integrating vision and vehicular\ninformation can significantly enhance the accuracy and effectiveness of driver\nbehavior analysis, leading to improved safety measures and reduced traffic\naccidents.\n","authors":["Bikram Adhikari"],"pdf_url":"https://arxiv.org/pdf/2308.13406v1.pdf","comment":"10 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2304.00058v2","updated":"2023-08-25T14:31:35Z","published":"2023-03-31T18:21:09Z","title":"Weakly-Supervised Text-driven Contrastive Learning for Facial Behavior\n Understanding","summary":" Contrastive learning has shown promising potential for learning robust\nrepresentations by utilizing unlabeled data. However, constructing effective\npositive-negative pairs for contrastive learning on facial behavior datasets\nremains challenging. This is because such pairs inevitably encode the\nsubject-ID information, and the randomly constructed pairs may push similar\nfacial images away due to the limited number of subjects in facial behavior\ndatasets. To address this issue, we propose to utilize activity descriptions,\ncoarse-grained information provided in some datasets, which can provide\nhigh-level semantic information about the image sequences but is often\nneglected in previous studies. More specifically, we introduce a two-stage\nContrastive Learning with Text-Embeded framework for Facial behavior\nunderstanding (CLEF). The first stage is a weakly-supervised contrastive\nlearning method that learns representations from positive-negative pairs\nconstructed using coarse-grained activity information. The second stage aims to\ntrain the recognition of facial expressions or facial action units by\nmaximizing the similarity between image and the corresponding text label names.\nThe proposed CLEF achieves state-of-the-art performance on three in-the-lab\ndatasets for AU recognition and three in-the-wild datasets for facial\nexpression recognition.\n","authors":["Xiang Zhang","Taoyue Wang","Xiaotian Li","Huiyuan Yang","Lijun Yin"],"pdf_url":"https://arxiv.org/pdf/2304.00058v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06093v2","updated":"2023-08-25T14:30:45Z","published":"2023-08-11T12:05:12Z","title":"Experts Weights Averaging: A New General Training Scheme for Vision\n Transformers","summary":" Structural re-parameterization is a general training scheme for Convolutional\nNeural Networks (CNNs), which achieves performance improvement without\nincreasing inference cost. As Vision Transformers (ViTs) are gradually\nsurpassing CNNs in various visual tasks, one may question: if a training scheme\nspecifically for ViTs exists that can also achieve performance improvement\nwithout increasing inference cost? Recently, Mixture-of-Experts (MoE) has\nattracted increasing attention, as it can efficiently scale up the capacity of\nTransformers at a fixed cost through sparsely activated experts. Considering\nthat MoE can also be viewed as a multi-branch structure, can we utilize MoE to\nimplement a ViT training scheme similar to structural re-parameterization? In\nthis paper, we affirmatively answer these questions, with a new general\ntraining strategy for ViTs. Specifically, we decouple the training and\ninference phases of ViTs. During training, we replace some Feed-Forward\nNetworks (FFNs) of the ViT with specially designed, more efficient MoEs that\nassign tokens to experts by random uniform partition, and perform Experts\nWeights Averaging (EWA) on these MoEs at the end of each iteration. After\ntraining, we convert each MoE into an FFN by averaging the experts,\ntransforming the model back into original ViT for inference. We further provide\na theoretical analysis to show why and how it works. Comprehensive experiments\nacross various 2D and 3D visual tasks, ViT architectures, and datasets validate\nthe effectiveness and generalizability of the proposed training scheme.\nBesides, our training scheme can also be applied to improve performance when\nfine-tuning ViTs. Lastly, but equally important, the proposed EWA technique can\nsignificantly improve the effectiveness of naive MoE in various 2D visual small\ndatasets and 3D visual tasks.\n","authors":["Yongqi Huang","Peng Ye","Xiaoshui Huang","Sheng Li","Tao Chen","Tong He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2308.06093v2.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.13404v1","updated":"2023-08-25T14:27:03Z","published":"2023-08-25T14:27:03Z","title":"Relighting Neural Radiance Fields with Shadow and Highlight Hints","summary":" This paper presents a novel neural implicit radiance representation for free\nviewpoint relighting from a small set of unstructured photographs of an object\nlit by a moving point light source different from the view position. We express\nthe shape as a signed distance function modeled by a multi layer perceptron. In\ncontrast to prior relightable implicit neural representations, we do not\ndisentangle the different reflectance components, but model both the local and\nglobal reflectance at each point by a second multi layer perceptron that, in\naddition, to density features, the current position, the normal (from the\nsigned distace function), view direction, and light position, also takes shadow\nand highlight hints to aid the network in modeling the corresponding high\nfrequency light transport effects. These hints are provided as a suggestion,\nand we leave it up to the network to decide how to incorporate these in the\nfinal relit result. We demonstrate and validate our neural implicit\nrepresentation on synthetic and real scenes exhibiting a wide variety of\nshapes, material properties, and global illumination light transport.\n","authors":["Chong Zeng","Guojun Chen","Yue Dong","Pieter Peers","Hongzhi Wu","Xin Tong"],"pdf_url":"https://arxiv.org/pdf/2308.13404v1.pdf","comment":"Accepted to SIGGRAPH 2023. Author's version. Project page:\n https://nrhints.github.io/"},{"id":"http://arxiv.org/abs/2308.13392v1","updated":"2023-08-25T14:08:07Z","published":"2023-08-25T14:08:07Z","title":"Self-Supervised Representation Learning with Cross-Context Learning\n between Global and Hypercolumn Features","summary":" Whilst contrastive learning yields powerful representations by matching\ndifferent augmented views of the same instance, it lacks the ability to capture\nthe similarities between different instances. One popular way to address this\nlimitation is by learning global features (after the global pooling) to capture\ninter-instance relationships based on knowledge distillation, where the global\nfeatures of the teacher are used to guide the learning of the global features\nof the student. Inspired by cross-modality learning, we extend this existing\nframework that only learns from global features by encouraging the global\nfeatures and intermediate layer features to learn from each other. This leads\nto our novel self-supervised framework: cross-context learning between global\nand hypercolumn features (CGH), that enforces the consistency of instance\nrelations between low- and high-level semantics. Specifically, we stack the\nintermediate feature maps to construct a hypercolumn representation so that we\ncan measure instance relations using two contexts (hypercolumn and global\nfeature) separately, and then use the relations of one context to guide the\nlearning of the other. This cross-context learning allows the model to learn\nfrom the differences between the two contexts. The experimental results on\nlinear classification and downstream tasks show that our method outperforms the\nstate-of-the-art methods.\n","authors":["Zheng Gao","Chen Feng","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2308.13392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13388v1","updated":"2023-08-25T14:04:17Z","published":"2023-08-25T14:04:17Z","title":"Direction-aware Video Demoireing with Temporal-guided Bilateral Learning","summary":" Moire patterns occur when capturing images or videos on screens, severely\ndegrading the quality of the captured images or videos. Despite the recent\nprogresses, existing video demoireing methods neglect the physical\ncharacteristics and formation process of moire patterns, significantly limiting\nthe effectiveness of video recovery. This paper presents a unified framework,\nDTNet, a direction-aware and temporal-guided bilateral learning network for\nvideo demoireing. DTNet effectively incorporates the process of moire pattern\nremoval, alignment, color correction, and detail refinement. Our proposed DTNet\ncomprises two primary stages: Frame-level Direction-aware Demoireing and\nAlignment (FDDA) and Tone and Detail Refinement (TDR). In FDDA, we employ\nmultiple directional DCT modes to perform the moire pattern removal process in\nthe frequency domain, effectively detecting the prominent moire edges. Then,\nthe coarse and fine-grained alignment is applied on the demoired features for\nfacilitating the utilization of neighboring information. In TDR, we propose a\ntemporal-guided bilateral learning pipeline to mitigate the degradation of\ncolor and details caused by the moire patterns while preserving the restored\nfrequency information in FDDA. Guided by the aligned temporal features from\nFDDA, the affine transformations for the recovery of the ultimate clean frames\nare learned in TDR. Extensive experiments demonstrate that our video demoireing\nmethod outperforms state-of-the-art approaches by 2.3 dB in PSNR, and also\ndelivers a superior visual experience.\n","authors":["Shuning Xu","Binbin Song","Xiangyu Chen","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.13388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09158v3","updated":"2023-08-25T13:59:08Z","published":"2023-07-18T11:35:57Z","title":"Class-relation Knowledge Distillation for Novel Class Discovery","summary":" We tackle the problem of novel class discovery, which aims to learn novel\nclasses without supervision based on labeled data from known classes. A key\nchallenge lies in transferring the knowledge in the known-class data to the\nlearning of novel classes. Previous methods mainly focus on building a shared\nrepresentation space for knowledge transfer and often ignore modeling class\nrelations. To address this, we introduce a class relation representation for\nthe novel classes based on the predicted class distribution of a model trained\non known classes. Empirically, we find that such class relation becomes less\ninformative during typical discovery training. To prevent such information\nloss, we propose a novel knowledge distillation framework, which utilizes our\nclass-relation representation to regularize the learning of novel classes. In\naddition, to enable a flexible knowledge distillation scheme for each data\npoint in novel classes, we develop a learnable weighting function for the\nregularization, which adaptively promotes knowledge transfer based on the\nsemantic similarity between the novel and known classes. To validate the\neffectiveness and generalization of our method, we conduct extensive\nexperiments on multiple benchmarks, including CIFAR100, Stanford Cars, CUB, and\nFGVC-Aircraft datasets. Our results demonstrate that the proposed method\noutperforms the previous state-of-the-art methods by a significant margin on\nalmost all benchmarks. Code is available at\n\\href{https://github.com/kleinzcy/Cr-KD-NCD}{here}.\n","authors":["Peiyan Gu","Chuyu Zhang","Ruijie Xu","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2307.09158v3.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.11561v3","updated":"2023-08-25T13:57:28Z","published":"2023-08-22T16:45:35Z","title":"Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog\n Navigation","summary":" This report details the methods of the winning entry of the AVDN Challenge in\nICCV CLVL 2023. The competition addresses the Aerial Navigation from Dialog\nHistory (ANDH) task, which requires a drone agent to associate dialog history\nwith aerial observations to reach the destination. For better cross-modal\ngrounding abilities of the drone agent, we propose a Target-Grounded\nGraph-Aware Transformer (TG-GAT) framework. Concretely, TG-GAT first leverages\na graph-aware transformer to capture spatiotemporal dependency, which benefits\nnavigation state tracking and robust action planning. In addition,an auxiliary\nvisual grounding task is devised to boost the agent's awareness of referred\nlandmarks. Moreover, a hybrid augmentation strategy based on large language\nmodels is utilized to mitigate data scarcity limitations. Our TG-GAT framework\nwon the AVDN Challenge, with 2.2% and 3.0% absolute improvements over the\nbaseline on SPL and SR metrics, respectively. The code is available at\nhttps://github.com/yifeisu/TG-GAT.\n","authors":["Yifei Su","Dong An","Yuan Xu","Kehan Chen","Yan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11561v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02989v3","updated":"2023-08-25T13:57:03Z","published":"2023-08-06T02:15:19Z","title":"Novel Class Discovery for Long-tailed Recognition","summary":" While the novel class discovery has recently made great progress, existing\nmethods typically focus on improving algorithms on class-balanced benchmarks.\nHowever, in real-world recognition tasks, the class distributions of their\ncorresponding datasets are often imbalanced, which leads to serious performance\ndegeneration of those methods. In this paper, we consider a more realistic\nsetting for novel class discovery where the distributions of novel and known\nclasses are long-tailed. One main challenge of this new problem is to discover\nimbalanced novel classes with the help of long-tailed known classes. To tackle\nthis problem, we propose an adaptive self-labeling strategy based on an\nequiangular prototype representation of classes. Our method infers high-quality\npseudo-labels for the novel classes by solving a relaxed optimal transport\nproblem and effectively mitigates the class biases in learning the known and\nnovel classes. We perform extensive experiments on CIFAR100, ImageNet100,\nHerbarium19 and large-scale iNaturalist18 datasets, and the results demonstrate\nthe superiority of our method. Our code is available at\nhttps://github.com/kleinzcy/NCDLR.\n","authors":["Chuyu Zhang","Ruijie Xu","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2308.02989v3.pdf","comment":"TMLR2023, Final version"},{"id":"http://arxiv.org/abs/2308.13382v1","updated":"2023-08-25T13:52:05Z","published":"2023-08-25T13:52:05Z","title":"Prompting Visual-Language Models for Dynamic Facial Expression\n Recognition","summary":" This paper presents a novel visual-language model called DFER-CLIP, which is\nbased on the CLIP model and designed for in-the-wild Dynamic Facial Expression\nRecognition (DFER). Specifically, the proposed DFER-CLIP consists of a visual\npart and a textual part. For the visual part, based on the CLIP image encoder,\na temporal model consisting of several Transformer encoders is introduced for\nextracting temporal facial expression features, and the final feature embedding\nis obtained as a learnable \"class\" token. For the textual part, we use as\ninputs textual descriptions of the facial behaviour that is related to the\nclasses (facial expressions) that we are interested in recognising -- those\ndescriptions are generated using large language models, like ChatGPT. This, in\ncontrast to works that use only the class names and more accurately captures\nthe relationship between them. Alongside the textual description, we introduce\na learnable token which helps the model learn relevant context information for\neach expression during training. Extensive experiments demonstrate the\neffectiveness of the proposed method and show that our DFER-CLIP also achieves\nstate-of-the-art results compared with the current supervised DFER methods on\nthe DFEW, FERV39k, and MAFW benchmarks. Code is publicly available at\nhttps://github.com/zengqunzhao/DFER-CLIP.\n","authors":["Zengqun Zhao","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2308.13382v1.pdf","comment":"Accepted at BMVC 2023"},{"id":"http://arxiv.org/abs/2308.06689v2","updated":"2023-08-25T13:51:25Z","published":"2023-08-13T05:38:47Z","title":"Estimator Meets Equilibrium Perspective: A Rectified Straight Through\n Estimator for Binary Neural Networks Training","summary":" Binarization of neural networks is a dominant paradigm in neural networks\ncompression. The pioneering work BinaryConnect uses Straight Through Estimator\n(STE) to mimic the gradients of the sign function, but it also causes the\ncrucial inconsistency problem. Most of the previous methods design different\nestimators instead of STE to mitigate it. However, they ignore the fact that\nwhen reducing the estimating error, the gradient stability will decrease\nconcomitantly. These highly divergent gradients will harm the model training\nand increase the risk of gradient vanishing and gradient exploding. To fully\ntake the gradient stability into consideration, we present a new perspective to\nthe BNNs training, regarding it as the equilibrium between the estimating error\nand the gradient stability. In this view, we firstly design two indicators to\nquantitatively demonstrate the equilibrium phenomenon. In addition, in order to\nbalance the estimating error and the gradient stability well, we revise the\noriginal straight through estimator and propose a power function based\nestimator, Rectified Straight Through Estimator (ReSTE for short). Comparing to\nother estimators, ReSTE is rational and capable of flexibly balancing the\nestimating error with the gradient stability. Extensive experiments on CIFAR-10\nand ImageNet datasets show that ReSTE has excellent performance and surpasses\nthe state-of-the-art methods without any auxiliary modules or losses.\n","authors":["Xiao-Ming Wu","Dian Zheng","Zuhao Liu","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.06689v2.pdf","comment":"10 pages, 6 figures. Accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2112.06074v3","updated":"2023-08-25T13:48:53Z","published":"2021-12-11T21:28:50Z","title":"Early Stopping for Deep Image Prior","summary":" Deep image prior (DIP) and its variants have showed remarkable potential for\nsolving inverse problems in computer vision, without any extra training data.\nPractical DIP models are often substantially overparameterized. During the\nfitting process, these models learn mostly the desired visual content first,\nand then pick up the potential modeling and observational noise, i.e.,\noverfitting. Thus, the practicality of DIP often depends critically on good\nearly stopping (ES) that captures the transition period. In this regard, the\nmajority of DIP works for vision tasks only demonstrates the potential of the\nmodels -- reporting the peak performance against the ground truth, but provides\nno clue about how to operationally obtain near-peak performance without access\nto the groundtruth. In this paper, we set to break this practicality barrier of\nDIP, and propose an efficient ES strategy, which consistently detects near-peak\nperformance across several vision tasks and DIP variants. Based on a simple\nmeasure of dispersion of consecutive DIP reconstructions, our ES method not\nonly outpaces the existing ones -- which only work in very narrow domains, but\nalso remains effective when combined with a number of methods that try to\nmitigate the overfitting. The code is available at\nhttps://github.com/sun-umn/Early_Stopping_for_DIP.\n","authors":["Hengkang Wang","Taihui Li","Zhong Zhuang","Tiancong Chen","Hengyue Liang","Ju Sun"],"pdf_url":"https://arxiv.org/pdf/2112.06074v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13373v1","updated":"2023-08-25T13:33:56Z","published":"2023-08-25T13:33:56Z","title":"Enhanced Mortality Prediction In Patients With Subarachnoid Haemorrhage\n Using A Deep Learning Model Based On The Initial CT Scan","summary":" PURPOSE: Subarachnoid hemorrhage (SAH) entails high morbidity and mortality\nrates. Convolutional neural networks (CNN), a form of deep learning, are\ncapable of generating highly accurate predictions from imaging data. Our\nobjective was to predict mortality in SAH patients by processing the initial CT\nscan on a CNN based algorithm.\n METHODS: Retrospective multicentric study of a consecutive cohort of patients\nwith SAH between 2011-2022. Demographic, clinical and radiological variables\nwere analyzed. Pre-processed baseline CT scan images were used as the input for\ntraining a CNN using AUCMEDI Framework. Our model's architecture leverages the\nDenseNet-121 structure, employing transfer learning principles. The output\nvariable was mortality in the first three months. Performance of the model was\nevaluated by statistical parameters conventionally used in studies involving\nartificial intelligence methods.\n RESULTS: Images from 219 patients were processed, 175 for training and\nvalidation of the CNN and 44 for its evaluation. 52%(115/219) of patients were\nfemale, and the median age was 58(SD=13.06) years. 18.5%(39/219) were\nidiopathic SAH. Mortality rate was 28.5%(63/219). The model showed good\naccuracy at predicting mortality in SAH patients exclusively using the images\nof the initial CT scan (Accuracy=74%, F1=75% and AUC=82%). CONCLUSION: Modern\nimage processing techniques based on AI and CNN make possible to predict\nmortality in SAH patients with high accuracy using CT scan images as the only\ninput. These models might be optimized by including more data and patients\nresulting in better training, development and performance on tasks which are\nbeyond the skills of conventional clinical knowledge.\n","authors":["Sergio Garcia-Garcia","Santiago Cepeda","Dominik Muller","Alejandra Mosteiro","Ramon Torne","Silvia Agudo","Natalia de la Torre","Ignacio Arrese","Rosario Sarabia"],"pdf_url":"https://arxiv.org/pdf/2308.13373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13369v1","updated":"2023-08-25T13:29:31Z","published":"2023-08-25T13:29:31Z","title":"Distribution-Aligned Diffusion for Human Mesh Recovery","summary":" Recovering a 3D human mesh from a single RGB image is a challenging task due\nto depth ambiguity and self-occlusion, resulting in a high degree of\nuncertainty. Meanwhile, diffusion models have recently seen much success in\ngenerating high-quality outputs by progressively denoising noisy inputs.\nInspired by their capability, we explore a diffusion-based approach for human\nmesh recovery, and propose a Human Mesh Diffusion (HMDiff) framework which\nframes mesh recovery as a reverse diffusion process. We also propose a\nDistribution Alignment Technique (DAT) that injects input-specific distribution\ninformation into the diffusion process, and provides useful prior knowledge to\nsimplify the mesh recovery task. Our method achieves state-of-the-art\nperformance on three widely used datasets. Project page:\nhttps://gongjia0208.github.io/HMDiff/.\n","authors":["Lin Geng Foo","Jia Gong","Hossein Rahmani","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13369v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13367v1","updated":"2023-08-25T13:25:27Z","published":"2023-08-25T13:25:27Z","title":"Burnt area extraction from high-resolution satellite images based on\n anomaly detection","summary":" Wildfire detection using satellite images is a widely studied task in remote\nsensing with many applications to fire delineation and mapping. Recently, deep\nlearning methods have become a scalable solution to automate this task,\nespecially in the field of unsupervised learning where no training data is\navailable. This is particularly important in the context of emergency risk\nmonitoring where fast and effective detection is needed, generally based on\nhigh-resolution satellite data. Among various approaches, Anomaly Detection\n(AD) appears to be highly potential thanks to its broad applications in\ncomputer vision, medical imaging, as well as remote sensing. In this work, we\nbuild upon the framework of Vector Quantized Variational Autoencoder (VQ-VAE),\na popular reconstruction-based AD method with discrete latent spaces, to\nperform unsupervised burnt area extraction. We integrate VQ-VAE into an\nend-to-end framework with an intensive post-processing step using dedicated\nvegetation, water and brightness indexes. Our experiments conducted on\nhigh-resolution SPOT-6/7 images provide promising results of the proposed\ntechnique, showing its high potential in future research on unsupervised burnt\narea extraction.\n","authors":["Oscar David Rafael Narvaez Luces","Minh-Tan Pham","Quentin Poterek","Rémi Braun"],"pdf_url":"https://arxiv.org/pdf/2308.13367v1.pdf","comment":"10 pages, accepted to the MACLEAN workshop of ECML/PKDD 2023"},{"id":"http://arxiv.org/abs/2308.13363v1","updated":"2023-08-25T13:18:14Z","published":"2023-08-25T13:18:14Z","title":"CS-Mixer: A Cross-Scale Vision MLP Model with Spatial-Channel Mixing","summary":" Despite their simpler information fusion designs compared with Vision\nTransformers and Convolutional Neural Networks, Vision MLP architectures have\ndemonstrated strong performance and high data efficiency in recent research.\nHowever, existing works such as CycleMLP and Vision Permutator typically model\nspatial information in equal-size spatial regions and do not consider\ncross-scale spatial interactions. Further, their token mixers only model 1- or\n2-axis correlations, avoiding 3-axis spatial-channel mixing due to its\ncomputational demands. We therefore propose CS-Mixer, a hierarchical Vision MLP\nthat learns dynamic low-rank transformations for spatial-channel mixing through\ncross-scale local and global aggregation. The proposed methodology achieves\ncompetitive results on popular image recognition benchmarks without incurring\nsubstantially more compute. Our largest model, CS-Mixer-L, reaches 83.2% top-1\naccuracy on ImageNet-1k with 13.7 GFLOPs and 94 M parameters.\n","authors":["Jonathan Cui","David A. Araujo","Suman Saha","Md. Faisal Kabir"],"pdf_url":"https://arxiv.org/pdf/2308.13363v1.pdf","comment":"8 page, 5 figures, developed under Penn State University's\n Multi-Campus Research Experience for Undergraduates Symposium, 2023"},{"id":"http://arxiv.org/abs/2209.05160v3","updated":"2023-08-25T13:17:46Z","published":"2022-09-12T11:34:57Z","title":"Prototypical few-shot segmentation for cross-institution male pelvic\n structures with spatial registration","summary":" The prowess that makes few-shot learning desirable in medical image analysis\nis the efficient use of the support image data, which are labelled to classify\nor segment new classes, a task that otherwise requires substantially more\ntraining images and expert annotations. This work describes a fully 3D\nprototypical few-shot segmentation algorithm, such that the trained networks\ncan be effectively adapted to clinically interesting structures that are absent\nin training, using only a few labelled images from a different institute.\nFirst, to compensate for the widely recognised spatial variability between\ninstitutions in episodic adaptation of novel classes, a novel spatial\nregistration mechanism is integrated into prototypical learning, consisting of\na segmentation head and an spatial alignment module. Second, to assist the\ntraining with observed imperfect alignment, support mask conditioning module is\nproposed to further utilise the annotation available from the support images.\nExtensive experiments are presented in an application of segmenting eight\nanatomical structures important for interventional planning, using a data set\nof 589 pelvic T2-weighted MR images, acquired at seven institutes. The results\ndemonstrate the efficacy in each of the 3D formulation, the spatial\nregistration, and the support mask conditioning, all of which made positive\ncontributions independently or collectively. Compared with the previously\nproposed 2D alternatives, the few-shot segmentation performance was improved\nwith statistical significance, regardless whether the support data come from\nthe same or different institutes.\n","authors":["Yiwen Li","Yunguan Fu","Iani Gayo","Qianye Yang","Zhe Min","Shaheer Saeed","Wen Yan","Yipei Wang","J. Alison Noble","Mark Emberton","Matthew J. Clarkson","Henkjan Huisman","Dean Barratt","Victor Adrian Prisacariu","Yipeng Hu"],"pdf_url":"https://arxiv.org/pdf/2209.05160v3.pdf","comment":"accepted by Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.13356v1","updated":"2023-08-25T13:05:06Z","published":"2023-08-25T13:05:06Z","title":"CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions\n of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and\n Classification from Ultrasound Images","summary":" Undoubtedly breast cancer identifies itself as one of the most widespread and\nterrifying cancers across the globe. Millions of women are getting affected\neach year from it. Breast cancer remains the major one for being the reason of\nlargest number of demise of women. In the recent time of research, Medical\nImage Computing and Processing has been playing a significant role for\ndetecting and classifying breast cancers from ultrasound images and mammograms,\nalong with the celestial touch of deep neural networks. In this research, we\nfocused mostly on our rigorous implementations and iterative result analysis of\ndifferent cutting-edge modified versions of EfficientNet architectures namely\nEfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3) with ultrasound image,\nnamed as CEIMVEN. We utilized transfer learning approach here for using the\npre-trained models of EfficientNet versions. We activated the hyper-parameter\ntuning procedures, added fully connected layers, discarded the unprecedented\noutliers and recorded the accuracy results from our custom modified\nEfficientNet architectures. Our deep learning model training approach was\nrelated to both identifying the cancer affected areas with region of interest\n(ROI) techniques and multiple classifications (benign, malignant and normal).\nThe approximate testing accuracies we got from the modified versions of\nEfficientNet-V1 (b0- 99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%,\nb5- 97.72%, b6- 97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1-\n99.01%, b2- 98.72%, b3- 99.43%) are showing very bright future and strong\npotentials of deep learning approach for the successful detection and\nclassification of breast cancers from the ultrasound images at a very early\nstage.\n","authors":["Sheekar Banerjee","Md. Kamrul Hasan Monir"],"pdf_url":"https://arxiv.org/pdf/2308.13356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02774v2","updated":"2023-08-25T13:01:20Z","published":"2023-05-04T12:20:51Z","title":"Spatial and Modal Optimal Transport for Fast Cross-Modal MRI\n Reconstruction","summary":" Multi-modal magnetic resonance imaging (MRI) plays a crucial role in\ncomprehensive disease diagnosis in clinical medicine. However, acquiring\ncertain modalities, such as T2-weighted images (T2WIs), is time-consuming and\nprone to be with motion artifacts. It negatively impacts subsequent multi-modal\nimage analysis. To address this issue, we propose an end-to-end deep learning\nframework that utilizes T1-weighted images (T1WIs) as auxiliary modalities to\nexpedite T2WIs' acquisitions. While image pre-processing is capable of\nmitigating misalignment, improper parameter selection leads to adverse\npre-processing effects, requiring iterative experimentation and adjustment. To\novercome this shortage, we employ Optimal Transport (OT) to synthesize T2WIs by\naligning T1WIs and performing cross-modal synthesis, effectively mitigating\nspatial misalignment effects. Furthermore, we adopt an alternating iteration\nframework between the reconstruction task and the cross-modal synthesis task to\noptimize the final results. Then, we prove that the reconstructed T2WIs and the\nsynthetic T2WIs become closer on the T2 image manifold with iterations\nincreasing, and further illustrate that the improved reconstruction result\nenhances the synthesis process, whereas the enhanced synthesis result improves\nthe reconstruction process. Finally, experimental results from FastMRI and\ninternal datasets confirm the effectiveness of our method, demonstrating\nsignificant improvements in image reconstruction quality even at low sampling\nrates.\n","authors":["Qi Wang","Zhijie Wen","Jun Shi","Qian Wang","Dinggang Shen","Shihui Ying"],"pdf_url":"https://arxiv.org/pdf/2305.02774v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13343v1","updated":"2023-08-25T12:30:48Z","published":"2023-08-25T12:30:48Z","title":"Squeeze aggregated excitation network","summary":" Convolutional neural networks have spatial representations which read\npatterns in the vision tasks. Squeeze and excitation links the channel wise\nrepresentations by explicitly modeling on channel level. Multi layer\nperceptrons learn global representations and in most of the models it is used\noften at the end after all convolutional layers to gather all the information\nlearned before classification. We propose a method of inducing the global\nrepresentations within channels to have better performance of the model. We\npropose SaEnet, Squeeze aggregated excitation network, for learning global\nchannelwise representation in between layers. The proposed module takes\nadvantage of passing important information after squeeze by having aggregated\nexcitation before regaining its shape. We also introduce a new idea of having a\nmultibranch linear(dense) layer in the network. This learns global\nrepresentations from the condensed information which enhances the\nrepresentational power of the network. The proposed module have undergone\nextensive experiments by using Imagenet and CIFAR100 datasets and compared with\nclosely related architectures. The analyzes results that proposed models\noutputs are comparable and in some cases better than existing state of the art\narchitectures.\n","authors":["Mahendran N"],"pdf_url":"https://arxiv.org/pdf/2308.13343v1.pdf","comment":"8 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.12898v2","updated":"2023-08-25T12:22:53Z","published":"2023-08-24T16:17:40Z","title":"Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language\n Pretraining?","summary":" The multimedia community has shown a significant interest in perceiving and\nrepresenting the physical world with multimodal pretrained neural network\nmodels, and among them, the visual-language pertaining (VLP) is, currently, the\nmost captivating topic. However, there have been few endeavors dedicated to the\nexploration of 1) whether essential linguistic knowledge (e.g., semantics and\nsyntax) can be extracted during VLP, and 2) how such linguistic knowledge\nimpact or enhance the multimodal alignment. In response, here we aim to\nelucidate the impact of comprehensive linguistic knowledge, including semantic\nexpression and syntactic structure, on multimodal alignment. Specifically, we\ndesign and release the SNARE, the first large-scale multimodal alignment\nprobing benchmark, to detect the vital linguistic components, e.g., lexical,\nsemantic, and syntax knowledge, containing four tasks: Semantic structure,\nNegation logic, Attribute ownership, and Relationship composition. Based on our\nproposed probing benchmarks, our holistic analyses of five advanced VLP models\nillustrate that the VLP model: i) shows insensitivity towards complex syntax\nstructures and relies on content words for sentence comprehension; ii)\ndemonstrates limited comprehension of combinations between sentences and\nnegations; iii) faces challenges in determining the presence of actions or\nspatial relationships within visual information and struggles with verifying\nthe correctness of triple combinations. We make our benchmark and code\navailable at \\url{https://github.com/WangFei-2019/SNARE/}.\n","authors":["Fei Wang","Liang Ding","Jun Rao","Ye Liu","Li Shen","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2308.12898v2.pdf","comment":"[TL;DR] we design and release the SNARE, the first large-scale\n multimodal alignment probing benchmark for current vision-language pretrained\n models"},{"id":"http://arxiv.org/abs/2308.13340v1","updated":"2023-08-25T12:19:51Z","published":"2023-08-25T12:19:51Z","title":"TriGait: Aligning and Fusing Skeleton and Silhouette Gait Data via a\n Tri-Branch Network","summary":" Gait recognition is a promising biometric technology for identification due\nto its non-invasiveness and long-distance. However, external variations such as\nclothing changes and viewpoint differences pose significant challenges to gait\nrecognition. Silhouette-based methods preserve body shape but neglect internal\nstructure information, while skeleton-based methods preserve structure\ninformation but omit appearance. To fully exploit the complementary nature of\nthe two modalities, a novel triple branch gait recognition framework, TriGait,\nis proposed in this paper. It effectively integrates features from the skeleton\nand silhouette data in a hybrid fusion manner, including a two-stream network\nto extract static and motion features from appearance, a simple yet effective\nmodule named JSA-TC to capture dependencies between all joints, and a third\nbranch for cross-modal learning by aligning and fusing low-level features of\ntwo modalities. Experimental results demonstrate the superiority and\neffectiveness of TriGait for gait recognition. The proposed method achieves a\nmean rank-1 accuracy of 96.0% over all conditions on CASIA-B dataset and 94.3%\naccuracy for CL, significantly outperforming all the state-of-the-art methods.\nThe source code will be available at https://github.com/feng-xueling/TriGait/.\n","authors":["Yan Sun","Xueling Feng","Liyan Ma","Long Hu","Mark Nixon"],"pdf_url":"https://arxiv.org/pdf/2308.13340v1.pdf","comment":"Accepted by IJCB 2023"},{"id":"http://arxiv.org/abs/2308.13331v1","updated":"2023-08-25T12:06:00Z","published":"2023-08-25T12:06:00Z","title":"A Re-Parameterized Vision Transformer (ReVT) for Domain-Generalized\n Semantic Segmentation","summary":" The task of semantic segmentation requires a model to assign semantic labels\nto each pixel of an image. However, the performance of such models degrades\nwhen deployed in an unseen domain with different data distributions compared to\nthe training domain. We present a new augmentation-driven approach to domain\ngeneralization for semantic segmentation using a re-parameterized vision\ntransformer (ReVT) with weight averaging of multiple models after training. We\nevaluate our approach on several benchmark datasets and achieve\nstate-of-the-art mIoU performance of 47.3% (prior art: 46.3%) for small models\nand of 50.1% (prior art: 47.8%) for midsized models on commonly used benchmark\ndatasets. At the same time, our method requires fewer parameters and reaches a\nhigher frame rate than the best prior art. It is also easy to implement and,\nunlike network ensembles, does not add any computational complexity during\ninference.\n","authors":["Jan-Aike Termöhlen","Timo Bartels","Tim Fingscheidt"],"pdf_url":"https://arxiv.org/pdf/2308.13331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13327v1","updated":"2023-08-25T12:01:24Z","published":"2023-08-25T12:01:24Z","title":"3D Face Alignment Through Fusion of Head Pose Information and Features","summary":" The ability of humans to infer head poses from face shapes, and vice versa,\nindicates a strong correlation between the two. Accordingly, recent studies on\nface alignment have employed head pose information to predict facial landmarks\nin computer vision tasks. In this study, we propose a novel method that employs\nhead pose information to improve face alignment performance by fusing said\ninformation with the feature maps of a face alignment network, rather than\nsimply using it to initialize facial landmarks. Furthermore, the proposed\nnetwork structure performs robust face alignment through a dual-dimensional\nnetwork using multidimensional features represented by 2D feature maps and a 3D\nheatmap. For effective dense face alignment, we also propose a prediction\nmethod for facial geometric landmarks through training based on knowledge\ndistillation using predicted keypoints. We experimentally assessed the\ncorrelation between the predicted facial landmarks and head pose information,\nas well as variations in the accuracy of facial landmarks with respect to the\nquality of head pose information. In addition, we demonstrated the\neffectiveness of the proposed method through a competitive performance\ncomparison with state-of-the-art methods on the AFLW2000-3D, AFLW, and BIWI\ndatasets.\n","authors":["Jaehyun So","Youngjoon Han"],"pdf_url":"https://arxiv.org/pdf/2308.13327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13324v1","updated":"2023-08-25T11:58:25Z","published":"2023-08-25T11:58:25Z","title":"ConSlide: Asynchronous Hierarchical Interaction Transformer with\n Breakup-Reorganize Rehearsal for Continual Whole Slide Image Analysis","summary":" Whole slide image (WSI) analysis has become increasingly important in the\nmedical imaging community, enabling automated and objective diagnosis,\nprognosis, and therapeutic-response prediction. However, in clinical practice,\nthe ever-evolving environment hamper the utility of WSI analysis models. In\nthis paper, we propose the FIRST continual learning framework for WSI analysis,\nnamed ConSlide, to tackle the challenges of enormous image size, utilization of\nhierarchical structure, and catastrophic forgetting by progressive model\nupdating on multiple sequential datasets. Our framework contains three key\ncomponents. The Hierarchical Interaction Transformer (HIT) is proposed to model\nand utilize the hierarchical structural knowledge of WSI. The\nBreakup-Reorganize (BuRo) rehearsal method is developed for WSI data replay\nwith efficient region storing buffer and WSI reorganizing operation. The\nasynchronous updating mechanism is devised to encourage the network to learn\ngeneric and specific knowledge respectively during the replay stage, based on a\nnested cross-scale similarity learning (CSSL) module. We evaluated the proposed\nConSlide on four public WSI datasets from TCGA projects. It performs best over\nother state-of-the-art methods with a fair WSI-based continual learning setting\nand achieves a better trade-off of the overall performance and forgetting on\nprevious task\n","authors":["Yanyan Huang","Weiqin Zhao","Shujun Wang","Yu Fu","Yuming Jiang","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.13324v1.pdf","comment":"To be appeared in ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13323v1","updated":"2023-08-25T11:53:00Z","published":"2023-08-25T11:53:00Z","title":"SVQNet: Sparse Voxel-Adjacent Query Network for 4D Spatio-Temporal LiDAR\n Semantic Segmentation","summary":" LiDAR-based semantic perception tasks are critical yet challenging for\nautonomous driving. Due to the motion of objects and static/dynamic occlusion,\ntemporal information plays an essential role in reinforcing perception by\nenhancing and completing single-frame knowledge. Previous approaches either\ndirectly stack historical frames to the current frame or build a 4D\nspatio-temporal neighborhood using KNN, which duplicates computation and\nhinders realtime performance. Based on our observation that stacking all the\nhistorical points would damage performance due to a large amount of redundant\nand misleading information, we propose the Sparse Voxel-Adjacent Query Network\n(SVQNet) for 4D LiDAR semantic segmentation. To take full advantage of the\nhistorical frames high-efficiently, we shunt the historical points into two\ngroups with reference to the current points. One is the Voxel-Adjacent\nNeighborhood carrying local enhancing knowledge. The other is the Historical\nContext completing the global knowledge. Then we propose new modules to select\nand extract the instructive features from the two groups. Our SVQNet achieves\nstate-of-the-art performance in LiDAR semantic segmentation of the\nSemanticKITTI benchmark and the nuScenes dataset.\n","authors":["Xuechao Chen","Shuangjie Xu","Xiaoyi Zou","Tongyi Cao","Dit-Yan Yeung","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2308.13323v1.pdf","comment":"Received by ICCV2023"},{"id":"http://arxiv.org/abs/2308.13320v1","updated":"2023-08-25T11:49:51Z","published":"2023-08-25T11:49:51Z","title":"Fine-tuning can cripple your foundation model; preserving features may\n be the solution","summary":" Pre-trained foundation models, owing primarily to their enormous capacity and\nexposure to vast amount of training data scraped from the internet, enjoy the\nadvantage of storing knowledge about plenty of real-world concepts. Such models\nare typically fine-tuned on downstream datasets to produce remarkable\nstate-of-the-art performances. While various fine-tuning methods have been\ndevised and are shown to be highly effective, we observe that a fine-tuned\nmodel's ability to recognize concepts on tasks $\\textit{different}$ from the\ndownstream one is reduced significantly compared to its pre-trained\ncounterpart. This is clearly undesirable as a huge amount of time and money\nwent into learning those very concepts in the first place. We call this\nundesirable phenomenon \"concept forgetting\" and via experiments show that most\nend-to-end fine-tuning approaches suffer heavily from this side effect. To this\nend, we also propose a rather simple fix to this problem by designing a method\ncalled LDIFS (short for $\\ell_2$ distance in feature space) that simply\npreserves the features of the original foundation model during fine-tuning. We\nshow that LDIFS significantly reduces concept forgetting without having\nnoticeable impact on the downstream task performance.\n","authors":["Jishnu Mukhoti","Yarin Gal","Philip H. S. Torr","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2308.13320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.15457v2","updated":"2023-08-25T11:45:02Z","published":"2022-10-27T14:15:13Z","title":"One-Class Risk Estimation for One-Class Hyperspectral Image\n Classification","summary":" Hyperspectral imagery (HSI) one-class classification is aimed at identifying\na single target class from the HSI by using only knowing positive data, which\ncan significantly reduce the requirements for annotation. However, when\none-class classification meets HSI, it is difficult for classifiers to find a\nbalance between the overfitting and underfitting of positive data due to the\nproblems of distribution overlap and distribution imbalance. Although deep\nlearning-based methods are currently the mainstream to overcome distribution\noverlap in HSI multiclassification, few studies focus on deep learning-based\nHSI one-class classification. In this article, a weakly supervised deep HSI\none-class classifier, namely, HOneCls, is proposed, where a risk estimator,the\none-class risk estimator, is particularly introduced to make the fully\nconvolutional neural network (FCN) with the ability of one class classification\nin the case of distribution imbalance. Extensive experiments (20 tasks in\ntotal) were conducted to demonstrate the superiority of the proposed\nclassifier.\n","authors":["Hengwei Zhao","Yanfei Zhong","Xinyu Wang","Hong Shu"],"pdf_url":"https://arxiv.org/pdf/2210.15457v2.pdf","comment":"Accepted by TGRS"},{"id":"http://arxiv.org/abs/2211.10995v2","updated":"2023-08-25T11:34:23Z","published":"2022-11-20T15:15:57Z","title":"Distinctive Self-Similar Object Detection","summary":" Deep learning-based object detection has demonstrated a significant presence\nin the practical applications of artificial intelligence. However, objects such\nas fire and smoke, pose challenges to object detection because of their\nnon-solid and various shapes, and consequently difficult to truly meet\nrequirements in practical fire prevention and control. In this paper, we\npropose that the distinctive fractal feature of self-similar in fire and smoke\ncan relieve us from struggling with their various shapes. To our best\nknowledge, we are the first to discuss this problem. In order to evaluate the\nself-similarity of the fire and smoke and improve the precision of object\ndetection, we design a semi-supervised method that use Hausdorff distance to\ndescribe the resemblance between instances. Besides, based on the concept of\nself-similar, we have devised a novel methodology for evaluating this\nparticular task in a more equitable manner. We have meticulously designed our\nnetwork architecture based on well-established and representative baseline\nnetworks such as YOLO and Faster R-CNN. Our experiments have been conducted on\npublicly available fire and smoke detection datasets, which we have thoroughly\nverified to ensure the validity of our approach. As a result, we have observed\nsignificant improvements in the detection accuracy.\n","authors":["Zeyu Shangguan","Bocheng Hu","Guohua Dai","Yuyu Liu","Darun Tang","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2211.10995v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1911.06968v2","updated":"2023-08-25T11:19:54Z","published":"2019-11-16T05:57:16Z","title":"Defensive Few-shot Learning","summary":" This paper investigates a new challenging problem called defensive few-shot\nlearning in order to learn a robust few-shot model against adversarial attacks.\nSimply applying the existing adversarial defense methods to few-shot learning\ncannot effectively solve this problem. This is because the commonly assumed\nsample-level distribution consistency between the training and test sets can no\nlonger be met in the few-shot setting. To address this situation, we develop a\ngeneral defensive few-shot learning (DFSL) framework to answer the following\ntwo key questions: (1) how to transfer adversarial defense knowledge from one\nsample distribution to another? (2) how to narrow the distribution gap between\nclean and adversarial examples under the few-shot setting? To answer the first\nquestion, we propose an episode-based adversarial training mechanism by\nassuming a task-level distribution consistency to better transfer the\nadversarial defense knowledge. As for the second question, within each few-shot\ntask, we design two kinds of distribution consistency criteria to narrow the\ndistribution gap between clean and adversarial examples from the feature-wise\nand prediction-wise perspectives, respectively. Extensive experiments\ndemonstrate that the proposed framework can effectively make the existing\nfew-shot models robust against adversarial attacks. Code is available at\nhttps://github.com/WenbinLee/DefensiveFSL.git.\n","authors":["Wenbin Li","Lei Wang","Xingxing Zhang","Lei Qi","Jing Huo","Yang Gao","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/1911.06968v2.pdf","comment":"Accepted to IEEE Transactions on Pattern Analysis and Machine\n Intelligence (TPAMI) 2022"},{"id":"http://arxiv.org/abs/2308.13305v1","updated":"2023-08-25T11:07:11Z","published":"2023-08-25T11:07:11Z","title":"Dynamic Residual Classifier for Class Incremental Learning","summary":" The rehearsal strategy is widely used to alleviate the catastrophic\nforgetting problem in class incremental learning (CIL) by preserving limited\nexemplars from previous tasks. With imbalanced sample numbers between old and\nnew classes, the classifier learning can be biased. Existing CIL methods\nexploit the long-tailed (LT) recognition techniques, e.g., the adjusted losses\nand the data re-sampling methods, to handle the data imbalance issue within\neach increment task. In this work, the dynamic nature of data imbalance in CIL\nis shown and a novel Dynamic Residual Classifier (DRC) is proposed to handle\nthis challenging scenario. Specifically, DRC is built upon a recent advance\nresidual classifier with the branch layer merging to handle the model-growing\nproblem. Moreover, DRC is compatible with different CIL pipelines and\nsubstantially improves them. Combining DRC with the model adaptation and fusion\n(MAF) pipeline, this method achieves state-of-the-art results on both the\nconventional CIL and the LT-CIL benchmarks. Extensive experiments are also\nconducted for a detailed analysis. The code is publicly available.\n","authors":["Xiuwei Chen","Xiaobin Chang"],"pdf_url":"https://arxiv.org/pdf/2308.13305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13304v1","updated":"2023-08-25T11:04:35Z","published":"2023-08-25T11:04:35Z","title":"Bang and the Artefacts are Gone! Rapid Artefact Removal and Tissue\n Segmentation in Haematoxylin and Eosin Stained Biopsies","summary":" We present H&E Otsu thresholding, a scheme for rapidly detecting tissue in\nwhole-slide images (WSIs) that eliminates a wide range of undesirable artefacts\nsuch as pen marks and scanning artefacts. Our method involves obtaining a\nbid-modal representation of a low-magnification RGB overview image which\nenables simple Otsu thresholding to separate tissue from background and\nartefacts. We demonstrate our method on WSIs prepared from a wide range of\ninstitutions and WSI digital scanners, each containing substantial artefacts\nthat cause other methods to fail. The beauty of our approach lies in its\nsimplicity: manipulating RGB colour space and using Otsu thresholding allows\nfor the rapid removal of artefacts and segmentation of tissue.\n","authors":["B. A. Schreiber","J. Denholm","F. Jaeckle","M. J. Arends","K. M. Branson","C. -B. Schönlieb","E. J. Soilleux"],"pdf_url":"https://arxiv.org/pdf/2308.13304v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.13300v1","updated":"2023-08-25T10:51:02Z","published":"2023-08-25T10:51:02Z","title":"Learning Compact Neural Networks with Deep Overparameterised Multitask\n Learning","summary":" Compact neural network offers many benefits for real-world applications.\nHowever, it is usually challenging to train the compact neural networks with\nsmall parameter sizes and low computational costs to achieve the same or better\nmodel performance compared to more complex and powerful architecture. This is\nparticularly true for multitask learning, with different tasks competing for\nresources. We present a simple, efficient and effective multitask learning\noverparameterisation neural network design by overparameterising the model\narchitecture in training and sharing the overparameterised model parameters\nmore effectively across tasks, for better optimisation and generalisation.\nExperiments on two challenging multitask datasets (NYUv2 and COCO) demonstrate\nthe effectiveness of the proposed method across various convolutional networks\nand parameter sizes.\n","authors":["Shen Ren","Haosen Shi"],"pdf_url":"https://arxiv.org/pdf/2308.13300v1.pdf","comment":"Accepted for IJCAI2023 workshop, 1st International Workshop on\n Generalizing from Limited Resources in the Open World"},{"id":"http://arxiv.org/abs/2206.09146v3","updated":"2023-08-25T10:48:15Z","published":"2022-06-18T08:06:29Z","title":"A Perceptually Optimized and Self-Calibrated Tone Mapping Operator","summary":" With the increasing popularity and accessibility of high dynamic range (HDR)\nphotography, tone mapping operators (TMOs) for dynamic range compression are\npractically demanding. In this paper, we develop a two-stage neural\nnetwork-based TMO that is self-calibrated and perceptually optimized. In Stage\none, motivated by the physiology of the early stages of the human visual\nsystem, we first decompose an HDR image into a normalized Laplacian pyramid. We\nthen use two lightweight deep neural networks (DNNs), taking the normalized\nrepresentation as input and estimating the Laplacian pyramid of the\ncorresponding LDR image. We optimize the tone mapping network by minimizing the\nnormalized Laplacian pyramid distance (NLPD), a perceptual metric aligning with\nhuman judgments of tone-mapped image quality. In Stage two, the input HDR image\nis self-calibrated to compute the final LDR image. We feed the same HDR image\nbut rescaled with different maximum luminances to the learned tone mapping\nnetwork, and generate a pseudo-multi-exposure image stack with different detail\nvisibility and color saturation. We then train another lightweight DNN to fuse\nthe LDR image stack into a desired LDR image by maximizing a variant of the\nstructural similarity index for multi-exposure image fusion (MEF-SSIM), which\nhas been proven perceptually relevant to fused image quality. The proposed\nself-calibration mechanism through MEF enables our TMO to accept uncalibrated\nHDR images, while being physiology-driven. Extensive experiments show that our\nmethod produces images with consistently better visual quality. Additionally,\nsince our method builds upon three lightweight DNNs, it is among the fastest\nlocal TMOs.\n","authors":["Peibei Cao","Chenyang Le","Yuming Fang","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2206.09146v3.pdf","comment":"15 pages,17 figures"},{"id":"http://arxiv.org/abs/2212.07585v2","updated":"2023-08-25T10:39:23Z","published":"2022-12-15T02:25:22Z","title":"Rethinking the Role of Pre-Trained Networks in Source-Free Domain\n Adaptation","summary":" Source-free domain adaptation (SFDA) aims to adapt a source model trained on\na fully-labeled source domain to an unlabeled target domain. Large-data\npre-trained networks are used to initialize source models during source\ntraining, and subsequently discarded. However, source training can cause the\nmodel to overfit to source data distribution and lose applicable target domain\nknowledge. We propose to integrate the pre-trained network into the target\nadaptation process as it has diversified features important for generalization\nand provides an alternate view of features and classification decisions\ndifferent from the source model. We propose to distil useful target domain\ninformation through a co-learning strategy to improve target pseudolabel\nquality for finetuning the source model. Evaluation on 4 benchmark datasets\nshow that our proposed strategy improves adaptation performance and can be\nsuccessfully integrated with existing SFDA methods. Leveraging modern\npre-trained networks that have stronger representation learning ability in the\nco-learning strategy further boosts performance.\n","authors":["Wenyu Zhang","Li Shen","Chuan-Sheng Foo"],"pdf_url":"https://arxiv.org/pdf/2212.07585v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13286v1","updated":"2023-08-25T10:22:13Z","published":"2023-08-25T10:22:13Z","title":"Unsupervised Domain Adaptation for Anatomical Landmark Detection","summary":" Recently, anatomical landmark detection has achieved great progresses on\nsingle-domain data, which usually assumes training and test sets are from the\nsame domain. However, such an assumption is not always true in practice, which\ncan cause significant performance drop due to domain shift. To tackle this\nproblem, we propose a novel framework for anatomical landmark detection under\nthe setting of unsupervised domain adaptation (UDA), which aims to transfer the\nknowledge from labeled source domain to unlabeled target domain. The framework\nleverages self-training and domain adversarial learning to address the domain\ngap during adaptation. Specifically, a self-training strategy is proposed to\nselect reliable landmark-level pseudo-labels of target domain data with dynamic\nthresholds, which makes the adaptation more effective. Furthermore, a domain\nadversarial learning module is designed to handle the unaligned data\ndistributions of two domains by learning domain-invariant features via\nadversarial training. Our experiments on cephalometric and lung landmark\ndetection show the effectiveness of the method, which reduces the domain gap by\na large margin and outperforms other UDA methods consistently. The code is\navailable at https://github.com/jhb86253817/UDA_Med_Landmark.\n","authors":["Haibo Jin","Haoxuan Che","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2308.13286v1.pdf","comment":"Accepted to MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.13273v1","updated":"2023-08-25T09:51:03Z","published":"2023-08-25T09:51:03Z","title":"Bridging the Gap: Fine-to-Coarse Sketch Interpolation Network for\n High-Quality Animation Sketch Inbetweening","summary":" The 2D animation workflow is typically initiated with the creation of\nkeyframes using sketch-based drawing. Subsequent inbetweens (i.e., intermediate\nsketch frames) are crafted through manual interpolation for smooth animations,\nwhich is a labor-intensive process. Thus, the prospect of automatic animation\nsketch interpolation has become highly appealing. However, existing video\ninterpolation methods are generally hindered by two key issues for sketch\ninbetweening: 1) limited texture and colour details in sketches, and 2)\nexaggerated alterations between two sketch keyframes. To overcome these issues,\nwe propose a novel deep learning method, namely Fine-to-Coarse Sketch\nInterpolation Network (FC-SIN). This approach incorporates multi-level guidance\nthat formulates region-level correspondence, sketch-level correspondence and\npixel-level dynamics. A multi-stream U-Transformer is then devised to\ncharacterize sketch inbewteening patterns using these multi-level guides\nthrough the integration of both self-attention and cross-attention mechanisms.\nAdditionally, to facilitate future research on animation sketch inbetweening,\nwe constructed a large-scale dataset - STD-12K, comprising 30 sketch animation\nseries in diverse artistic styles. Comprehensive experiments on this dataset\nconvincingly show that our proposed FC-SIN surpasses the state-of-the-art\ninterpolation methods. Our code and dataset will be publicly available.\n","authors":["Jiaming Shen","Kun Hu","Wei Bao","Chang Wen Chen","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13273v1.pdf","comment":"7pages,6figures"},{"id":"http://arxiv.org/abs/2308.13270v1","updated":"2023-08-25T09:44:45Z","published":"2023-08-25T09:44:45Z","title":"A Game of Bundle Adjustment -- Learning Efficient Convergence","summary":" Bundle adjustment is the common way to solve localization and mapping. It is\nan iterative process in which a system of non-linear equations is solved using\ntwo optimization methods, weighted by a damping factor. In the classic\napproach, the latter is chosen heuristically by the Levenberg-Marquardt\nalgorithm on each iteration. This might take many iterations, making the\nprocess computationally expensive, which might be harmful to real-time\napplications. We propose to replace this heuristic by viewing the problem in a\nholistic manner, as a game, and formulating it as a reinforcement-learning\ntask. We set an environment which solves the non-linear equations and train an\nagent to choose the damping factor in a learned manner. We demonstrate that our\napproach considerably reduces the number of iterations required to reach the\nbundle adjustment's convergence, on both synthetic and real-life scenarios. We\nshow that this reduction benefits the classic approach and can be integrated\nwith other bundle adjustment acceleration methods.\n","authors":["Amir Belder","Refael Vivanti","Ayellet Tal"],"pdf_url":"https://arxiv.org/pdf/2308.13270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13266v1","updated":"2023-08-25T09:37:51Z","published":"2023-08-25T09:37:51Z","title":"Integrating Boxes and Masks: A Multi-Object Framework for Unified Visual\n Tracking and Segmentation","summary":" Tracking any given object(s) spatially and temporally is a common purpose in\nVisual Object Tracking (VOT) and Video Object Segmentation (VOS). Joint\ntracking and segmentation have been attempted in some studies but they often\nlack full compatibility of both box and mask in initialization and prediction,\nand mainly focus on single-object scenarios. To address these limitations, this\npaper proposes a Multi-object Mask-box Integrated framework for unified\nTracking and Segmentation, dubbed MITS. Firstly, the unified identification\nmodule is proposed to support both box and mask reference for initialization,\nwhere detailed object information is inferred from boxes or directly retained\nfrom masks. Additionally, a novel pinpoint box predictor is proposed for\naccurate multi-object box prediction, facilitating target-oriented\nrepresentation learning. All target objects are processed simultaneously from\nencoding to propagation and decoding, as a unified pipeline for VOT and VOS.\nExperimental results show MITS achieves state-of-the-art performance on both\nVOT and VOS benchmarks. Notably, MITS surpasses the best prior VOT competitor\nby around 6% on the GOT-10k test set, and significantly improves the\nperformance of box initialization on VOS benchmarks. The code is available at\nhttps://github.com/yoxu515/MITS.\n","authors":["Yuanyou Xu","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13266v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2305.00976v2","updated":"2023-08-25T09:35:46Z","published":"2023-05-02T17:52:41Z","title":"TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion\n Synthesis","summary":" In this paper, we present TMR, a simple yet effective approach for text to 3D\nhuman motion retrieval. While previous work has only treated retrieval as a\nproxy evaluation metric, we tackle it as a standalone task. Our method extends\nthe state-of-the-art text-to-motion synthesis model TEMOS, and incorporates a\ncontrastive loss to better structure the cross-modal latent space. We show that\nmaintaining the motion generation loss, along with the contrastive training, is\ncrucial to obtain good performance. We introduce a benchmark for evaluation and\nprovide an in-depth analysis by reporting results on several protocols. Our\nextensive experiments on the KIT-ML and HumanML3D datasets show that TMR\noutperforms the prior work by a significant margin, for example reducing the\nmedian rank from 54 to 19. Finally, we showcase the potential of our approach\non moment retrieval. Our code and models are publicly available at\nhttps://mathis.petrovich.fr/tmr.\n","authors":["Mathis Petrovich","Michael J. Black","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2305.00976v2.pdf","comment":"ICCV 2023 Camera Ready, project page:\n https://mathis.petrovich.fr/tmr/"},{"id":"http://arxiv.org/abs/2308.13252v1","updated":"2023-08-25T08:59:03Z","published":"2023-08-25T08:59:03Z","title":"Kissing to Find a Match: Efficient Low-Rank Permutation Representation","summary":" Permutation matrices play a key role in matching and assignment problems\nacross the fields, especially in computer vision and robotics. However, memory\nfor explicitly representing permutation matrices grows quadratically with the\nsize of the problem, prohibiting large problem instances. In this work, we\npropose to tackle the curse of dimensionality of large permutation matrices by\napproximating them using low-rank matrix factorization, followed by a\nnonlinearity. To this end, we rely on the Kissing number theory to infer the\nminimal rank required for representing a permutation matrix of a given size,\nwhich is significantly smaller than the problem size. This leads to a drastic\nreduction in computation and memory costs, e.g., up to $3$ orders of magnitude\nless memory for a problem of size $n=20000$, represented using $8.4\\times10^5$\nelements in two small matrices instead of using a single huge matrix with\n$4\\times 10^8$ elements. The proposed representation allows for accurate\nrepresentations of large permutation matrices, which in turn enables handling\nlarge problems that would have been infeasible otherwise. We demonstrate the\napplicability and merits of the proposed approach through a series of\nexperiments on a range of problems that involve predicting permutation\nmatrices, from linear and quadratic assignment to shape matching problems.\n","authors":["Hannah Dröge","Zorah Lähner","Yuval Bahat","Onofre Martorell","Felix Heide","Michael Möller"],"pdf_url":"https://arxiv.org/pdf/2308.13252v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.13245v1","updated":"2023-08-25T08:37:55Z","published":"2023-08-25T08:37:55Z","title":"Unpaired Multi-domain Attribute Translation of 3D Facial Shapes with a\n Square and Symmetric Geometric Map","summary":" While impressive progress has recently been made in image-oriented facial\nattribute translation, shape-oriented 3D facial attribute translation remains\nan unsolved issue. This is primarily limited by the lack of 3D generative\nmodels and ineffective usage of 3D facial data. We propose a learning framework\nfor 3D facial attribute translation to relieve these limitations. Firstly, we\ncustomize a novel geometric map for 3D shape representation and embed it in an\nend-to-end generative adversarial network. The geometric map represents 3D\nshapes symmetrically on a square image grid, while preserving the neighboring\nrelationship of 3D vertices in a local least-square sense. This enables\neffective learning for the latent representation of data with different\nattributes. Secondly, we employ a unified and unpaired learning framework for\nmulti-domain attribute translation. It not only makes effective usage of data\ncorrelation from multiple domains, but also mitigates the constraint for hardly\naccessible paired data. Finally, we propose a hierarchical architecture for the\ndiscriminator to guarantee robust results against both global and local\nartifacts. We conduct extensive experiments to demonstrate the advantage of the\nproposed framework over the state-of-the-art in generating high-fidelity facial\nshapes. Given an input 3D facial shape, the proposed framework is able to\nsynthesize novel shapes of different attributes, which covers some downstream\napplications, such as expression transfer, gender translation, and aging. Code\nat https://github.com/NaughtyZZ/3D_facial_shape_attribute_translation_ssgmap.\n","authors":["Zhenfeng Fan","Zhiheng Zhang","Shuang Yang","Chongyang Zhong","Min Cao","Shihong Xia"],"pdf_url":"https://arxiv.org/pdf/2308.13245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13236v1","updated":"2023-08-25T08:06:48Z","published":"2023-08-25T08:06:48Z","title":"Black-box Unsupervised Domain Adaptation with Bi-directional\n Atkinson-Shiffrin Memory","summary":" Black-box unsupervised domain adaptation (UDA) learns with source predictions\nof target data without accessing either source data or source models during\ntraining, and it has clear superiority in data privacy and flexibility in\ntarget network selection. However, the source predictions of target data are\noften noisy and training with them is prone to learning collapses. We propose\nBiMem, a bi-directional memorization mechanism that learns to remember useful\nand representative information to correct noisy pseudo labels on the fly,\nleading to robust black-box UDA that can generalize across different visual\nrecognition tasks. BiMem constructs three types of memory, including sensory\nmemory, short-term memory, and long-term memory, which interact in a\nbi-directional manner for comprehensive and robust memorization of learnt\nfeatures. It includes a forward memorization flow that identifies and stores\nuseful features and a backward calibration flow that rectifies features' pseudo\nlabels progressively. Extensive experiments show that BiMem achieves superior\ndomain adaptation performance consistently across various visual recognition\ntasks such as image classification, semantic segmentation and object detection.\n","authors":["Jingyi Zhang","Jiaxing Huang","Xueying Jiang","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2308.13236v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2308.12673v2","updated":"2023-08-25T08:03:28Z","published":"2023-08-24T09:31:02Z","title":"Masked Feature Modelling: Feature Masking for the Unsupervised\n Pre-training of a Graph Attention Network Block for Bottom-up Video Event\n Recognition","summary":" In this paper, we introduce Masked Feature Modelling (MFM), a novel approach\nfor the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM\nutilizes a pretrained Visual Tokenizer to reconstruct masked features of\nobjects within a video, leveraging the MiniKinetics dataset. We then\nincorporate the pre-trained GAT block into a state-of-the-art bottom-up\nsupervised video-event recognition architecture, ViGAT, to improve the model's\nstarting point and overall accuracy. Experimental evaluations on the YLI-MED\ndataset demonstrate the effectiveness of MFM in improving event recognition\nperformance.\n","authors":["Dimitrios Daskalakis","Nikolaos Gkalelis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2308.12673v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.13229v1","updated":"2023-08-25T08:02:04Z","published":"2023-08-25T08:02:04Z","title":"ReST: A Reconfigurable Spatial-Temporal Graph Model for Multi-Camera\n Multi-Object Tracking","summary":" Multi-Camera Multi-Object Tracking (MC-MOT) utilizes information from\nmultiple views to better handle problems with occlusion and crowded scenes.\nRecently, the use of graph-based approaches to solve tracking problems has\nbecome very popular. However, many current graph-based methods do not\neffectively utilize information regarding spatial and temporal consistency.\nInstead, they rely on single-camera trackers as input, which are prone to\nfragmentation and ID switch errors. In this paper, we propose a novel\nreconfigurable graph model that first associates all detected objects across\ncameras spatially before reconfiguring it into a temporal graph for Temporal\nAssociation. This two-stage association approach enables us to extract robust\nspatial and temporal-aware features and address the problem with fragmented\ntracklets. Furthermore, our model is designed for online tracking, making it\nsuitable for real-world applications. Experimental results show that the\nproposed graph model is able to extract more discriminating features for object\ntracking, and our model achieves state-of-the-art performance on several public\ndatasets.\n","authors":["Cheng-Che Cheng","Min-Xuan Qiu","Chen-Kuo Chiang","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2308.13229v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.13802v2","updated":"2023-08-25T07:59:21Z","published":"2023-05-23T08:15:02Z","title":"Online Open-set Semi-supervised Object Detection by Valuable Instances\n Mining","summary":" Open-set semi-supervised object detection (OSSOD) leverages practical\nopen-set unlabeled datasets with out-of-distribution (OOD) instances for\nsemi-supervised object detection (SSOD). The main challenge in OSSOD is\ndistinguishing and filtering the OOD instances (i.e., outliers) from\nin-distribution (ID) instances during pseudo-labeling. The only OSSOD work\nemploys an additional offline OOD detection network trained solely with labeled\ndata for solving this problem. However, the limited training data restricts the\npotential for improvement. Meanwhile, the offline strategy results in low\nefficiency. To alleviate these issues, this paper proposes an end-to-end online\nOSSOD framework that improves performance and efficiency: 1) We propose a\nsemi-supervised outlier filtering method that more effectively filters the OOD\ninstances by using both labeled and unlabeled data. 2) We propose a\nthreshold-free Dual Competing OOD head that further improves the performance by\nsuppressing the mispredictions during semi-supervised outlier filtering. 3) Our\nproposed method is an online end-to-end trainable OSSOD framework. Experimental\nresults show that our method achieves state-of-the-art performance on several\nOSSOD benchmarks compared to existing methods. Moreover, additional experiments\nshow that our method can be easily applied to different SSOD frameworks.\n","authors":["Zerun Wang","Ling Xiao","Liuyu Xiang","Zhaotian Weng","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2305.13802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13225v1","updated":"2023-08-25T07:50:59Z","published":"2023-08-25T07:50:59Z","title":"DPF-Net: Combining Explicit Shape Priors in Deformable Primitive Field\n for Unsupervised Structural Reconstruction of 3D Objects","summary":" Unsupervised methods for reconstructing structures face significant\nchallenges in capturing the geometric details with consistent structures among\ndiverse shapes of the same category. To address this issue, we present a novel\nunsupervised structural reconstruction method, named DPF-Net, based on a new\nDeformable Primitive Field (DPF) representation, which allows for high-quality\nshape reconstruction using parameterized geometric primitives. We design a\ntwo-stage shape reconstruction pipeline which consists of a primitive\ngeneration module and a primitive deformation module to approximate the target\nshape of each part progressively. The primitive generation module estimates the\nexplicit orientation, position, and size parameters of parameterized geometric\nprimitives, while the primitive deformation module predicts a dense deformation\nfield based on a parameterized primitive field to recover shape details. The\nstrong shape prior encoded in parameterized geometric primitives enables our\nDPF-Net to extract high-level structures and recover fine-grained shape details\nconsistently. The experimental results on three categories of objects in\ndiverse shapes demonstrate the effectiveness and generalization ability of our\nDPF-Net on structural reconstruction and shape segmentation.\n","authors":["Qingyao Shuai","Chi Zhang","Kaizhi Yang","Xuejin Chen"],"pdf_url":"https://arxiv.org/pdf/2308.13225v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.13223v1","updated":"2023-08-25T07:39:26Z","published":"2023-08-25T07:39:26Z","title":"EfficientDreamer: High-Fidelity and Robust 3D Creation via\n Orthogonal-view Diffusion Prior","summary":" While the image diffusion model has made significant strides in text-driven\n3D content creation, it often falls short in accurately capturing the intended\nmeaning of the text prompt, particularly with respect to direction information.\nThis shortcoming gives rise to the Janus problem, where multi-faced 3D models\nare produced with the guidance of such diffusion models. In this paper, we\npresent a robust pipeline for generating high-fidelity 3D content with\northogonal-view image guidance. Specifically, we introduce a novel 2D diffusion\nmodel that generates an image consisting of four orthogonal-view sub-images for\nthe given text prompt. The 3D content is then created with this diffusion\nmodel, which enhances 3D consistency and provides strong structured semantic\npriors. This addresses the infamous Janus problem and significantly promotes\ngeneration efficiency. Additionally, we employ a progressive 3D synthesis\nstrategy that results in substantial improvement in the quality of the created\n3D contents. Both quantitative and qualitative evaluations show that our method\ndemonstrates a significant improvement over previous text-to-3D techniques.\n","authors":["Minda Zhao","Chaoyi Zhao","Xinyue Liang","Lincheng Li","Zeng Zhao","Zhipeng Hu","Changjie Fan","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2308.13223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03538v4","updated":"2023-08-25T07:34:42Z","published":"2023-06-06T09:35:56Z","title":"SDR-GAIN: A High Real-Time Occluded Pedestrian Pose Completion Method\n for Autonomous Driving","summary":" To mitigate the challenges arising from partial occlusion in human pose\nkeypoint based pedestrian detection methods , we present a novel pedestrian\npose keypoint completion method called the separation and dimensionality\nreduction-based generative adversarial imputation networks (SDR-GAIN) .\nFirstly, we utilize OpenPose to estimate pedestrian poses in images. Then, we\nisolate the head and torso keypoints of pedestrians with incomplete keypoints\ndue to occlusion or other factors and perform dimensionality reduction to\nenhance features and further unify feature distribution. Finally, we introduce\ntwo generative models based on the generative adversarial networks (GAN)\nframework, which incorporate Huber loss, residual structure, and L1\nregularization to generate missing parts of the incomplete head and torso pose\nkeypoints of partially occluded pedestrians, resulting in pose completion. Our\nexperiments on MS COCO and JAAD datasets demonstrate that SDR-GAIN outperforms\nbasic GAIN framework, interpolation methods PCHIP and MAkima, machine learning\nmethods k-NN and MissForest in terms of pose completion task. Furthermore, the\nSDR-GAIN algorithm exhibits a remarkably short running time of approximately\n0.4ms and boasts exceptional real-time performance. As such, it holds\nsignificant practical value in the domain of autonomous driving, wherein high\nsystem response speeds are of paramount importance. Specifically, it excels at\nrapidly and precisely capturing human pose key points, thus enabling an\nexpanded range of applications for pedestrian detection tasks based on pose key\npoints, including but not limited to pedestrian behavior recognition and\nprediction.\n","authors":["Honghao Fu","Libo Sun","Yilang Shen","Yiwen Wu"],"pdf_url":"https://arxiv.org/pdf/2306.03538v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13218v1","updated":"2023-08-25T07:32:34Z","published":"2023-08-25T07:32:34Z","title":"MultiCapCLIP: Auto-Encoding Prompts for Zero-Shot Multilingual Visual\n Captioning","summary":" Supervised visual captioning models typically require a large scale of images\nor videos paired with descriptions in a specific language (i.e., the\nvision-caption pairs) for training. However, collecting and labeling\nlarge-scale datasets is time-consuming and expensive for many scenarios and\nlanguages. Therefore, sufficient labeled pairs are usually not available. To\ndeal with the label shortage problem, we present a simple yet effective\nzero-shot approach MultiCapCLIP that can generate visual captions for different\nscenarios and languages without any labeled vision-caption pairs of downstream\ndatasets. In the training stage, MultiCapCLIP only requires text data for\ninput. Then it conducts two main steps: 1) retrieving concept prompts that\npreserve the corresponding domain knowledge of new scenarios; 2) auto-encoding\nthe prompts to learn writing styles to output captions in a desired language.\nIn the testing stage, MultiCapCLIP instead takes visual data as input directly\nto retrieve the concept prompts to generate the final visual descriptions. The\nextensive experiments on image and video captioning across four benchmarks and\nfour languages (i.e., English, Chinese, German, and French) confirm the\neffectiveness of our approach. Compared with state-of-the-art zero-shot and\nweakly-supervised methods, our method achieves 4.8% and 21.5% absolute\nimprovements in terms of BLEU@4 and CIDEr metrics. Our code is available at\nhttps://github.com/yangbang18/MultiCapCLIP.\n","authors":["Bang Yang","Fenglin Liu","Xian Wu","Yaowei Wang","Xu Sun","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2308.13218v1.pdf","comment":"ACL'2023, 13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2203.01923v5","updated":"2023-08-25T07:30:32Z","published":"2022-03-03T18:56:08Z","title":"Recovering 3D Human Mesh from Monocular Images: A Survey","summary":" Estimating human pose and shape from monocular images is a long-standing\nproblem in computer vision. Since the release of statistical body models, 3D\nhuman mesh recovery has been drawing broader attention. With the same goal of\nobtaining well-aligned and physically plausible mesh results, two paradigms\nhave been developed to overcome challenges in the 2D-to-3D lifting process: i)\nan optimization-based paradigm, where different data terms and regularization\nterms are exploited as optimization objectives; and ii) a regression-based\nparadigm, where deep learning techniques are embraced to solve the problem in\nan end-to-end fashion. Meanwhile, continuous efforts are devoted to improving\nthe quality of 3D mesh labels for a wide range of datasets. Though remarkable\nprogress has been achieved in the past decade, the task is still challenging\ndue to flexible body motions, diverse appearances, complex environments, and\ninsufficient in-the-wild annotations. To the best of our knowledge, this is the\nfirst survey that focuses on the task of monocular 3D human mesh recovery. We\nstart with the introduction of body models and then elaborate recovery\nframeworks and training objectives by providing in-depth analyses of their\nstrengths and weaknesses. We also summarize datasets, evaluation metrics, and\nbenchmark results. Open issues and future directions are discussed in the end,\nhoping to motivate researchers and facilitate their research in this area. A\nregularly updated project page can be found at\nhttps://github.com/tinatiansjz/hmr-survey.\n","authors":["Yating Tian","Hongwen Zhang","Yebin Liu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2203.01923v5.pdf","comment":"Published in IEEE TPAMI, Survey on monocular 3D human mesh recovery,\n Project page: https://github.com/tinatiansjz/hmr-survey"},{"id":"http://arxiv.org/abs/2308.13217v1","updated":"2023-08-25T07:30:18Z","published":"2023-08-25T07:30:18Z","title":"GEMTrans: A General, Echocardiography-based, Multi-Level Transformer\n Framework for Cardiovascular Diagnosis","summary":" Echocardiography (echo) is an ultrasound imaging modality that is widely used\nfor various cardiovascular diagnosis tasks. Due to inter-observer variability\nin echo-based diagnosis, which arises from the variability in echo image\nacquisition and the interpretation of echo images based on clinical experience,\nvision-based machine learning (ML) methods have gained popularity to act as\nsecondary layers of verification. For such safety-critical applications, it is\nessential for any proposed ML method to present a level of explainability along\nwith good accuracy. In addition, such methods must be able to process several\necho videos obtained from various heart views and the interactions among them\nto properly produce predictions for a variety of cardiovascular measurements or\ninterpretation tasks. Prior work lacks explainability or is limited in scope by\nfocusing on a single cardiovascular task. To remedy this, we propose a General,\nEcho-based, Multi-Level Transformer (GEMTrans) framework that provides\nexplainability, while simultaneously enabling multi-video training where the\ninter-play among echo image patches in the same frame, all frames in the same\nvideo, and inter-video relationships are captured based on a downstream task.\nWe show the flexibility of our framework by considering two critical tasks\nincluding ejection fraction (EF) and aortic stenosis (AS) severity detection.\nOur model achieves mean absolute errors of 4.15 and 4.84 for single and\ndual-video EF estimation and an accuracy of 96.5 % for AS detection, while\nproviding informative task-specific attention maps and prototypical\nexplainability.\n","authors":["Masoud Mokhtari","Neda Ahmadi","Teresa S. M. Tsang","Purang Abolmaesumi","Renjie Liao"],"pdf_url":"https://arxiv.org/pdf/2308.13217v1.pdf","comment":"To be published in MLMI 2023"},{"id":"http://arxiv.org/abs/2308.13204v1","updated":"2023-08-25T06:59:26Z","published":"2023-08-25T06:59:26Z","title":"Self-supervised learning for hotspot detection and isolation from\n thermal images","summary":" Hotspot detection using thermal imaging has recently become essential in\nseveral industrial applications, such as security applications, health\napplications, and equipment monitoring applications. Hotspot detection is of\nutmost importance in industrial safety where equipment can develop anomalies.\nHotspots are early indicators of such anomalies. We address the problem of\nhotspot detection in thermal images by proposing a self-supervised learning\napproach. Self-supervised learning has shown potential as a competitive\nalternative to their supervised learning counterparts but their application to\nthermography has been limited. This has been due to lack of diverse data\navailability, domain specific pre-trained models, standardized benchmarks, etc.\nWe propose a self-supervised representation learning approach followed by\nfine-tuning that improves detection of hotspots by classification. The SimSiam\nnetwork based ensemble classifier decides whether an image contains hotspots or\nnot. Detection of hotspots is followed by precise hotspot isolation. By doing\nso, we are able to provide a highly accurate and precise hotspot\nidentification, applicable to a wide range of applications. We created a novel\nlarge thermal image dataset to address the issue of paucity of easily\naccessible thermal images. Our experiments with the dataset created by us and a\npublicly available segmentation dataset show the potential of our approach for\nhotspot detection and its ability to isolate hotspots with high accuracy. We\nachieve a Dice Coefficient of 0.736, the highest when compared with existing\nhotspot identification techniques. Our experiments also show self-supervised\nlearning as a strong contender of supervised learning, providing competitive\nmetrics for hotspot detection, with the highest accuracy of our approach being\n97%.\n","authors":["Shreyas Goyal","Jagath C. Rajapakse"],"pdf_url":"https://arxiv.org/pdf/2308.13204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11681v2","updated":"2023-08-25T06:55:14Z","published":"2023-08-22T14:58:36Z","title":"VadCLIP: Adapting Vision-Language Models for Weakly Supervised Video\n Anomaly Detection","summary":" The recent contrastive language-image pre-training (CLIP) model has shown\ngreat success in a wide range of image-level tasks, revealing remarkable\nability for learning powerful visual representations with rich semantics. An\nopen and worthwhile problem is efficiently adapting such a strong model to the\nvideo domain and designing a robust video anomaly detector. In this work, we\npropose VadCLIP, a new paradigm for weakly supervised video anomaly detection\n(WSVAD) by leveraging the frozen CLIP model directly without any pre-training\nand fine-tuning process. Unlike current works that directly feed extracted\nfeatures into the weakly supervised classifier for frame-level binary\nclassification, VadCLIP makes full use of fine-grained associations between\nvision and language on the strength of CLIP and involves dual branch. One\nbranch simply utilizes visual features for coarse-grained binary\nclassification, while the other fully leverages the fine-grained language-image\nalignment. With the benefit of dual branch, VadCLIP achieves both\ncoarse-grained and fine-grained video anomaly detection by transferring\npre-trained knowledge from CLIP to WSVAD task. We conduct extensive experiments\non two commonly-used benchmarks, demonstrating that VadCLIP achieves the best\nperformance on both coarse-grained and fine-grained WSVAD, surpassing the\nstate-of-the-art methods by a large margin. Specifically, VadCLIP achieves\n84.51% AP and 88.02% AUC on XD-Violence and UCF-Crime, respectively. Code and\nfeatures will be released to facilitate future VAD research.\n","authors":["Peng Wu","Xuerong Zhou","Guansong Pang","Lingru Zhou","Qingsen Yan","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11681v2.pdf","comment":"Submitted"},{"id":"http://arxiv.org/abs/2308.13201v1","updated":"2023-08-25T06:45:02Z","published":"2023-08-25T06:45:02Z","title":"Deep Active Audio Feature Learning in Resource-Constrained Environments","summary":" The scarcity of labelled data makes training Deep Neural Network (DNN) models\nin bioacoustic applications challenging. In typical bioacoustics applications,\nmanually labelling the required amount of data can be prohibitively expensive.\nTo effectively identify both new and current classes, DNN models must continue\nto learn new features from a modest amount of fresh data. Active Learning (AL)\nis an approach that can help with this learning while requiring little\nlabelling effort. Nevertheless, the use of fixed feature extraction approaches\nlimits feature quality, resulting in underutilization of the benefits of AL. We\ndescribe an AL framework that addresses this issue by incorporating feature\nextraction into the AL loop and refining the feature extractor after each round\nof manual annotation. In addition, we use raw audio processing rather than\nspectrograms, which is a novel approach. Experiments reveal that the proposed\nAL framework requires 14.3%, 66.7%, and 47.4% less labelling effort on\nbenchmark audio datasets ESC-50, UrbanSound8k, and InsectWingBeat,\nrespectively, for a large DNN model and similar savings on a\nmicrocontroller-based counterpart. Furthermore, we showcase the practical\nrelevance of our study by incorporating data from conservation biology\nprojects.\n","authors":["Md Mohaimenuzzaman","Christoph Bergmeir","Bernd Meyer"],"pdf_url":"https://arxiv.org/pdf/2308.13201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11898v2","updated":"2023-08-25T06:38:23Z","published":"2023-08-23T03:46:04Z","title":"Exploring the Optimization Objective of One-Class Classification for\n Anomaly Detection","summary":" One-class classification (OCC) is a longstanding method for anomaly\ndetection. With the powerful representation capability of the pre-trained\nbackbone, OCC methods have witnessed significant performance improvements.\nTypically, most of these OCC methods employ transfer learning to enhance the\ndiscriminative nature of the pre-trained backbone's features, thus achieving\nremarkable efficacy. While most current approaches emphasize feature transfer\nstrategies, we argue that the optimization objective space within OCC methods\ncould also be an underlying critical factor influencing performance. In this\nwork, we conducted a thorough investigation into the optimization objective of\nOCC. Through rigorous theoretical analysis and derivation, we unveil a key\ninsights: any space with the suitable norm can serve as an equivalent\nsubstitute for the hypersphere center, without relying on the distribution\nassumption of training samples. Further, we provide guidelines for determining\nthe feasible domain of norms for the OCC optimization objective. This novel\ninsight sparks a simple and data-agnostic deep one-class classification method.\nOur method is straightforward, with a single 1x1 convolutional layer as a\ntrainable projector and any space with suitable norm as the optimization\nobjective. Extensive experiments validate the reliability and efficacy of our\nfindings and the corresponding methodology, resulting in state-of-the-art\nperformance in both one-class classification and industrial vision anomaly\ndetection and segmentation tasks.\n","authors":["Han Gao","Huiyuan Luo","Fei Shen","Zhengtao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11898v2.pdf","comment":"15 paegs, 10 figures"},{"id":"http://arxiv.org/abs/2303.05699v3","updated":"2023-08-25T06:34:14Z","published":"2023-03-10T04:49:01Z","title":"Feature Unlearning for Pre-trained GANs and VAEs","summary":" We tackle the problem of feature unlearning from a pre-trained image\ngenerative model: GANs and VAEs. Unlike a common unlearning task where an\nunlearning target is a subset of the training set, we aim to unlearn a specific\nfeature, such as hairstyle from facial images, from the pre-trained generative\nmodels. As the target feature is only presented in a local region of an image,\nunlearning the entire image from the pre-trained model may result in losing\nother details in the remaining region of the image. To specify which features\nto unlearn, we collect randomly generated images that contain the target\nfeatures. We then identify a latent representation corresponding to the target\nfeature and then use the representation to fine-tune the pre-trained model.\nThrough experiments on MNIST and CelebA datasets, we show that target features\nare successfully removed while keeping the fidelity of the original models.\nFurther experiments with an adversarial attack show that the unlearned model is\nmore robust under the presence of malicious parties.\n","authors":["Saemi Moon","Seunghyuk Cho","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2303.05699v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07873v4","updated":"2023-08-25T06:01:01Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding and Improving Adversarial\n Transferability from Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v4.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2305.10947v2","updated":"2023-08-25T05:57:08Z","published":"2023-05-18T13:09:45Z","title":"Comparative Study: Standalone IEEE 16-bit Floating-Point for Image\n Classification","summary":" Reducing the number of bits needed to encode the weights and activations of\nneural networks is highly desirable as it speeds up their training and\ninference time while reducing memory consumption. It is unsurprising that\nconsiderable attention has been drawn to developing neural networks that employ\nlower-precision computation. This includes IEEE 16-bit, Google bfloat16, 8-bit,\n4-bit floating-point or fixed-point, 2-bit, and various mixed-precision\nalgorithms. Out of these low-precision formats, IEEE 16-bit stands out due to\nits universal compatibility with contemporary GPUs. This accessibility\ncontrasts with bfloat16, which needs high-end GPUs, or other non-standard\nfewer-bit designs, which typically require software simulation. This study\nfocuses on the widely accessible IEEE 16-bit format for comparative analysis.\nThis analysis involves an in-depth theoretical investigation of the factors\nthat lead to discrepancies between 16-bit and 32-bit models, including a\nformalization of the concepts of floating-point error and tolerance to\nunderstand the conditions under which a 16-bit model can approximate 32-bit\nresults. Contrary to literature that credits the success of noise-tolerated\nneural networks to regularization effects, our study-supported by a series of\nrigorous experiments-provides a quantitative explanation of why standalone IEEE\n16-bit floating-point neural networks can perform on par with 32-bit and\nmixed-precision networks in various image classification tasks. Because no\nprior research has studied IEEE 16-bit as a standalone floating-point precision\nin neural networks, we believe our findings will have significant impacts,\nencouraging the adoption of standalone IEEE 16-bit networks in future neural\nnetwork applications.\n","authors":["Juyoung Yun","Byungkon Kang","Francois Rameau","Zhoulai Fu"],"pdf_url":"https://arxiv.org/pdf/2305.10947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13183v1","updated":"2023-08-25T05:25:01Z","published":"2023-08-25T05:25:01Z","title":"STRIDE: Street View-based Environmental Feature Detection and Pedestrian\n Collision Prediction","summary":" This paper introduces a novel benchmark to study the impact and relationship\nof built environment elements on pedestrian collision prediction, intending to\nenhance environmental awareness in autonomous driving systems to prevent\npedestrian injuries actively. We introduce a built environment detection task\nin large-scale panoramic images and a detection-based pedestrian collision\nfrequency prediction task. We propose a baseline method that incorporates a\ncollision prediction module into a state-of-the-art detection model to tackle\nboth tasks simultaneously. Our experiments demonstrate a significant\ncorrelation between object detection of built environment elements and\npedestrian collision frequency prediction. Our results are a stepping stone\ntowards understanding the interdependencies between built environment\nconditions and pedestrian safety.\n","authors":["Cristina González","Nicolás Ayobi","Felipe Escallón","Laura Baldovino-Chiquillo","Maria Wilches-Mogollón","Donny Pasos","Nicole Ramírez","Jose Pinzón","Olga Sarmiento","D Alex Quistberg","Pablo Arbeláez"],"pdf_url":"https://arxiv.org/pdf/2308.13183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13182v1","updated":"2023-08-25T05:24:23Z","published":"2023-08-25T05:24:23Z","title":"Structural Cycle GAN for Virtual Immunohistochemistry Staining of Gland\n Markers in the Colon","summary":" With the advent of digital scanners and deep learning, diagnostic operations\nmay move from a microscope to a desktop. Hematoxylin and Eosin (H&E) staining\nis one of the most frequently used stains for disease analysis, diagnosis, and\ngrading, but pathologists do need different immunohistochemical (IHC) stains to\nanalyze specific structures or cells. Obtaining all of these stains (H&E and\ndifferent IHCs) on a single specimen is a tedious and time-consuming task.\nConsequently, virtual staining has emerged as an essential research direction.\nHere, we propose a novel generative model, Structural Cycle-GAN (SC-GAN), for\nsynthesizing IHC stains from H&E images, and vice versa. Our method expressly\nincorporates structural information in the form of edges (in addition to color\ndata) and employs attention modules exclusively in the decoder of the proposed\ngenerator model. This integration enhances feature localization and preserves\ncontextual information during the generation process. In addition, a structural\nloss is incorporated to ensure accurate structure alignment between the\ngenerated and input markers. To demonstrate the efficacy of the proposed model,\nexperiments are conducted with two IHC markers emphasizing distinct structures\nof glands in the colon: the nucleus of epithelial cells (CDX2) and the\ncytoplasm (CK818). Quantitative metrics such as FID and SSIM are frequently\nused for the analysis of generative models, but they do not correlate\nexplicitly with higher-quality virtual staining results. Therefore, we propose\ntwo new quantitative metrics that correlate directly with the virtual staining\nspecificity of IHC markers.\n","authors":["Shikha Dubey","Tushar Kataria","Beatrice Knudsen","Shireen Y. Elhabian"],"pdf_url":"https://arxiv.org/pdf/2308.13182v1.pdf","comment":"Accepted to MICCAI Workshop 2023"},{"id":"http://arxiv.org/abs/2306.16556v2","updated":"2023-08-25T05:09:32Z","published":"2023-06-28T20:52:51Z","title":"Inter-Rater Uncertainty Quantification in Medical Image Segmentation via\n Rater-Specific Bayesian Neural Networks","summary":" Automated medical image segmentation inherently involves a certain degree of\nuncertainty. One key factor contributing to this uncertainty is the ambiguity\nthat can arise in determining the boundaries of a target region of interest,\nprimarily due to variations in image appearance. On top of this, even among\nexperts in the field, different opinions can emerge regarding the precise\ndefinition of specific anatomical structures. This work specifically addresses\nthe modeling of segmentation uncertainty, known as inter-rater uncertainty. Its\nprimary objective is to explore and analyze the variability in segmentation\noutcomes that can occur when multiple experts in medical imaging interpret and\nannotate the same images. We introduce a novel Bayesian neural network-based\narchitecture to estimate inter-rater uncertainty in medical image segmentation.\nOur approach has three key advancements. Firstly, we introduce a\none-encoder-multi-decoder architecture specifically tailored for uncertainty\nestimation, enabling us to capture the rater-specific representation of each\nexpert involved. Secondly, we propose Bayesian modeling for the new\narchitecture, allowing efficient capture of the inter-rater distribution,\nparticularly in scenarios with limited annotations. Lastly, we enhance the\nrater-specific representation by integrating an attention module into each\ndecoder. This module facilitates focused and refined segmentation results for\neach rater. We conduct extensive evaluations using synthetic and real-world\ndatasets to validate our technical innovations rigorously. Our method surpasses\nexisting baseline methods in five out of seven diverse tasks on the publicly\navailable \\emph{QUBIQ} dataset, considering two evaluation metrics encompassing\ndifferent uncertainty aspects. Our codes, models, and the new dataset are\navailable through our GitHub repository:\nhttps://github.com/HaoWang420/bOEMD-net .\n","authors":["Qingqiao Hu","Hao Wang","Jing Luo","Yunhao Luo","Zhiheng Zhangg","Jan S. Kirschke","Benedikt Wiestler","Bjoern Menze","Jianguo Zhang","Hongwei Bran Li"],"pdf_url":"https://arxiv.org/pdf/2306.16556v2.pdf","comment":"submitted to a journal for review"},{"id":"http://arxiv.org/abs/2304.06931v2","updated":"2023-08-25T05:07:43Z","published":"2023-04-14T05:32:01Z","title":"Scale Federated Learning for Label Set Mismatch in Medical Image\n Classification","summary":" Federated learning (FL) has been introduced to the healthcare domain as a\ndecentralized learning paradigm that allows multiple parties to train a model\ncollaboratively without privacy leakage. However, most previous studies have\nassumed that every client holds an identical label set. In reality, medical\nspecialists tend to annotate only diseases within their area of expertise or\ninterest. This implies that label sets in each client can be different and even\ndisjoint. In this paper, we propose the framework FedLSM to solve the problem\nof Label Set Mismatch. FedLSM adopts different training strategies on data with\ndifferent uncertainty levels to efficiently utilize unlabeled or partially\nlabeled data as well as class-wise adaptive aggregation in the classification\nlayer to avoid inaccurate aggregation when clients have missing labels. We\nevaluated FedLSM on two public real-world medical image datasets, including\nchest X-ray (CXR) diagnosis with 112,120 CXR images and skin lesion diagnosis\nwith 10,015 dermoscopy images, and showed that it significantly outperformed\nother state-of-the-art FL algorithms. The code can be found at\nhttps://github.com/dzp2095/FedLSM.\n","authors":["Zhipeng Deng","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2304.06931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13178v1","updated":"2023-08-25T05:00:05Z","published":"2023-08-25T05:00:05Z","title":"Self-supervised Scene Text Segmentation with Object-centric Layered\n Representations Augmented by Text Regions","summary":" Text segmentation tasks have a very wide range of application values, such as\nimage editing, style transfer, watermark removal, etc.However, existing public\ndatasets are of poor quality of pixel-level labels that have been shown to be\nnotoriously costly to acquire, both in terms of money and time. At the same\ntime, when pretraining is performed on synthetic datasets, the data\ndistribution of the synthetic datasets is far from the data distribution in the\nreal scene. These all pose a huge challenge to the current pixel-level text\nsegmentation algorithms.To alleviate the above problems, we propose a\nself-supervised scene text segmentation algorithm with layered decoupling of\nrepresentations derived from the object-centric manner to segment images into\ntexts and background. In our method, we propose two novel designs which include\nRegion Query Module and Representation Consistency Constraints adapting to the\nunique properties of text as complements to Auto Encoder, which improves the\nnetwork's sensitivity to texts.For this unique design, we treat the\npolygon-level masks predicted by the text localization model as extra input\ninformation, and neither utilize any pixel-level mask annotations for training\nstage nor pretrain on synthetic datasets.Extensive experiments show the\neffectiveness of the method proposed. On several public scene text datasets,\nour method outperforms the state-of-the-art unsupervised segmentation\nalgorithms.\n","authors":["Yibo Wang","Yunhu Ye","Yuanpeng Mao","Yanwei Yu","Yuanping Song"],"pdf_url":"https://arxiv.org/pdf/2308.13178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13177v1","updated":"2023-08-25T04:54:32Z","published":"2023-08-25T04:54:32Z","title":"How to Evaluate the Generalization of Detection? A Benchmark for\n Comprehensive Open-Vocabulary Detection","summary":" Object detection (OD) in computer vision has made significant progress in\nrecent years, transitioning from closed-set labels to open-vocabulary detection\n(OVD) based on large-scale vision-language pre-training (VLP). However, current\nevaluation methods and datasets are limited to testing generalization over\nobject types and referral expressions, which do not provide a systematic,\nfine-grained, and accurate benchmark of OVD models' abilities. In this paper,\nwe propose a new benchmark named OVDEval, which includes 9 sub-tasks and\nintroduces evaluations on commonsense knowledge, attribute understanding,\nposition understanding, object relation comprehension, and more. The dataset is\nmeticulously created to provide hard negatives that challenge models' true\nunderstanding of visual and linguistic input. Additionally, we identify a\nproblem with the popular Average Precision (AP) metric when benchmarking models\non these fine-grained label datasets and propose a new metric called\nNon-Maximum Suppression Average Precision (NMS-AP) to address this issue.\nExtensive experimental results show that existing top OVD models all fail on\nthe new tasks except for simple object types, demonstrating the value of the\nproposed dataset in pinpointing the weakness of current OVD models and guiding\nfuture research. Furthermore, the proposed NMS-AP metric is verified by\nexperiments to provide a much more truthful evaluation of OVD models, whereas\ntraditional AP metrics yield deceptive results. Data is available at\n\\url{https://github.com/om-ai-lab/OVDEval}\n","authors":["Yiyang Yao","Peng Liu","Tiancheng Zhao","Qianqian Zhang","Jiajia Liao","Chunxin Fang","Kyusong Lee","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13175v1","updated":"2023-08-25T04:52:52Z","published":"2023-08-25T04:52:52Z","title":"GridPull: Towards Scalability in Learning Implicit Representations from\n 3D Point Clouds","summary":" Learning implicit representations has been a widely used solution for surface\nreconstruction from 3D point clouds. The latest methods infer a distance or\noccupancy field by overfitting a neural network on a single point cloud.\nHowever, these methods suffer from a slow inference due to the slow convergence\nof neural networks and the extensive calculation of distances to surface\npoints, which limits them to small scale points. To resolve the scalability\nissue in surface reconstruction, we propose GridPull to improve the efficiency\nof learning implicit representations from large scale point clouds. Our novelty\nlies in the fast inference of a discrete distance field defined on grids\nwithout using any neural components. To remedy the lack of continuousness\nbrought by neural networks, we introduce a loss function to encourage\ncontinuous distances and consistent gradients in the field during pulling\nqueries onto the surface in grids near to the surface. We use uniform grids for\na fast grid search to localize sampled queries, and organize surface points in\na tree structure to speed up the calculation of distances to the surface. We do\nnot rely on learning priors or normal supervision during optimization, and\nachieve superiority over the latest methods in terms of complexity and\naccuracy. We evaluate our method on shape and scene benchmarks, and report\nnumerical and visual comparisons with the latest methods to justify our\neffectiveness and superiority. The code is available at\nhttps://github.com/chenchao15/GridPull.\n","authors":["Chao Chen","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2308.13175v1.pdf","comment":"13pages,14figures. Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.11089v2","updated":"2023-08-25T04:50:47Z","published":"2023-03-20T13:22:04Z","title":"EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation","summary":" Speech-driven 3D face animation aims to generate realistic facial expressions\nthat match the speech content and emotion. However, existing methods often\nneglect emotional facial expressions or fail to disentangle them from speech\ncontent. To address this issue, this paper proposes an end-to-end neural\nnetwork to disentangle different emotions in speech so as to generate rich 3D\nfacial expressions. Specifically, we introduce the emotion disentangling\nencoder (EDE) to disentangle the emotion and content in the speech by\ncross-reconstructed speech signals with different emotion labels. Then an\nemotion-guided feature fusion decoder is employed to generate a 3D talking face\nwith enhanced emotion. The decoder is driven by the disentangled identity,\nemotional, and content embeddings so as to generate controllable personal and\nemotional styles. Finally, considering the scarcity of the 3D emotional talking\nface data, we resort to the supervision of facial blendshapes, which enables\nthe reconstruction of plausible 3D faces from 2D emotional data, and contribute\na large-scale 3D emotional talking face dataset (3D-ETF) to train the network.\nOur experiments and user studies demonstrate that our approach outperforms\nstate-of-the-art methods and exhibits more diverse facial movements. We\nrecommend watching the supplementary video:\nhttps://ziqiaopeng.github.io/emotalk\n","authors":["Ziqiao Peng","Haoyu Wu","Zhenbo Song","Hao Xu","Xiangyu Zhu","Jun He","Hongyan Liu","Zhaoxin Fan"],"pdf_url":"https://arxiv.org/pdf/2303.11089v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13174v1","updated":"2023-08-25T04:49:49Z","published":"2023-08-25T04:49:49Z","title":"Interactive segmentation in aerial images: a new benchmark and an open\n access web-based tool","summary":" In recent years, deep learning has emerged as a powerful approach in remote\nsensing applications, particularly in segmentation and classification\ntechniques that play a crucial role in extracting significant land features\nfrom satellite and aerial imagery. However, only a limited number of papers\nhave discussed the use of deep learning for interactive segmentation in land\ncover classification tasks. In this study, we aim to bridge the gap between\ninteractive segmentation and remote sensing image analysis by conducting a\nbenchmark study on various deep learning-based interactive segmentation models.\nWe assessed the performance of five state-of-the-art interactive segmentation\nmethods (SimpleClick, FocalClick, Iterative Click Loss (ICL), Reviving\nIterative Training with Mask Guidance for Interactive Segmentation (RITM), and\nSegment Anything (SAM)) on two high-resolution aerial imagery datasets. To\nenhance the segmentation results without requiring multiple models, we\nintroduced the Cascade-Forward Refinement (CFR) approach, an innovative\ninference strategy for interactive segmentation. We evaluated these interactive\nsegmentation methods on various land cover types, object sizes, and band\ncombinations in remote sensing. Surprisingly, the popularly discussed method,\nSAM, proved to be ineffective for remote sensing images. Conversely, the\npoint-based approach used in the SimpleClick models consistently outperformed\nthe other methods in all experiments. Building upon these findings, we\ndeveloped a dedicated online tool called RSISeg for interactive segmentation of\nremote sensing data. RSISeg incorporates a well-performing interactive model,\nfine-tuned with remote sensing data. Additionally, we integrated the SAM model\ninto this tool. Compared to existing interactive segmentation tools, RSISeg\noffers strong interactivity, modifiability, and adaptability to remote sensing\ndata.\n","authors":["Zhe Wang","Shoukun Sun","Xiang Que","Xiaogang Ma"],"pdf_url":"https://arxiv.org/pdf/2308.13174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13173v1","updated":"2023-08-25T04:45:37Z","published":"2023-08-25T04:45:37Z","title":"DISGO: Automatic End-to-End Evaluation for Scene Text OCR","summary":" This paper discusses the challenges of optical character recognition (OCR) on\nnatural scenes, which is harder than OCR on documents due to the wild content\nand various image backgrounds. We propose to uniformly use word error rates\n(WER) as a new measurement for evaluating scene-text OCR, both end-to-end (e2e)\nperformance and individual system component performances. Particularly for the\ne2e metric, we name it DISGO WER as it considers Deletion, Insertion,\nSubstitution, and Grouping/Ordering errors. Finally we propose to utilize the\nconcept of super blocks to automatically compute BLEU scores for e2e OCR\nmachine translation. The small SCUT public test set is used to demonstrate WER\nperformance by a modularized OCR system.\n","authors":["Mei-Yuh Hwang","Yangyang Shi","Ankit Ramchandani","Guan Pang","Praveen Krishnan","Lucas Kabela","Frank Seide","Samyak Datta","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13173v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2307.11067v4","updated":"2023-08-25T04:21:57Z","published":"2023-07-20T17:46:21Z","title":"CNOS: A Strong Baseline for CAD-based Novel Object Segmentation","summary":" We propose a simple three-stage approach to segment unseen objects in RGB\nimages using their CAD models. Leveraging recent powerful foundation models,\nDINOv2 and Segment Anything, we create descriptors and generate proposals,\nincluding binary masks for a given input RGB image. By matching proposals with\nreference descriptors created from CAD models, we achieve precise object ID\nassignment along with modal masks. We experimentally demonstrate that our\nmethod achieves state-of-the-art results in CAD-based novel object\nsegmentation, surpassing existing approaches on the seven core datasets of the\nBOP challenge by 19.8% AP using the same BOP evaluation protocol. Our source\ncode is available at https://github.com/nv-nguyen/cnos.\n","authors":["Van Nguyen Nguyen","Thibault Groueix","Georgy Ponimatkin","Vincent Lepetit","Tomas Hodan"],"pdf_url":"https://arxiv.org/pdf/2307.11067v4.pdf","comment":"ICCV 2023, R6D Workshop"},{"id":"http://arxiv.org/abs/2308.13168v1","updated":"2023-08-25T04:14:02Z","published":"2023-08-25T04:14:02Z","title":"IOMatch: Simplifying Open-Set Semi-Supervised Learning with Joint\n Inliers and Outliers Utilization","summary":" Semi-supervised learning (SSL) aims to leverage massive unlabeled data when\nlabels are expensive to obtain. Unfortunately, in many real-world applications,\nthe collected unlabeled data will inevitably contain unseen-class outliers not\nbelonging to any of the labeled classes. To deal with the challenging open-set\nSSL task, the mainstream methods tend to first detect outliers and then filter\nthem out. However, we observe a surprising fact that such approach could result\nin more severe performance degradation when labels are extremely scarce, as the\nunreliable outlier detector may wrongly exclude a considerable portion of\nvaluable inliers. To tackle with this issue, we introduce a novel open-set SSL\nframework, IOMatch, which can jointly utilize inliers and outliers, even when\nit is difficult to distinguish exactly between them. Specifically, we propose\nto employ a multi-binary classifier in combination with the standard closed-set\nclassifier for producing unified open-set classification targets, which regard\nall outliers as a single new class. By adopting these targets as open-set\npseudo-labels, we optimize an open-set classifier with all unlabeled samples\nincluding both inliers and outliers. Extensive experiments have shown that\nIOMatch significantly outperforms the baseline methods across different\nbenchmark datasets and different settings despite its remarkable simplicity.\nOur code and models are available at https://github.com/nukezil/IOMatch.\n","authors":["Zekun Li","Lei Qi","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2308.13168v1.pdf","comment":"Accepted by ICCV 2023, selected for an Oral presentation"},{"id":"http://arxiv.org/abs/2308.13165v1","updated":"2023-08-25T04:06:30Z","published":"2023-08-25T04:06:30Z","title":"Dual Compensation Residual Networks for Class Imbalanced Learning","summary":" Learning generalizable representation and classifier for class-imbalanced\ndata is challenging for data-driven deep models. Most studies attempt to\nre-balance the data distribution, which is prone to overfitting on tail classes\nand underfitting on head classes. In this work, we propose Dual Compensation\nResidual Networks to better fit both tail and head classes. Firstly, we propose\ndual Feature Compensation Module (FCM) and Logit Compensation Module (LCM) to\nalleviate the overfitting issue. The design of these two modules is based on\nthe observation: an important factor causing overfitting is that there is\nsevere feature drift between training and test data on tail classes. In\ndetails, the test features of a tail category tend to drift towards feature\ncloud of multiple similar head categories. So FCM estimates a multi-mode\nfeature drift direction for each tail category and compensate for it.\nFurthermore, LCM translates the deterministic feature drift vector estimated by\nFCM along intra-class variations, so as to cover a larger effective\ncompensation space, thereby better fitting the test features. Secondly, we\npropose a Residual Balanced Multi-Proxies Classifier (RBMC) to alleviate the\nunder-fitting issue. Motivated by the observation that re-balancing strategy\nhinders the classifier from learning sufficient head knowledge and eventually\ncauses underfitting, RBMC utilizes uniform learning with a residual path to\nfacilitate classifier learning. Comprehensive experiments on Long-tailed and\nClass-Incremental benchmarks validate the efficacy of our method.\n","authors":["Ruibing Hou","Hong Chang","Bingpeng Ma","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2308.13165v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.13164v1","updated":"2023-08-25T04:03:41Z","published":"2023-08-25T04:03:41Z","title":"Diff-Retinex: Rethinking Low-light Image Enhancement with A Generative\n Diffusion Model","summary":" In this paper, we rethink the low-light image enhancement task and propose a\nphysically explainable and generative diffusion model for low-light image\nenhancement, termed as Diff-Retinex. We aim to integrate the advantages of the\nphysical model and the generative network. Furthermore, we hope to supplement\nand even deduce the information missing in the low-light image through the\ngenerative network. Therefore, Diff-Retinex formulates the low-light image\nenhancement problem into Retinex decomposition and conditional image\ngeneration. In the Retinex decomposition, we integrate the superiority of\nattention in Transformer and meticulously design a Retinex Transformer\ndecomposition network (TDN) to decompose the image into illumination and\nreflectance maps. Then, we design multi-path generative diffusion networks to\nreconstruct the normal-light Retinex probability distribution and solve the\nvarious degradations in these components respectively, including dark\nillumination, noise, color deviation, loss of scene contents, etc. Owing to\ngenerative diffusion model, Diff-Retinex puts the restoration of low-light\nsubtle detail into practice. Extensive experiments conducted on real-world\nlow-light datasets qualitatively and quantitatively demonstrate the\neffectiveness, superiority, and generalization of the proposed method.\n","authors":["Xunpeng Yi","Han Xu","Hao Zhang","Linfeng Tang","Jiayi Ma"],"pdf_url":"https://arxiv.org/pdf/2308.13164v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.05983v3","updated":"2023-08-25T03:20:12Z","published":"2023-08-11T07:38:46Z","title":"Face Encryption via Frequency-Restricted Identity-Agnostic Attacks","summary":" Billions of people are sharing their daily live images on social media\neveryday. However, malicious collectors use deep face recognition systems to\neasily steal their biometric information (e.g., faces) from these images. Some\nstudies are being conducted to generate encrypted face photos using adversarial\nattacks by introducing imperceptible perturbations to reduce face information\nleakage. However, existing studies need stronger black-box scenario feasibility\nand more natural visual appearances, which challenge the feasibility of privacy\nprotection. To address these problems, we propose a frequency-restricted\nidentity-agnostic (FRIA) framework to encrypt face images from unauthorized\nface recognition without access to personal information. As for the weak\nblack-box scenario feasibility, we obverse that representations of the average\nfeature in multiple face recognition models are similar, thus we propose to\nutilize the average feature via the crawled dataset from the Internet as the\ntarget to guide the generation, which is also agnostic to identities of unknown\nface recognition systems; in nature, the low-frequency perturbations are more\nvisually perceptible by the human vision system. Inspired by this, we restrict\nthe perturbation in the low-frequency facial regions by discrete cosine\ntransform to achieve the visual naturalness guarantee. Extensive experiments on\nseveral face recognition models demonstrate that our FRIA outperforms other\nstate-of-the-art methods in generating more natural encrypted faces while\nattaining high black-box attack success rates of 96%. In addition, we validate\nthe efficacy of FRIA using real-world black-box commercial API, which reveals\nthe potential of FRIA in practice. Our codes can be found in\nhttps://github.com/XinDong10/FRIA.\n","authors":["Xin Dong","Rui Wang","Siyuan Liang","Aishan Liu","Lihua Jing"],"pdf_url":"https://arxiv.org/pdf/2308.05983v3.pdf","comment":"I noticed something missing in the article's description in\n subsection 3.2, so I'd like to undo it and re-finalize and describe it"},{"id":"http://arxiv.org/abs/2305.06564v4","updated":"2023-08-25T03:12:20Z","published":"2023-05-11T04:43:10Z","title":"Undercover Deepfakes: Detecting Fake Segments in Videos","summary":" The recent renaissance in generative models, driven primarily by the advent\nof diffusion models and iterative improvement in GAN methods, has enabled many\ncreative applications. However, each advancement is also accompanied by a rise\nin the potential for misuse. In the arena of the deepfake generation, this is a\nkey societal issue. In particular, the ability to modify segments of videos\nusing such generative techniques creates a new paradigm of deepfakes which are\nmostly real videos altered slightly to distort the truth. This paradigm has\nbeen under-explored by the current deepfake detection methods in the academic\nliterature. In this paper, we present a deepfake detection method that can\naddress this issue by performing deepfake prediction at the frame and video\nlevels. To facilitate testing our method, we prepared a new benchmark dataset\nwhere videos have both real and fake frame sequences with very subtle\ntransitions. We provide a benchmark on the proposed dataset with our detection\nmethod which utilizes the Vision Transformer based on Scaling and Shifting to\nlearn spatial features, and a Timeseries Transformer to learn temporal features\nof the videos to help facilitate the interpretation of possible deepfakes.\nExtensive experiments on a variety of deepfake generation methods show\nexcellent results by the proposed method on temporal segmentation and classical\nvideo-level predictions as well. In particular, the paradigm we address will\nform a powerful tool for the moderation of deepfakes, where human oversight can\nbe better targeted to the parts of videos suspected of being deepfakes. All\nexperiments can be reproduced at:\ngithub.com/rgb91/temporal-deepfake-segmentation.\n","authors":["Sanjay Saha","Rashindrie Perera","Sachith Seneviratne","Tamasha Malepathirana","Sanka Rasnayaka","Deshani Geethika","Terence Sim","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2305.06564v4.pdf","comment":"ICCV 2023 Workshop and Challenge on DeepFake Analysis and Detection"},{"id":"http://arxiv.org/abs/2308.13150v1","updated":"2023-08-25T03:08:41Z","published":"2023-08-25T03:08:41Z","title":"Enhancing Breast Cancer Classification Using Transfer ResNet with\n Lightweight Attention Mechanism","summary":" Deep learning models have revolutionized image classification by learning\ncomplex feature hierarchies in raw pixel data. This paper introduces an image\nclassification method based on the ResNet model, and introduces a lightweight\nattention mechanism framework to improve performance. The framework optimizes\nfeature representation, enhances classification capabilities, and improves\nfeature discriminativeness. We verified the effectiveness of the algorithm on\nthe Breakhis dataset, showing its superior performance in many aspects. Not\nonly in terms of conventional models, our method also shows advantages on\nstate-of-the-art methods such as contemporary visual transformers. Significant\nimprovements have been achieved in metrics such as precision, accuracy, recall,\nF1-score, and G-means, while also performing well in terms of convergence time.\nThese results strengthen the performance of the algorithm and solidify its\napplication prospects in practical image classification tasks. Keywords: ResNet\nmodel, Lightweight attention mechanism\n","authors":["Suxing Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13150v1.pdf","comment":"6 pages, 4 figures,6 tables"},{"id":"http://arxiv.org/abs/2307.15880v2","updated":"2023-08-25T02:46:35Z","published":"2023-07-29T03:49:28Z","title":"Effective Whole-body Pose Estimation with Two-stages Distillation","summary":" Whole-body pose estimation localizes the human body, hand, face, and foot\nkeypoints in an image. This task is challenging due to multi-scale body parts,\nfine-grained localization for low-resolution regions, and data scarcity.\nMeanwhile, applying a highly efficient and accurate pose estimator to widely\nhuman-centric understanding and generation tasks is urgent. In this work, we\npresent a two-stage pose \\textbf{D}istillation for \\textbf{W}hole-body\n\\textbf{P}ose estimators, named \\textbf{DWPose}, to improve their effectiveness\nand efficiency. The first-stage distillation designs a weight-decay strategy\nwhile utilizing a teacher's intermediate feature and final logits with both\nvisible and invisible keypoints to supervise the student from scratch. The\nsecond stage distills the student model itself to further improve performance.\nDifferent from the previous self-knowledge distillation, this stage finetunes\nthe student's head with only 20% training time as a plug-and-play training\nstrategy. For data limitations, we explore the UBody dataset that contains\ndiverse facial expressions and hand gestures for real-life applications.\nComprehensive experiments show the superiority of our proposed simple yet\neffective methods. We achieve new state-of-the-art performance on\nCOCO-WholeBody, significantly boosting the whole-body AP of RTMPose-l from\n64.8% to 66.5%, even surpassing RTMPose-x teacher with 65.3% AP. We release a\nseries of models with different sizes, from tiny to large, for satisfying\nvarious downstream tasks. Our codes and models are available at\nhttps://github.com/IDEA-Research/DWPose.\n","authors":["Zhendong Yang","Ailing Zeng","Chun Yuan","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2307.15880v2.pdf","comment":"Accepted by ICCV 2023, CV4Metaverse Workshop"},{"id":"http://arxiv.org/abs/2308.13142v1","updated":"2023-08-25T02:35:54Z","published":"2023-08-25T02:35:54Z","title":"A Survey of Diffusion Based Image Generation Models: Issues and Their\n Solutions","summary":" Recently, there has been significant progress in the development of large\nmodels. Following the success of ChatGPT, numerous language models have been\nintroduced, demonstrating remarkable performance. Similar advancements have\nalso been observed in image generation models, such as Google's Imagen model,\nOpenAI's DALL-E 2, and stable diffusion models, which have exhibited impressive\ncapabilities in generating images. However, similar to large language models,\nthese models still encounter unresolved challenges. Fortunately, the\navailability of open-source stable diffusion models and their underlying\nmathematical principles has enabled the academic community to extensively\nanalyze the performance of current image generation models and make\nimprovements based on this stable diffusion framework. This survey aims to\nexamine the existing issues and the current solutions pertaining to image\ngeneration models.\n","authors":["Tianyi Zhang","Zheng Wang","Jing Huang","Mohiuddin Muhammad Tasnim","Wei Shi"],"pdf_url":"https://arxiv.org/pdf/2308.13142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12676v4","updated":"2023-08-25T02:15:05Z","published":"2023-07-24T10:30:54Z","title":"Few-shot $\\mathbf{1/a}$ Anomalies Feedback : Damage Vision Mining\n Opportunity and Embedding Feature Imbalance","summary":" In past decade, previous balanced datasets have been used to advance deep\nlearning algorithms in industrial damage vision tasks. Specifically, for\ncondition-based maintenance, automating visual inspection is crucial to ensure\nhigh quality. Damage vision mining cannot avoid the imbalanced data issue\nbecause of rare unseen events and high quality status by improved operations.\nFor visual damage inspection, deteriorated class acquired from the surface of\nconcrete and steel components are occasionally imbalanced. From numerous\nrelated surveys, we summarize that imbalanced data problems can be categorized\ninto four types; 1) missing range of target and label valuables, 2)\nmajority-minority class imbalance, 3) foreground-background of spatial\nimbalance, 4) long-tailed class of pixel-wise imbalance. Since 2015, there has\nbeen many imbalanced studies using deep learning approaches that includes\nregression, image classification, object detection, semantic segmentation.\nHowever, anomaly detection for imbalanced data is not yet well known. In the\nstudy, we highlight one-class anomaly detection application whether anomalous\nclass or not, and demonstrate clear examples on imbalanced vision datasets:\nmedical disease, hazardous behavior, material deterioration, plant disease,\nriver sludge, and disaster damage. We provide key results on damage vision\nmining advantage, hypothesizing that the more effective range of positive\nratio, the higher accuracy gain of anomalies feedback. In our imbalanced\nstudies, compared with the balanced case of positive ratio 1/1, we find that\nthere is applicable positive ratio $1/a$, where the accuracy are consistently\nhigh. However, extremely imbalanced range from one-shot to $1/2a$, whose\naccuracy are inferior to those of applicable ratio. In contrast, ranged with\npositive ratio over $2/a$, it is shifting in over-mining phase without\neffective gain of accuracy.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v4.pdf","comment":"34 pages, 53 figures, 28 tables"},{"id":"http://arxiv.org/abs/2308.13133v1","updated":"2023-08-25T01:51:26Z","published":"2023-08-25T01:51:26Z","title":"AccFlow: Backward Accumulation for Long-Range Optical Flow","summary":" Recent deep learning-based optical flow estimators have exhibited impressive\nperformance in generating local flows between consecutive frames. However, the\nestimation of long-range flows between distant frames, particularly under\ncomplex object deformation and large motion occlusion, remains a challenging\ntask. One promising solution is to accumulate local flows explicitly or\nimplicitly to obtain the desired long-range flow. Nevertheless, the\naccumulation errors and flow misalignment can hinder the effectiveness of this\napproach. This paper proposes a novel recurrent framework called AccFlow, which\nrecursively backward accumulates local flows using a deformable module called\nas AccPlus. In addition, an adaptive blending module is designed along with\nAccPlus to alleviate the occlusion effect by backward accumulation and rectify\nthe accumulation error. Notably, we demonstrate the superiority of backward\naccumulation over conventional forward accumulation, which to the best of our\nknowledge has not been explicitly established before. To train and evaluate the\nproposed AccFlow, we have constructed a large-scale high-quality dataset named\nCVO, which provides ground-truth optical flow labels between adjacent and\ndistant frames. Extensive experiments validate the effectiveness of AccFlow in\nhandling long-range optical flow estimation. Codes are available at\nhttps://github.com/mulns/AccFlow .\n","authors":["Guangyang Wu","Xiaohong Liu","Kunming Luo","Xi Liu","Qingqing Zheng","Shuaicheng Liu","Xinyang Jiang","Guangtao Zhai","Wenyi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12976v3","updated":"2023-08-25T00:15:14Z","published":"2023-03-23T00:55:48Z","title":"NVAutoNet: Fast and Accurate 360$^{\\circ}$ 3D Visual Perception For Self\n Driving","summary":" Robust, real-time perception of 3D world is essential to the autonomous\nvehicle. We introduce an end-to-end surround camera perception system, named\nNVAutoNet, for self-driving. NVAutoNet is a multi-task, multi-camera network\nwhich takes a variable set of time-synced camera images as input and produces a\nrich collection of 3D signals such as sizes, orientations, locations of\nobstacles, parking spaces and free-spaces, etc. NVAutoNet is modular and\nend-to-end: 1) the outputs can be consumed directly by downstream modules\nwithout any post-processing such as clustering and fusion -- improving speed of\nmodel deployment and in-car testing 2) the whole network training is done in\none single stage -- improving speed of model improvement and iterations. The\nnetwork is carefully designed to have high accuracy while running at 53 fps on\nNVIDIA Orin SoC (system-on-a-chip). The network is robust to sensor mounting\nvariations (within some tolerances) and can be quickly customized for different\nvehicle types via efficient model fine-tuning.\n","authors":["Trung Pham","Mehran Maghoumi","Wanli Jiang","Bala Siva Sashank Jujjavarapu","Mehdi Sajjadi","Xin Liu","Hsuan-Chu Lin","Bor-Jeng Chen","Giang Truong","Chao Fang","Junghyun Kwon","Minwoo Park"],"pdf_url":"https://arxiv.org/pdf/2303.12976v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11507v2","updated":"2023-08-25T00:07:50Z","published":"2023-08-22T15:28:49Z","title":"Unsupervised Prototype Adapter for Vision-Language Models","summary":" Recently, large-scale pre-trained vision-language models (e.g. CLIP and\nALIGN) have demonstrated remarkable effectiveness in acquiring transferable\nvisual representations. To leverage the valuable knowledge encoded within these\nmodels for downstream tasks, several fine-tuning approaches, including prompt\ntuning methods and adapter-based methods, have been developed to adapt\nvision-language models effectively with supervision. However, these methods\nrely on the availability of annotated samples, which can be labor-intensive and\ntime-consuming to acquire, thus limiting scalability. To address this issue, in\nthis work, we design an unsupervised fine-tuning approach for vision-language\nmodels called Unsupervised Prototype Adapter (UP-Adapter). Specifically, for\nthe unannotated target datasets, we leverage the text-image aligning capability\nof CLIP to automatically select the most confident samples for each class.\nUtilizing these selected samples, we generate class prototypes, which serve as\nthe initialization for the learnable prototype model. After fine-tuning, the\nprototype model prediction is combined with the original CLIP's prediction by a\nresidual connection to perform downstream recognition tasks. Our extensive\nexperimental results on image recognition and domain generalization show that\nthe proposed unsupervised method outperforms 8-shot CoOp, 8-shot Tip-Adapter,\nand also the state-of-the-art UPL method by large margins.\n","authors":["Yi Zhang","Ce Zhang","Xueting Hu","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2308.11507v2.pdf","comment":"Accepted by PRCV 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.13486v1","updated":"2023-08-25T16:50:02Z","published":"2023-08-25T16:50:02Z","title":"On the Practicality of Dynamic Updates in Fast Searchable Encryption","summary":" Searchable encrypted (SE) indexing systems are a useful tool for utilizing\ncloud services to store and manage sensitive information. However, much of the\nwork on SE systems to date has remained theoretical. In order to make them of\npractical use, more work is needed to develop optimal protocols and working\nmodels for them. This includes, in particular, the creation of a working update\nmodel in order to maintain an encrypted index of a dynamic document set such as\nan email inbox. I have created a working, real-world end-to-end SE\nimplementation that satisfies these needs, including the first empirical\nperformance evaluation of the dynamic SE update operation. In doing so, I show\na viable path to move from the theoretical concepts described by previous\nresearchers to a future production-worthy implementation and identify issues\nfor follow-on investigation.\n","authors":["Steven Willoughby"],"pdf_url":"https://arxiv.org/pdf/2308.13486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13467v1","updated":"2023-08-25T16:11:08Z","published":"2023-08-25T16:11:08Z","title":"Leveraging Knowledge and Reinforcement Learning for Enhanced Reliability\n of Language Models","summary":" The Natural Language Processing(NLP) community has been using crowd sourcing\ntechniques to create benchmark datasets such as General Language Understanding\nand Evaluation(GLUE) for training modern Language Models such as BERT. GLUE\ntasks measure the reliability scores using inter annotator metrics i.e. Cohens\nKappa. However, the reliability aspect of LMs has often been overlooked. To\ncounter this problem, we explore a knowledge-guided LM ensembling approach that\nleverages reinforcement learning to integrate knowledge from ConceptNet and\nWikipedia as knowledge graph embeddings. This approach mimics human annotators\nresorting to external knowledge to compensate for information deficits in the\ndatasets. Across nine GLUE datasets, our research shows that ensembling\nstrengthens reliability and accuracy scores, outperforming state of the art.\n","authors":["Nancy Tyagi","Surjodeep Sarkar","Manas Gaur"],"pdf_url":"https://arxiv.org/pdf/2308.13467v1.pdf","comment":"Accepted at CIKM'23"},{"id":"http://arxiv.org/abs/2308.13292v1","updated":"2023-08-25T10:33:44Z","published":"2023-08-25T10:33:44Z","title":"A Bayesian Active Learning Approach to Comparative Judgement","summary":" Assessment is a crucial part of education. Traditional marking is a source of\ninconsistencies and unconscious bias, placing a high cognitive load on the\nassessors. An approach to address these issues is comparative judgement (CJ).\nIn CJ, the assessor is presented with a pair of items and is asked to select\nthe better one. Following a series of comparisons, a rank is derived using a\nranking model, for example, the BTM, based on the results. While CJ is\nconsidered a reliable method for marking, there are concerns around\ntransparency, and the ideal number of pairwise comparisons to generate a\nreliable estimation of the rank order is not known. Additionally, there have\nbeen attempts to generate a method of selecting pairs that should be compared\nnext in an informative manner, but some existing methods are known to have\ncreated their own bias within results inflating the reliability metric used. As\na result, a random selection approach is usually deployed.\n We propose a novel Bayesian approach to CJ (BCJ) for determining the ranks of\ncompared items alongside a new way to select the pairs to present to the\nmarker(s) using active learning (AL), addressing the key shortcomings of\ntraditional CJ. Furthermore, we demonstrate how the entire approach may provide\ntransparency by providing the user insights into how it is making its decisions\nand, at the same time, being more efficient. Results from our experiments\nconfirm that the proposed BCJ combined with entropy-driven AL pair-selection\nmethod is superior to other alternatives. We also find that the more\ncomparisons done, the more accurate BCJ becomes, which solves the issue the\ncurrent method has of the model deteriorating if too many comparisons are\nperformed. As our approach can generate the complete predicted rank\ndistribution for an item, we also show how this can be utilised in devising a\npredicted grade, guided by the assessor.\n","authors":["Andy Gray","Alma Rahat","Tom Crick","Stephen Lindsay","Darren Wallace"],"pdf_url":"https://arxiv.org/pdf/2308.13292v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.13249v1","updated":"2023-08-25T08:54:27Z","published":"2023-08-25T08:54:27Z","title":"Learning and Optimization of Implicit Negative Feedback for Industrial\n Short-video Recommender System","summary":" Short-video recommendation is one of the most important recommendation\napplications in today's industrial information systems. Compared with other\nrecommendation tasks, the enormous amount of feedback is the most typical\ncharacteristic. Specifically, in short-video recommendation, the\neasiest-to-collect user feedback is from the skipping behaviors, which leads to\ntwo critical challenges for the recommendation model. First, the skipping\nbehavior reflects implicit user preferences, and thus it is challenging for\ninterest extraction. Second, the kind of special feedback involves multiple\nobjectives, such as total watching time, which is also very challenging. In\nthis paper, we present our industrial solution in Kuaishou, which serves\nbillion-level users every day. Specifically, we deploy a feedback-aware\nencoding module which well extracts user preference taking the impact of\ncontext into consideration. We further design a multi-objective prediction\nmodule which well distinguishes the relation and differences among different\nmodel objectives in the short-video recommendation. We conduct extensive online\nA/B testing, along with detailed and careful analysis, which verifies the\neffectiveness of our solution.\n","authors":["Yunzhu Pan","Nian Li","Chen Gao","Jianxin Chang","Yanan Niu","Yang Song","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.13249v1.pdf","comment":"Accepted by CIKM'23"},{"id":"http://arxiv.org/abs/2308.13242v1","updated":"2023-08-25T08:27:43Z","published":"2023-08-25T08:27:43Z","title":"Optimizing Group-Fair Plackett-Luce Ranking Models for Relevance and\n Ex-Post Fairness","summary":" In learning-to-rank (LTR), optimizing only the relevance (or the expected\nranking utility) can cause representational harm to certain categories of\nitems. Moreover, if there is implicit bias in the relevance scores, LTR models\nmay fail to optimize for true relevance. Previous works have proposed efficient\nalgorithms to train stochastic ranking models that achieve fairness of exposure\nto the groups ex-ante (or, in expectation), which may not guarantee\nrepresentation fairness to the groups ex-post, that is, after realizing a\nranking from the stochastic ranking model. Typically, ex-post fairness is\nachieved by post-processing, but previous work does not train stochastic\nranking models that are aware of this post-processing.\n In this paper, we propose a novel objective that maximizes expected relevance\nonly over those rankings that satisfy given representation constraints to\nensure ex-post fairness. Building upon recent work on an efficient sampler for\nex-post group-fair rankings, we propose a group-fair Plackett-Luce model and\nshow that it can be efficiently optimized for our objective in the LTR\nframework.\n Experiments on three real-world datasets show that our group-fair algorithm\nguarantees fairness alongside usually having better relevance compared to the\nLTR baselines. In addition, our algorithm also achieves better relevance than\npost-processing baselines, which also ensures ex-post fairness. Further, when\nimplicit bias is injected into the training data, our algorithm typically\noutperforms existing LTR baselines in relevance.\n","authors":["Sruthi Gorantla","Eshaan Bhansali","Amit Deshpande","Anand Louis"],"pdf_url":"https://arxiv.org/pdf/2308.13242v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.02860v2","updated":"2023-08-25T07:59:36Z","published":"2023-08-05T12:22:26Z","title":"Replace Scoring with Arrangement: A Contextual Set-to-Arrangement\n Framework for Learning-to-Rank","summary":" Learning-to-rank is a core technique in the top-N recommendation task, where\nan ideal ranker would be a mapping from an item set to an arrangement (a.k.a.\npermutation). Most existing solutions fall in the paradigm of probabilistic\nranking principle (PRP), i.e., first score each item in the candidate set and\nthen perform a sort operation to generate the top ranking list. However, these\napproaches neglect the contextual dependence among candidate items during\nindividual scoring, and the sort operation is non-differentiable. To bypass the\nabove issues, we propose Set-To-Arrangement Ranking (STARank), a new framework\ndirectly generates the permutations of the candidate items without the need for\nindividually scoring and sort operations; and is end-to-end differentiable. As\na result, STARank can operate when only the ground-truth permutations are\naccessible without requiring access to the ground-truth relevance scores for\nitems. For this purpose, STARank first reads the candidate items in the context\nof the user browsing history, whose representations are fed into a\nPlackett-Luce module to arrange the given items into a list. To effectively\nutilize the given ground-truth permutations for supervising STARank, we\nleverage the internal consistency property of Plackett-Luce models to derive a\ncomputationally efficient list-wise loss. Experimental comparisons against 9\nthe state-of-the-art methods on 2 learning-to-rank benchmark datasets and 3\ntop-N real-world recommendation datasets demonstrate the superiority of STARank\nin terms of conventional ranking metrics. Notice that these ranking metrics do\nnot consider the effects of the contextual dependence among the items in the\nlist, we design a new family of simulation-based ranking metrics, where\nexisting metrics can be regarded as special cases. STARank can consistently\nachieve better performance in terms of PBM and UBM simulation-based metrics.\n","authors":["Jiarui Jin","Xianyu Chen","Weinan Zhang","Mengyue Yang","Yang Wang","Yali Du","Yong Yu","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02860v2.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2308.13187v1","updated":"2023-08-25T05:40:50Z","published":"2023-08-25T05:40:50Z","title":"MMBAttn: Max-Mean and Bit-wise Attention for CTR Prediction","summary":" With the increasing complexity and scale of click-through rate (CTR)\nprediction tasks in online advertising and recommendation systems, accurately\nestimating the importance of features has become a critical aspect of\ndeveloping effective models. In this paper, we propose an attention-based\napproach that leverages max and mean pooling operations, along with a bit-wise\nattention mechanism, to enhance feature importance estimation in CTR\nprediction. Traditionally, pooling operations such as max and mean pooling have\nbeen widely used to extract relevant information from features. However, these\noperations can lead to information loss and hinder the accurate determination\nof feature importance. To address this challenge, we propose a novel attention\narchitecture that utilizes a bit-based attention structure that emphasizes the\nrelationships between all bits in features, together with maximum and mean\npooling. By considering the fine-grained interactions at the bit level, our\nmethod aims to capture intricate patterns and dependencies that might be\noverlooked by traditional pooling operations. To examine the effectiveness of\nthe proposed method, experiments have been conducted on three public datasets.\nThe experiments demonstrated that the proposed method significantly improves\nthe performance of the base models to achieve state-of-the-art results.\n","authors":["Hasan Saribas","Cagri Yesil","Serdarcan Dilbaz","Halit Orenbas"],"pdf_url":"https://arxiv.org/pdf/2308.13187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13590v1","updated":"2023-08-25T17:23:12Z","published":"2023-08-25T17:23:12Z","title":"LSTM-based QoE Evaluation for Web Microservices' Reputation Scoring","summary":" Sentiment analysis is the task of mining the authors' opinions about specific\nentities. It allows organizations to monitor different services in real time\nand act accordingly. Reputation is what is generally said or believed about\npeople or things. Informally, reputation combines the measure of reliability\nderived from feedback, reviews, and ratings gathered from users, which reflect\ntheir quality of experience (QoE) and can either increase or harm the\nreputation of the provided services. In this study, we propose to perform\nsentiment analysis on web microservices reviews to exploit the provided\ninformation to assess and score the microservices' reputation. Our proposed\napproach uses the Long Short-Term Memory (LSTM) model to perform sentiment\nanalysis and the Net Brand Reputation (NBR) algorithm to assess reputation\nscores for microservices. This approach is tested on a set of more than 10,000\nreviews related to 15 Amazon Web microservices, and the experimental results\nhave shown that our approach is more accurate than existing approaches, with an\naccuracy and precision of 93% obtained after applying an oversampling strategy\nand a resulting reputation score of the considered microservices community of\n89%.\n","authors":["Maha Driss"],"pdf_url":"https://arxiv.org/pdf/2308.13590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13292v1","updated":"2023-08-25T10:33:44Z","published":"2023-08-25T10:33:44Z","title":"A Bayesian Active Learning Approach to Comparative Judgement","summary":" Assessment is a crucial part of education. Traditional marking is a source of\ninconsistencies and unconscious bias, placing a high cognitive load on the\nassessors. An approach to address these issues is comparative judgement (CJ).\nIn CJ, the assessor is presented with a pair of items and is asked to select\nthe better one. Following a series of comparisons, a rank is derived using a\nranking model, for example, the BTM, based on the results. While CJ is\nconsidered a reliable method for marking, there are concerns around\ntransparency, and the ideal number of pairwise comparisons to generate a\nreliable estimation of the rank order is not known. Additionally, there have\nbeen attempts to generate a method of selecting pairs that should be compared\nnext in an informative manner, but some existing methods are known to have\ncreated their own bias within results inflating the reliability metric used. As\na result, a random selection approach is usually deployed.\n We propose a novel Bayesian approach to CJ (BCJ) for determining the ranks of\ncompared items alongside a new way to select the pairs to present to the\nmarker(s) using active learning (AL), addressing the key shortcomings of\ntraditional CJ. Furthermore, we demonstrate how the entire approach may provide\ntransparency by providing the user insights into how it is making its decisions\nand, at the same time, being more efficient. Results from our experiments\nconfirm that the proposed BCJ combined with entropy-driven AL pair-selection\nmethod is superior to other alternatives. We also find that the more\ncomparisons done, the more accurate BCJ becomes, which solves the issue the\ncurrent method has of the model deteriorating if too many comparisons are\nperformed. As our approach can generate the complete predicted rank\ndistribution for an item, we also show how this can be utilised in devising a\npredicted grade, guided by the assessor.\n","authors":["Andy Gray","Alma Rahat","Tom Crick","Stephen Lindsay"],"pdf_url":"https://arxiv.org/pdf/2308.13292v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.13246v1","updated":"2023-08-25T08:42:45Z","published":"2023-08-25T08:42:45Z","title":"Model-free Reinforcement Learning with Stochastic Reward Stabilization\n for Recommender Systems","summary":" Model-free RL-based recommender systems have recently received increasing\nresearch attention due to their capability to handle partial feedback and\nlong-term rewards. However, most existing research has ignored a critical\nfeature in recommender systems: one user's feedback on the same item at\ndifferent times is random. The stochastic rewards property essentially differs\nfrom that in classic RL scenarios with deterministic rewards, which makes\nRL-based recommender systems much more challenging. In this paper, we first\ndemonstrate in a simulator environment where using direct stochastic feedback\nresults in a significant drop in performance. Then to handle the stochastic\nfeedback more efficiently, we design two stochastic reward stabilization\nframeworks that replace the direct stochastic feedback with that learned by a\nsupervised model. Both frameworks are model-agnostic, i.e., they can\neffectively utilize various supervised models. We demonstrate the superiority\nof the proposed frameworks over different RL-based recommendation baselines\nwith extensive experiments on a recommendation simulator as well as an\nindustrial-level recommender system.\n","authors":["Tianchi Cai","Shenliao Bao","Jiyan Jiang","Shiji Zhou","Wenpeng Zhang","Lihong Gu","Jinjie Gu","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.13246v1.pdf","comment":"SIGIR '23"},{"id":"http://arxiv.org/abs/2308.13563v1","updated":"2023-08-25T00:09:16Z","published":"2023-08-25T00:09:16Z","title":"Large Language Models in Analyzing Crash Narratives -- A Comparative\n Study of ChatGPT, BARD and GPT-4","summary":" In traffic safety research, extracting information from crash narratives\nusing text analysis is a common practice. With recent advancements of large\nlanguage models (LLM), it would be useful to know how the popular LLM\ninterfaces perform in classifying or extracting information from crash\nnarratives. To explore this, our study has used the three most popular publicly\navailable LLM interfaces- ChatGPT, BARD and GPT4. This study investigated their\nusefulness and boundaries in extracting information and answering queries\nrelated to accidents from 100 crash narratives from Iowa and Kansas. During the\ninvestigation, their capabilities and limitations were assessed and their\nresponses to the queries were compared. Five questions were asked related to\nthe narratives: 1) Who is at-fault? 2) What is the manner of collision? 3) Has\nthe crash occurred in a work-zone? 4) Did the crash involve pedestrians? and 5)\nWhat are the sequence of harmful events in the crash? For questions 1 through\n4, the overall similarity among the LLMs were 70%, 35%, 96% and 89%,\nrespectively. The similarities were higher while answering direct questions\nrequiring binary responses and significantly lower for complex questions. To\ncompare the responses to question 5, network diagram and centrality measures\nwere analyzed. The network diagram from the three LLMs were not always similar\nalthough they sometimes have the same influencing events with high in-degree,\nout-degree and betweenness centrality. This study suggests using multiple\nmodels to extract viable information from narratives. Also, caution must be\npracticed while using these interfaces to obtain crucial safety related\ninformation.\n","authors":["Maroa Mumtarin","Md Samiullah Chowdhury","Jonathan Wood"],"pdf_url":"https://arxiv.org/pdf/2308.13563v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2212.14020v2","updated":"2023-08-25T17:58:53Z","published":"2022-12-28T18:45:05Z","title":"A System-Level View on Out-of-Distribution Data in Robotics","summary":" When testing conditions differ from those represented in training data,\nso-called out-of-distribution (OOD) inputs can mar the reliability of learned\ncomponents in the modern robot autonomy stack. Therefore, coping with OOD data\nis an important challenge on the path towards trustworthy learning-enabled\nopen-world autonomy. In this paper, we aim to demystify the topic of OOD data\nand its associated challenges in the context of data-driven robotic systems,\ndrawing connections to emerging paradigms in the ML community that study the\neffect of OOD data on learned models in isolation. We argue that as\nroboticists, we should reason about the overall \\textit{system-level}\ncompetence of a robot as it operates in OOD conditions. We highlight key\nresearch questions around this system-level view of OOD problems to guide\nfuture research toward safe and reliable learning-enabled autonomy.\n","authors":["Rohan Sinha","Apoorva Sharma","Somrita Banerjee","Thomas Lew","Rachel Luo","Spencer M. Richards","Yixiao Sun","Edward Schmerling","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2212.14020v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13513v1","updated":"2023-08-25T17:46:43Z","published":"2023-08-25T17:46:43Z","title":"Unveiling the Role of Message Passing in Dual-Privacy Preservation on\n GNNs","summary":" Graph Neural Networks (GNNs) are powerful tools for learning representations\non graphs, such as social networks. However, their vulnerability to privacy\ninference attacks restricts their practicality, especially in high-stake\ndomains. To address this issue, privacy-preserving GNNs have been proposed,\nfocusing on preserving node and/or link privacy. This work takes a step back\nand investigates how GNNs contribute to privacy leakage. Through theoretical\nanalysis and simulations, we identify message passing under structural bias as\nthe core component that allows GNNs to \\textit{propagate} and \\textit{amplify}\nprivacy leakage. Building upon these findings, we propose a principled\nprivacy-preserving GNN framework that effectively safeguards both node and link\nprivacy, referred to as dual-privacy preservation. The framework comprises\nthree major modules: a Sensitive Information Obfuscation Module that removes\nsensitive information from node embeddings, a Dynamic Structure Debiasing\nModule that dynamically corrects the structural bias, and an Adversarial\nLearning Module that optimizes the privacy-utility trade-off. Experimental\nresults on four benchmark datasets validate the effectiveness of the proposed\nmodel in protecting both node and link privacy while preserving high utility\nfor downstream tasks, such as node classification.\n","authors":["Tianyi Zhao","Hui Hu","Lu Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.13513v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2305.07618v2","updated":"2023-08-25T17:34:49Z","published":"2023-05-12T17:17:01Z","title":"Uncertainty Estimation using the Local Lipschitz for Deep Learning Image\n Reconstruction Models","summary":" The use of supervised deep neural network approaches has been investigated to\nsolve inverse problems in all domains, especially radiology where imaging\ntechnologies are at the heart of diagnostics. However, in deployment, these\nmodels are exposed to input distributions that are widely shifted from training\ndata, due in part to data biases or drifts. It becomes crucial to know whether\na given input lies outside the training data distribution before relying on the\nreconstruction for diagnosis. The goal of this work is three-fold: (i)\ndemonstrate use of the local Lipshitz value as an uncertainty estimation\nthreshold for determining suitable performance, (ii) provide method for\nidentifying out-of-distribution (OOD) images where the model may not have\ngeneralized, and (iii) use the local Lipschitz values to guide proper data\naugmentation through identifying false positives and decrease epistemic\nuncertainty. We provide results for both MRI reconstruction and CT sparse view\nto full view reconstruction using AUTOMAP and UNET architectures due to it\nbeing pertinent in the medical domain that reconstructed images remain\ndiagnostically accurate.\n","authors":["Danyal F. Bhutto","Bo Zhu","Jeremiah Z. Liu","Neha Koonjoo","Bruce R. Rosen","Matthew S. Rosen"],"pdf_url":"https://arxiv.org/pdf/2305.07618v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.14449v5","updated":"2023-08-25T17:33:22Z","published":"2020-10-27T17:07:36Z","title":"On Model Identification and Out-of-Sample Prediction of Principal\n Component Regression: Applications to Synthetic Controls","summary":" We analyze principal component regression (PCR) in a high-dimensional\nerror-in-variables setting with fixed design. Under suitable conditions, we\nshow that PCR consistently identifies the unique model with minimum\n$\\ell_2$-norm. These results enable us to establish non-asymptotic\nout-of-sample prediction guarantees that improve upon the best known rates. In\nthe course of our analysis, we introduce a natural linear algebraic condition\nbetween the in- and out-of-sample covariates, which allows us to avoid\ndistributional assumptions for out-of-sample predictions. Our simulations\nillustrate the importance of this condition for generalization, even under\ncovariate shifts. Accordingly, we construct a hypothesis test to check when\nthis conditions holds in practice. As a byproduct, our results also lead to\nnovel results for the synthetic controls literature, a leading approach for\npolicy evaluation. To the best of our knowledge, our prediction guarantees for\nthe fixed design setting have been elusive in both the high-dimensional\nerror-in-variables and synthetic controls literatures.\n","authors":["Anish Agarwal","Devavrat Shah","Dennis Shen"],"pdf_url":"https://arxiv.org/pdf/2010.14449v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13507v1","updated":"2023-08-25T17:33:05Z","published":"2023-08-25T17:33:05Z","title":"Does Asking Clarifying Questions Increases Confidence in Generated Code?\n On the Communication Skills of Large Language Models","summary":" Large language models (LLMs) have significantly improved the ability to\nperform tasks in the field of code generation. However, there is still a gap\nbetween LLMs being capable coders and being top-tier software engineers. Based\non the observation that top-level software engineers often ask clarifying\nquestions to reduce ambiguity in both requirements and coding solutions, we\nargue that the same should be applied to LLMs for code generation tasks. By\nasking probing questions in various topics before generating the final code,\nthe challenges of programming with LLMs, such as unclear intent specification,\nlack of computational thinking, and undesired code quality, may be alleviated.\nThis, in turn, increases confidence in the generated code. In this work, we\nexplore how to leverage better communication skills to achieve greater\nconfidence in generated code. We propose a communication-centered process that\nuses an LLM-generated communicator to identify issues with high ambiguity or\nlow confidence in problem descriptions and generated code. We then ask\nclarifying questions to obtain responses from users for refining the code.\n","authors":["Jie JW Wu"],"pdf_url":"https://arxiv.org/pdf/2308.13507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13504v1","updated":"2023-08-25T17:28:58Z","published":"2023-08-25T17:28:58Z","title":"A2Q: Accumulator-Aware Quantization with Guaranteed Overflow Avoidance","summary":" We present accumulator-aware quantization (A2Q), a novel weight quantization\nmethod designed to train quantized neural networks (QNNs) to avoid overflow\nwhen using low-precision accumulators during inference. A2Q introduces a unique\nformulation inspired by weight normalization that constrains the L1-norm of\nmodel weights according to accumulator bit width bounds that we derive. Thus,\nin training QNNs for low-precision accumulation, A2Q also inherently promotes\nunstructured weight sparsity to guarantee overflow avoidance. We apply our\nmethod to deep learning-based computer vision tasks to show that A2Q can train\nQNNs for low-precision accumulators while maintaining model accuracy\ncompetitive with a floating-point baseline. In our evaluations, we consider the\nimpact of A2Q on both general-purpose platforms and programmable hardware.\nHowever, we primarily target model deployment on FPGAs because they can be\nprogrammed to fully exploit custom accumulator bit widths. Our experimentation\nshows accumulator bit width significantly impacts the resource efficiency of\nFPGA-based accelerators. On average across our benchmarks, A2Q offers up to a\n2.3x reduction in resource utilization over 32-bit accumulator counterparts\nwith 99.2% of the floating-point model accuracy.\n","authors":["Ian Colbert","Alessandro Pappalardo","Jakoba Petri-Koenig"],"pdf_url":"https://arxiv.org/pdf/2308.13504v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2301.13376"},{"id":"http://arxiv.org/abs/2305.18204v2","updated":"2023-08-25T17:28:38Z","published":"2023-05-26T12:59:58Z","title":"Kernel Density Matrices for Probabilistic Deep Learning","summary":" This paper introduces a novel approach to probabilistic deep learning, kernel\ndensity matrices, which provide a simpler yet effective mechanism for\nrepresenting joint probability distributions of both continuous and discrete\nrandom variables. In quantum mechanics, a density matrix is the most general\nway to describe the state of a quantum system. This work extends the concept of\ndensity matrices by allowing them to be defined in a reproducing kernel Hilbert\nspace. This abstraction allows the construction of differentiable models for\ndensity estimation, inference, and sampling, and enables their integration into\nend-to-end deep neural models. In doing so, we provide a versatile\nrepresentation of marginal and joint probability distributions that allows us\nto develop a differentiable, compositional, and reversible inference procedure\nthat covers a wide range of machine learning tasks, including density\nestimation, discriminative learning, and generative modeling. The broad\napplicability of the framework is illustrated by two examples: an image\nclassification model that can be naturally transformed into a conditional\ngenerative model, and a model for learning with label proportions that\ndemonstrates the framework's ability to deal with uncertainty in the training\nsamples.\n","authors":["Fabio A. González","Raúl Ramos-Pollán","Joseph A. Gallego-Mejia"],"pdf_url":"https://arxiv.org/pdf/2305.18204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17645v2","updated":"2023-08-25T17:13:55Z","published":"2023-06-30T13:33:27Z","title":"Federated Object Detection for Quality Inspection in Shared Production","summary":" Federated learning (FL) has emerged as a promising approach for training\nmachine learning models on decentralized data without compromising data\nprivacy. In this paper, we propose a FL algorithm for object detection in\nquality inspection tasks using YOLOv5 as the object detection algorithm and\nFederated Averaging (FedAvg) as the FL algorithm. We apply this approach to a\nmanufacturing use-case where multiple factories/clients contribute data for\ntraining a global object detection model while preserving data privacy on a\nnon-IID dataset. Our experiments demonstrate that our FL approach achieves\nbetter generalization performance on the overall clients' test dataset and\ngenerates improved bounding boxes around the objects compared to models trained\nusing local clients' datasets. This work showcases the potential of FL for\nquality inspection tasks in the manufacturing industry and provides valuable\ninsights into the performance and feasibility of utilizing YOLOv5 and FedAvg\nfor federated object detection.\n","authors":["Vinit Hegiste","Tatjana Legler","Martin Ruskowski"],"pdf_url":"https://arxiv.org/pdf/2306.17645v2.pdf","comment":"Will submit it to an IEEE conference"},{"id":"http://arxiv.org/abs/2308.13498v1","updated":"2023-08-25T17:13:42Z","published":"2023-08-25T17:13:42Z","title":"Escaping the Sample Trap: Fast and Accurate Epistemic Uncertainty\n Estimation with Pairwise-Distance Estimators","summary":" This work introduces a novel approach for epistemic uncertainty estimation\nfor ensemble models using pairwise-distance estimators (PaiDEs). These\nestimators utilize the pairwise-distance between model components to establish\nbounds on entropy and uses said bounds as estimates for information-based\ncriterion. Unlike recent deep learning methods for epistemic uncertainty\nestimation, which rely on sample-based Monte Carlo estimators, PaiDEs are able\nto estimate epistemic uncertainty up to 100$\\times$ faster, over a larger space\n(up to 100$\\times$) and perform more accurately in higher dimensions. To\nvalidate our approach, we conducted a series of experiments commonly used to\nevaluate epistemic uncertainty estimation: 1D sinusoidal data, Pendulum-v0,\nHopper-v2, Ant-v2 and Humanoid-v2. For each experimental setting, an Active\nLearning framework was applied to demonstrate the advantages of PaiDEs for\nepistemic uncertainty estimation.\n","authors":["Lucas Berry","David Meger"],"pdf_url":"https://arxiv.org/pdf/2308.13498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13497v1","updated":"2023-08-25T17:13:20Z","published":"2023-08-25T17:13:20Z","title":"Ngambay-French Neural Machine Translation (sba-Fr)","summary":" In Africa, and the world at large, there is an increasing focus on developing\nNeural Machine Translation (NMT) systems to overcome language barriers. NMT for\nLow-resource language is particularly compelling as it involves learning with\nlimited labelled data. However, obtaining a well-aligned parallel corpus for\nlow-resource languages can be challenging. The disparity between the\ntechnological advancement of a few global languages and the lack of research on\nNMT for local languages in Chad is striking. End-to-end NMT trials on\nlow-resource Chad languages have not been attempted. Additionally, there is a\ndearth of online and well-structured data gathering for research in Natural\nLanguage Processing, unlike some African languages. However, a guided approach\nfor data gathering can produce bitext data for many Chadian language\ntranslation pairs with well-known languages that have ample data. In this\nproject, we created the first sba-Fr Dataset, which is a corpus of\nNgambay-to-French translations, and fine-tuned three pre-trained models using\nthis dataset. Our experiments show that the M2M100 model outperforms other\nmodels with high BLEU scores on both original and original+synthetic data. The\npublicly available bitext dataset can be used for research purposes.\n","authors":["Sakayo Toadoum Sari","Angela Fan","Lema Logamou Seknewna"],"pdf_url":"https://arxiv.org/pdf/2308.13497v1.pdf","comment":"Accepted at RANLP 2023 - International Workshop NLP tools and\n resources for translation and interpreting applications"},{"id":"http://arxiv.org/abs/2306.17829v2","updated":"2023-08-25T17:08:34Z","published":"2023-06-30T17:50:00Z","title":"Federated Ensemble YOLOv5 -- A Better Generalized Object Detection\n Algorithm","summary":" Federated learning (FL) has gained significant traction as a\nprivacy-preserving algorithm, but the underlying resemblances of federated\nlearning algorithms like Federated averaging (FedAvg) or Federated SGD (Fed\nSGD) to ensemble learning algorithms have not been fully explored. The purpose\nof this paper is to examine the application of FL to object detection as a\nmethod to enhance generalizability, and to compare its performance against a\ncentralized training approach for an object detection algorithm. Specifically,\nwe investigate the performance of a YOLOv5 model trained using FL across\nmultiple clients and employ a random sampling strategy without replacement, so\neach client holds a portion of the same dataset used for centralized training.\nOur experimental results showcase the superior efficiency of the FL object\ndetector's global model in generating accurate bounding boxes for unseen\nobjects, with the test set being a mixture of objects from two distinct clients\nnot represented in the training dataset. These findings suggest that FL can be\nviewed from an ensemble algorithm perspective, akin to a synergistic blend of\nBagging and Boosting techniques. As a result, FL can be seen not only as a\nmethod to enhance privacy, but also as a method to enhance the performance of a\nmachine learning model.\n","authors":["Vinit Hegiste","Tatjana Legler","Martin Ruskowski"],"pdf_url":"https://arxiv.org/pdf/2306.17829v2.pdf","comment":"8 pages and submitted to FLTA2023 symposium under IEEE"},{"id":"http://arxiv.org/abs/2308.13490v1","updated":"2023-08-25T17:04:35Z","published":"2023-08-25T17:04:35Z","title":"TpuGraphs: A Performance Prediction Dataset on Large Tensor\n Computational Graphs","summary":" Precise hardware performance models play a crucial role in code\noptimizations. They can assist compilers in making heuristic decisions or aid\nautotuners in identifying the optimal configuration for a given program. For\nexample, the autotuner for XLA, a machine learning compiler, discovered 10-20%\nspeedup on state-of-the-art models serving substantial production traffic at\nGoogle. Although there exist a few datasets for program performance prediction,\nthey target small sub-programs such as basic blocks or kernels. This paper\nintroduces TpuGraphs, a performance prediction dataset on full tensor programs,\nrepresented as computational graphs, running on Tensor Processing Units (TPUs).\nEach graph in the dataset represents the main computation of a machine learning\nworkload, e.g., a training epoch or an inference step. Each data sample\ncontains a computational graph, a compilation configuration, and the execution\ntime of the graph when compiled with the configuration. The graphs in the\ndataset are collected from open-source machine learning programs, featuring\npopular model architectures, e.g., ResNet, EfficientNet, Mask R-CNN, and\nTransformer. TpuGraphs provides 25x more graphs than the largest graph property\nprediction dataset (with comparable graph sizes), and 770x larger graphs on\naverage compared to existing performance prediction datasets on machine\nlearning programs. This graph-level prediction task on large graphs introduces\nnew challenges in learning, ranging from scalability, training efficiency, to\nmodel quality.\n","authors":["Phitchaya Mangpo Phothilimthana","Sami Abu-El-Haija","Kaidi Cao","Bahare Fatemi","Charith Mendis","Bryan Perozzi"],"pdf_url":"https://arxiv.org/pdf/2308.13490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01693v3","updated":"2023-08-25T16:50:16Z","published":"2023-03-03T03:17:53Z","title":"Cross-domain Transfer Learning and State Inference for Soft Robots via a\n Semi-supervised Sequential Variational Bayes Framework","summary":" Recently, data-driven models such as deep neural networks have shown to be\npromising tools for modelling and state inference in soft robots. However,\nvoluminous amounts of data are necessary for deep models to perform\neffectively, which requires exhaustive and quality data collection,\nparticularly of state labels. Consequently, obtaining labelled state data for\nsoft robotic systems is challenged for various reasons, including difficulty in\nthe sensorization of soft robots and the inconvenience of collecting data in\nunstructured environments. To address this challenge, in this paper, we propose\na semi-supervised sequential variational Bayes (DSVB) framework for transfer\nlearning and state inference in soft robots with missing state labels on\ncertain robot configurations. Considering that soft robots may exhibit distinct\ndynamics under different robot configurations, a feature space transfer\nstrategy is also incorporated to promote the adaptation of latent features\nacross multiple configurations. Unlike existing transfer learning approaches,\nour proposed DSVB employs a recurrent neural network to model the nonlinear\ndynamics and temporal coherence in soft robot data. The proposed framework is\nvalidated on multiple setup configurations of a pneumatic-based soft robot\nfinger. Experimental results on four transfer scenarios demonstrate that DSVB\nperforms effective transfer learning and accurate state inference amidst\nmissing state labels. The data and code are available at\nhttps://github.com/shageenderan/DSVB.\n","authors":["Shageenderan Sapai","Junn Yong Loo","Ze Yang Ding","Chee Pin Tan","Raphael CW Phan","Vishnu Monn Baskaran","Surya Girinatha Nurzaman"],"pdf_url":"https://arxiv.org/pdf/2303.01693v3.pdf","comment":"Accepted at the International Conference on Robotics and Automation\n (ICRA) 2023"},{"id":"http://arxiv.org/abs/2308.12219v2","updated":"2023-08-25T16:32:31Z","published":"2023-08-23T16:01:12Z","title":"Diffusion Language Models Can Perform Many Tasks with Scaling and\n Instruction-Finetuning","summary":" The recent surge of generative AI has been fueled by the generative power of\ndiffusion probabilistic models and the scalable capabilities of large language\nmodels. Despite their potential, it remains elusive whether diffusion language\nmodels can solve general language tasks comparable to their autoregressive\ncounterparts. This paper demonstrates that scaling diffusion models w.r.t.\ndata, sizes, and tasks can effectively make them strong language learners. We\nbuild competent diffusion language models at scale by first acquiring knowledge\nfrom massive data via masked language modeling pretraining thanks to their\nintrinsic connections. We then reprogram pretrained masked language models into\ndiffusion language models via diffusive adaptation, wherein task-specific\nfinetuning and instruction finetuning are explored to unlock their versatility\nin solving general language tasks. Experiments show that scaling diffusion\nlanguage models consistently improves performance across downstream language\ntasks. We further discover that instruction finetuning can elicit zero-shot and\nfew-shot in-context learning abilities that help tackle many unseen tasks by\nfollowing natural language instructions, and show promise in advanced and\nchallenging abilities such as reasoning.\n","authors":["Jiasheng Ye","Zaixiang Zheng","Yu Bao","Lihua Qian","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.12219v2.pdf","comment":"added references"},{"id":"http://arxiv.org/abs/2308.12843v2","updated":"2023-08-25T16:28:12Z","published":"2023-08-24T15:06:23Z","title":"Actuator Trajectory Planning for UAVs with Overhead Manipulator using\n Reinforcement Learning","summary":" In this paper, we investigate the operation of an aerial manipulator system,\nnamely an Unmanned Aerial Vehicle (UAV) equipped with a controllable arm with\ntwo degrees of freedom to carry out actuation tasks on the fly. Our solution is\nbased on employing a Q-learning method to control the trajectory of the tip of\nthe arm, also called end-effector. More specifically, we develop a motion\nplanning model based on Time To Collision (TTC), which enables a quadrotor UAV\nto navigate around obstacles while ensuring the manipulator's reachability.\nAdditionally, we utilize a model-based Q-learning model to independently track\nand control the desired trajectory of the manipulator's end-effector, given an\narbitrary baseline trajectory for the UAV platform. Such a combination enables\na variety of actuation tasks such as high-altitude welding, structural\nmonitoring and repair, battery replacement, gutter cleaning, skyscrapper\ncleaning, and power line maintenance in hard-to-reach and risky environments\nwhile retaining compatibility with flight control firmware. Our RL-based\ncontrol mechanism results in a robust control strategy that can handle\nuncertainties in the motion of the UAV, offering promising performance.\nSpecifically, our method achieves 92% accuracy in terms of average displacement\nerror (i.e. the mean distance between the target and obtained trajectory\npoints) using Q-learning with 15,000 episodes\n","authors":["Hazim Alzorgan","Abolfazl Razi","Ata Jahangir Moshayedi"],"pdf_url":"https://arxiv.org/pdf/2308.12843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03853v3","updated":"2023-08-25T16:11:50Z","published":"2023-04-07T22:27:18Z","title":"StepMix: A Python Package for Pseudo-Likelihood Estimation of\n Generalized Mixture Models with External Variables","summary":" StepMix is an open-source Python package for the pseudo-likelihood estimation\n(one-, two- and three-step approaches) of generalized finite mixture models\n(latent profile and latent class analysis) with external variables (covariates\nand distal outcomes). In many applications in social sciences, the main\nobjective is not only to cluster individuals into latent classes, but also to\nuse these classes to develop more complex statistical models. These models\ngenerally divide into a measurement model that relates the latent classes to\nobserved indicators, and a structural model that relates covariates and outcome\nvariables to the latent classes. The measurement and structural models can be\nestimated jointly using the so-called one-step approach or sequentially using\nstepwise methods, which present significant advantages for practitioners\nregarding the interpretability of the estimated latent classes. In addition to\nthe one-step approach, StepMix implements the most important stepwise\nestimation methods from the literature, including the bias-adjusted three-step\nmethods with BCH and ML corrections and the more recent two-step approach.\nThese pseudo-likelihood estimators are presented in this paper under a unified\nframework as specific expectation-maximization subroutines. To facilitate and\npromote their adoption among the data science community, StepMix follows the\nobject-oriented design of the scikit-learn library and provides an additional R\nwrapper.\n","authors":["Sacha Morin","Robin Legault","Félix Laliberté","Zsuzsa Bakk","Charles-Édouard Giguère","Roxane de la Sablonnière","Éric Lacourse"],"pdf_url":"https://arxiv.org/pdf/2304.03853v3.pdf","comment":"Sacha Morin and Robin Legault contributed equally"},{"id":"http://arxiv.org/abs/2304.01568v2","updated":"2023-08-25T16:10:53Z","published":"2023-04-04T06:47:54Z","title":"Arrhythmia Classifier Based on Ultra-Lightweight Binary Neural Network","summary":" Reasonably and effectively monitoring arrhythmias through ECG signals has\nsignificant implications for human health. With the development of deep\nlearning, numerous ECG classification algorithms based on deep learning have\nemerged. However, most existing algorithms trade off high accuracy for complex\nmodels, resulting in high storage usage and power consumption. This also\ninevitably increases the difficulty of implementation on wearable Artificial\nIntelligence-of-Things (AIoT) devices with limited resources. In this study, we\nproposed a universally applicable ultra-lightweight binary neural network(BNN)\nthat is capable of 5-class and 17-class arrhythmia classification based on ECG\nsignals. Our BNN achieves 96.90% (full precision 97.09%) and 97.50% (full\nprecision 98.00%) accuracy for 5-class and 17-class classification,\nrespectively, with state-of-the-art storage usage (3.76 KB and 4.45 KB).\nCompared to other binarization works, our approach excels in supporting two\nmulti-classification modes while achieving the smallest known storage space.\nMoreover, our model achieves optimal accuracy in 17-class classification and\nboasts an elegantly simple network architecture. The algorithm we use is\noptimized specifically for hardware implementation. Our research showcases the\npotential of lightweight deep learning models in the healthcare industry,\nspecifically in wearable medical devices, which hold great promise for\nimproving patient outcomes and quality of life. Code is available on:\nhttps://github.com/xpww/ECG_BNN_Net\n","authors":["Ninghao Pu","Zhongxing Wu","Ao Wang","Hanshi Sun","Zijin Liu","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2304.01568v2.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.13466v1","updated":"2023-08-25T16:10:44Z","published":"2023-08-25T16:10:44Z","title":"Staleness-Alleviated Distributed GNN Training via Online\n Dynamic-Embedding Prediction","summary":" Despite the recent success of Graph Neural Networks (GNNs), it remains\nchallenging to train GNNs on large-scale graphs due to neighbor explosions. As\na remedy, distributed computing becomes a promising solution by leveraging\nabundant computing resources (e.g., GPU). However, the node dependency of graph\ndata increases the difficulty of achieving high concurrency in distributed GNN\ntraining, which suffers from the massive communication overhead. To address it,\nHistorical value approximation is deemed a promising class of distributed\ntraining techniques. It utilizes an offline memory to cache historical\ninformation (e.g., node embedding) as an affordable approximation of the exact\nvalue and achieves high concurrency. However, such benefits come at the cost of\ninvolving dated training information, leading to staleness, imprecision, and\nconvergence issues. To overcome these challenges, this paper proposes SAT\n(Staleness-Alleviated Training), a novel and scalable distributed GNN training\nframework that reduces the embedding staleness adaptively. The key idea of SAT\nis to model the GNN's embedding evolution as a temporal graph and build a model\nupon it to predict future embedding, which effectively alleviates the staleness\nof the cached historical embedding. We propose an online algorithm to train the\nembedding predictor and the distributed GNN alternatively and further provide a\nconvergence analysis. Empirically, we demonstrate that SAT can effectively\nreduce embedding staleness and thus achieve better performance and convergence\nspeed on multiple large-scale graph datasets.\n","authors":["Guangji Bai","Ziyang Yu","Zheng Chai","Yue Cheng","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.13466v1.pdf","comment":"Preprint. Do not distribute. arXiv admin note: text overlap with\n arXiv:2206.00057"},{"id":"http://arxiv.org/abs/2306.02207v3","updated":"2023-08-25T16:10:18Z","published":"2023-06-03T22:35:27Z","title":"SpeechGen: Unlocking the Generative Power of Speech Language Models with\n Prompts","summary":" Large language models (LLMs) have gained considerable attention for\nArtificial Intelligence Generated Content (AIGC), particularly with the\nemergence of ChatGPT. However, the direct adaptation of continuous speech to\nLLMs that process discrete tokens remains an unsolved challenge, hindering the\napplication of LLMs for speech generation. The advanced speech LMs are in the\ncorner, as that speech signals encapsulate a wealth of information, including\nspeaker and emotion, beyond textual data alone. Prompt tuning has demonstrated\nnotable gains in parameter efficiency and competitive performance on some\nspeech classification tasks. However, the extent to which prompts can\neffectively elicit generation tasks from speech LMs remains an open question.\nIn this paper, we present pioneering research that explores the application of\nprompt tuning to stimulate speech LMs for various generation tasks, within a\nunified framework called SpeechGen, with around 10M trainable parameters. The\nproposed unified framework holds great promise for efficiency and\neffectiveness, particularly with the imminent arrival of advanced speech LMs,\nwhich will significantly enhance the capabilities of the framework. The code\nand demos of SpeechGen will be available on the project website:\n\\url{https://ga642381.github.io/SpeechPrompt/speechgen}\n","authors":["Haibin Wu","Kai-Wei Chang","Yuan-Kuei Wu","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2306.02207v3.pdf","comment":"Work in progress. The first three authors contributed equally"},{"id":"http://arxiv.org/abs/2308.03312v2","updated":"2023-08-25T16:08:51Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05952v2","updated":"2023-08-25T15:59:15Z","published":"2023-06-09T15:09:16Z","title":"Overcoming Adversarial Attacks for Human-in-the-Loop Applications","summary":" Including human analysis has the potential to positively affect the\nrobustness of Deep Neural Networks and is relatively unexplored in the\nAdversarial Machine Learning literature. Neural network visual explanation maps\nhave been shown to be prone to adversarial attacks. Further research is needed\nin order to select robust visualizations of explanations for the image analyst\nto evaluate a given model. These factors greatly impact Human-In-The-Loop\n(HITL) evaluation tools due to their reliance on adversarial images, including\nexplanation maps and measurements of robustness. We believe models of human\nvisual attention may improve interpretability and robustness of human-machine\nimagery analysis systems. Our challenge remains, how can HITL evaluation be\nrobust in this adversarial landscape?\n","authors":["Ryan McCoppin","Marla Kennedy","Platon Lukyanenko","Sean Kennedy"],"pdf_url":"https://arxiv.org/pdf/2306.05952v2.pdf","comment":"New Frontiers in Adversarial Machine Learning, ICML 2022"},{"id":"http://arxiv.org/abs/2308.13453v1","updated":"2023-08-25T15:54:22Z","published":"2023-08-25T15:54:22Z","title":"Learning to Intervene on Concept Bottlenecks","summary":" While traditional deep learning models often lack interpretability, concept\nbottleneck models (CBMs) provide inherent explanations via their concept\nrepresentations. Specifically, they allow users to perform interventional\ninteractions on these concepts by updating the concept values and thus\ncorrecting the predictive output of the model. Traditionally, however, these\ninterventions are applied to the model only once and discarded afterward. To\nrectify this, we present concept bottleneck memory models (CB2M), an extension\nto CBMs. Specifically, a CB2M learns to generalize interventions to appropriate\nnovel situations via a two-fold memory with which it can learn to detect\nmistakes and to reapply previous interventions. In this way, a CB2M learns to\nautomatically improve model performance from a few initially obtained\ninterventions. If no prior human interventions are available, a CB2M can detect\npotential mistakes of the CBM bottleneck and request targeted interventions. In\nour experimental evaluations on challenging scenarios like handling\ndistribution shifts and confounded training data, we illustrate that CB2M are\nable to successfully generalize interventions to unseen data and can indeed\nidentify wrongly inferred concepts. Overall, our results show that CB2M is a\ngreat tool for users to provide interactive feedback on CBMs, e.g., by guiding\na user's interaction and requiring fewer interventions.\n","authors":["David Steinmann","Wolfgang Stammer","Felix Friedrich","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2308.13453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13451v1","updated":"2023-08-25T15:53:30Z","published":"2023-08-25T15:53:30Z","title":"Gotta match 'em all: Solution diversification in graph matching matched\n filters","summary":" We present a novel approach for finding multiple noisily embedded template\ngraphs in a very large background graph. Our method builds upon the\ngraph-matching-matched-filter technique proposed in Sussman et al., with the\ndiscovery of multiple diverse matchings being achieved by iteratively\npenalizing a suitable node-pair similarity matrix in the matched filter\nalgorithm. In addition, we propose algorithmic speed-ups that greatly enhance\nthe scalability of our matched-filter approach. We present theoretical\njustification of our methodology in the setting of correlated Erdos-Renyi\ngraphs, showing its ability to sequentially discover multiple templates under\nmild model conditions. We additionally demonstrate our method's utility via\nextensive experiments both using simulated models and real-world dataset,\ninclude human brain connectomes and a large transactional knowledge base.\n","authors":["Zhirui Li","Ben Johnson","Daniel L. Sussman","Carey E. Priebe","Vince Lyzinski"],"pdf_url":"https://arxiv.org/pdf/2308.13451v1.pdf","comment":"36 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2304.11860v2","updated":"2023-08-25T15:23:38Z","published":"2023-04-24T07:15:54Z","title":"On the lifting and reconstruction of nonlinear systems with multiple\n attractors","summary":" The Koopman operator provides a linear perspective on non-linear dynamics by\nfocusing on the evolution of observables in an invariant subspace. Observables\nof interest are typically linearly reconstructed from the Koopman\neigenfunctions. Despite the broad use of Koopman operators over the past few\nyears, there exist some misconceptions about the applicability of Koopman\noperators to dynamical systems with more than one fixed point. In this work, an\nexplanation is provided for the mechanism of lifting for the Koopman operator\nof nonlinear systems with multiple attractors. Considering the example of the\nDuffing oscillator, we show that by exploiting the inherent symmetry between\nthe basins of attraction, a linear reconstruction with three degrees of freedom\nin the Koopman observable space is sufficient to globally linearize the system.\n","authors":["Shaowu Pan","Karthik Duraisamy"],"pdf_url":"https://arxiv.org/pdf/2304.11860v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.13431v1","updated":"2023-08-25T15:23:06Z","published":"2023-08-25T15:23:06Z","title":"Six Lectures on Linearized Neural Networks","summary":" In these six lectures, we examine what can be learnt about the behavior of\nmulti-layer neural networks from the analysis of linear models. We first recall\nthe correspondence between neural networks and linear models via the so-called\nlazy regime. We then review four models for linearized neural networks: linear\nregression with concentrated features, kernel ridge regression, random feature\nmodel and neural tangent model. Finally, we highlight the limitations of the\nlinear theory and discuss how other approaches can overcome them.\n","authors":["Theodor Misiakiewicz","Andrea Montanari"],"pdf_url":"https://arxiv.org/pdf/2308.13431v1.pdf","comment":"77 pages, 8 figures"},{"id":"http://arxiv.org/abs/2302.01161v2","updated":"2023-08-25T15:21:50Z","published":"2023-02-02T15:32:25Z","title":"Vectorized Scenario Description and Motion Prediction for Scenario-Based\n Testing","summary":" Automated vehicles (AVs) are tested in diverse scenarios, typically specified\nby parameters such as velocities, distances, or curve radii. To describe\nscenarios uniformly independent of such parameters, this paper proposes a\nvectorized scenario description defined by the road geometry and vehicles'\ntrajectories. Data of this form are generated for three scenarios, merged, and\nused to train the motion prediction model VectorNet, allowing to predict an\nAV's trajectory for unseen scenarios. Predicting scenario evaluation metrics,\nVectorNet partially achieves lower errors than regression models that\nseparately process the three scenarios' data. However, for comprehensive\ngeneralization, sufficient variance in the training data must be ensured. Thus,\ncontrary to existing methods, our proposed method can merge diverse scenarios'\ndata and exploit spatial and temporal nuances in the vectorized scenario\ndescription. As a result, data from specified test scenarios and real-world\nscenarios can be compared and combined for (predictive) analyses and scenario\nselection.\n","authors":["Max Winkelmann","Constantin Vasconi","Steffen Müller"],"pdf_url":"https://arxiv.org/pdf/2302.01161v2.pdf","comment":"6 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2302.08434v2","updated":"2023-08-25T15:18:49Z","published":"2023-02-16T17:18:03Z","title":"On marginal feature attributions of tree-based models","summary":" Due to their power and ease of use, tree-based machine learning models, such\nas random forests and gradient-boosted tree ensembles, have become very\npopular. To interpret them, local feature attributions based on marginal\nexpectations, e.g. marginal (interventional) Shapley, Owen or Banzhaf values,\nmay be employed. Such methods are true to the model and implementation\ninvariant, i.e. dependent only on the input-output function of the model. We\ncontrast this with the popular TreeSHAP algorithm by presenting two\n(statistically similar) decision trees that compute the exact same function for\nwhich the \"path-dependent\" TreeSHAP yields different rankings of features,\nwhereas the marginal Shapley values coincide. Furthermore, we discuss how the\ninternal structure of tree-based models may be leveraged to help with computing\ntheir marginal feature attributions according to a linear game value. One\nimportant observation is that these are simple (piecewise-constant) functions\nwith respect to a certain grid partition of the input space determined by the\ntrained model. Another crucial observation, showcased by experiments with\nXGBoost, LightGBM and CatBoost libraries, is that only a portion of all\nfeatures appears in a tree from the ensemble. Thus, the complexity of computing\nmarginal Shapley (or Owen or Banzhaf) feature attributions may be reduced. This\nremains valid for a broader class of game values which we shall axiomatically\ncharacterize. A prime example is the case of CatBoost models where the trees\nare oblivious (symmetric) and the number of features in each of them is no\nlarger than the depth. We exploit the symmetry to derive an explicit formula,\nwith improved complexity and only in terms of the internal model parameters,\nfor marginal Shapley (and Banzhaf and Owen) values of CatBoost models. This\nresults in a fast, accurate algorithm for estimating these feature\nattributions.\n","authors":["Khashayar Filom","Alexey Miroshnikov","Konstandinos Kotsiopoulos","Arjun Ravi Kannan"],"pdf_url":"https://arxiv.org/pdf/2302.08434v2.pdf","comment":"Major revision. Notation is simplified, technical details are moved\n to appendix, Algorithm 3.12 is rewritten, the complexity bound in Theorem 3.6\n is improved, {\\S}4 on numerical experiments is expanded. Theorem 2.4 (a\n classification result for game values) and the results of Appendix F\n (generalizations of Theorem 3.6) are new. 29 pages+appendix (63 pages in\n total), 9 figures"},{"id":"http://arxiv.org/abs/2308.01423v2","updated":"2023-08-25T15:13:46Z","published":"2023-08-01T02:08:13Z","title":"ChatMOF: An Autonomous AI System for Predicting and Generating\n Metal-Organic Frameworks","summary":" ChatMOF is an autonomous Artificial Intelligence (AI) system that is built to\npredict and generate metal-organic frameworks (MOFs). By leveraging a\nlarge-scale language model (GPT-4 and GPT-3.5-turbo), ChatMOF extracts key\ndetails from textual inputs and delivers appropriate responses, thus\neliminating the necessity for rigid structured queries. The system is comprised\nof three core components (i.e. an agent, a toolkit, and an evaluator) and it\nforms a robust pipeline that manages a variety of tasks, including data\nretrieval, property prediction, and structure generations. The study further\nexplores the merits and constraints of using large language models (LLMs) AI\nsystem in material sciences using and showcases its transformative potential\nfor future advancements.\n","authors":["Yeonghun Kang","Jihan Kim"],"pdf_url":"https://arxiv.org/pdf/2308.01423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13418v1","updated":"2023-08-25T15:03:36Z","published":"2023-08-25T15:03:36Z","title":"Nougat: Neural Optical Understanding for Academic Documents","summary":" Scientific knowledge is predominantly stored in books and scientific\njournals, often in the form of PDFs. However, the PDF format leads to a loss of\nsemantic information, particularly for mathematical expressions. We propose\nNougat (Neural Optical Understanding for Academic Documents), a Visual\nTransformer model that performs an Optical Character Recognition (OCR) task for\nprocessing scientific documents into a markup language, and demonstrate the\neffectiveness of our model on a new dataset of scientific documents. The\nproposed approach offers a promising solution to enhance the accessibility of\nscientific knowledge in the digital age, by bridging the gap between\nhuman-readable documents and machine-readable text. We release the models and\ncode to accelerate future work on scientific text recognition.\n","authors":["Lukas Blecher","Guillem Cucurull","Thomas Scialom","Robert Stojnic"],"pdf_url":"https://arxiv.org/pdf/2308.13418v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2210.05880v2","updated":"2023-08-25T14:59:21Z","published":"2022-10-12T02:52:00Z","title":"Pathology Steered Stratification Network for Subtype Identification in\n Alzheimer's Disease","summary":" Alzheimer's disease (AD) is a heterogeneous, multifactorial neurodegenerative\ndisorder characterized by beta-amyloid, pathologic tau, and neurodegeneration.\nThere are no effective treatments for Alzheimer's disease at a late stage,\nurging for early intervention. However, existing statistical inference\napproaches of AD subtype identification ignore the pathological domain\nknowledge, which could lead to ill-posed results that are sometimes\ninconsistent with the essential neurological principles. Integrating systems\nbiology modeling with machine learning, we propose a novel pathology steered\nstratification network (PSSN) that incorporates established domain knowledge in\nAD pathology through a reaction-diffusion model, where we consider non-linear\ninteractions between major biomarkers and diffusion along brain structural\nnetwork. Trained on longitudinal multimodal neuroimaging data, the biological\nmodel predicts long-term trajectories that capture individual progression\npattern, filling in the gaps between sparse imaging data available. A deep\npredictive neural network is then built to exploit spatiotemporal dynamics,\nlink neurological examinations with clinical profiles, and generate subtype\nassignment probability on an individual basis. We further identify an\nevolutionary disease graph to quantify subtype transition probabilities through\nextensive simulations. Our stratification achieves superior performance in both\ninter-cluster heterogeneity and intra-cluster homogeneity of various clinical\nscores. Applying our approach to enriched samples of aging populations, we\nidentify six subtypes spanning AD spectrum, where each subtype exhibits a\ndistinctive biomarker pattern that is consistent with its clinical outcome.\nPSSN provides insights into pre-symptomatic diagnosis and practical guidance on\nclinical treatments, which may be further generalized to other\nneurodegenerative diseases.\n","authors":["Enze Xu","Jingwen Zhang","Jiadi Li","Qianqian Song","Defu Yang","Guorong Wu","Minghan Chen"],"pdf_url":"https://arxiv.org/pdf/2210.05880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13415v1","updated":"2023-08-25T14:55:38Z","published":"2023-08-25T14:55:38Z","title":"An investigation into the impact of deep learning model choice on sex\n and race bias in cardiac MR segmentation","summary":" In medical imaging, artificial intelligence (AI) is increasingly being used\nto automate routine tasks. However, these algorithms can exhibit and exacerbate\nbiases which lead to disparate performances between protected groups. We\ninvestigate the impact of model choice on how imbalances in subject sex and\nrace in training datasets affect AI-based cine cardiac magnetic resonance image\nsegmentation. We evaluate three convolutional neural network-based models and\none vision transformer model. We find significant sex bias in three of the four\nmodels and racial bias in all of the models. However, the severity and nature\nof the bias varies between the models, highlighting the importance of model\nchoice when attempting to train fair AI-based segmentation models for medical\nimaging tasks.\n","authors":["Tiarna Lee","Esther Puyol-Antón","Bram Ruijsink","Keana Aitcheson","Miaojing Shi","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2308.13415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10154v2","updated":"2023-08-25T14:48:38Z","published":"2023-08-20T04:01:30Z","title":"Resource-Adaptive Newton's Method for Distributed Learning","summary":" Distributed stochastic optimization methods based on Newton's method offer\nsignificant advantages over first-order methods by leveraging curvature\ninformation for improved performance. However, the practical applicability of\nNewton's method is hindered in large-scale and heterogeneous learning\nenvironments due to challenges such as high computation and communication costs\nassociated with the Hessian matrix, sub-model diversity, staleness in training,\nand data heterogeneity. To address these challenges, this paper introduces a\nnovel and efficient algorithm called RANL, which overcomes the limitations of\nNewton's method by employing a simple Hessian initialization and adaptive\nassignments of training regions. The algorithm demonstrates impressive\nconvergence properties, which are rigorously analyzed under standard\nassumptions in stochastic optimization. The theoretical analysis establishes\nthat RANL achieves a linear convergence rate while effectively adapting to\navailable resources and maintaining high efficiency. Unlike traditional\nfirst-order methods, RANL exhibits remarkable independence from the condition\nnumber of the problem and eliminates the need for complex parameter tuning.\nThese advantages make RANL a promising approach for distributed stochastic\noptimization in practical scenarios.\n","authors":["Shuzhen Chen","Yuan Yuan","Youming Tao","Zhipeng Cai","Dongxiao Yu"],"pdf_url":"https://arxiv.org/pdf/2308.10154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02343v3","updated":"2023-08-25T14:44:06Z","published":"2023-02-05T09:12:07Z","title":"LExecutor: Learning-Guided Execution","summary":" Executing code is essential for various program analysis tasks, e.g., to\ndetect bugs that manifest through exceptions or to obtain execution traces for\nfurther dynamic analysis. However, executing an arbitrary piece of code is\noften difficult in practice, e.g., because of missing variable definitions,\nmissing user inputs, and missing third-party dependencies. This paper presents\nLExecutor, a learning-guided approach for executing arbitrary code snippets in\nan underconstrained way. The key idea is to let a neural model predict missing\nvalues that otherwise would cause the program to get stuck, and to inject these\nvalues into the execution. For example, LExecutor injects likely values for\notherwise undefined variables and likely return values of calls to otherwise\nmissing functions. We evaluate the approach on Python code from popular\nopen-source projects and on code snippets extracted from Stack Overflow. The\nneural model predicts realistic values with an accuracy between 79.5% and\n98.2%, allowing LExecutor to closely mimic real executions. As a result, the\napproach successfully executes significantly more code than any available\ntechnique, such as simply executing the code as-is. For example, executing the\nopen-source code snippets as-is covers only 4.1% of all lines, because the code\ncrashes early on, whereas LExecutor achieves a coverage of 51.6%.\n","authors":["Beatriz Souza","Michael Pradel"],"pdf_url":"https://arxiv.org/pdf/2302.02343v3.pdf","comment":"Accepted in research track of the ACM Joint European Software\n Engineering Conference and Symposium on the Foundations of Software\n Engineering (ESEC/FSE) 2023"},{"id":"http://arxiv.org/abs/2308.08163v2","updated":"2023-08-25T14:43:36Z","published":"2023-08-16T06:11:27Z","title":"Characteristics of networks generated by kernel growing neural gas","summary":" This research aims to develop kernel GNG, a kernelized version of the growing\nneural gas (GNG) algorithm, and to investigate the features of the networks\ngenerated by the kernel GNG. The GNG is an unsupervised artificial neural\nnetwork that can transform a dataset into an undirected graph, thereby\nextracting the features of the dataset as a graph. The GNG is widely used in\nvector quantization, clustering, and 3D graphics. Kernel methods are often used\nto map a dataset to feature space, with support vector machines being the most\nprominent application. This paper introduces the kernel GNG approach and\nexplores the characteristics of the networks generated by kernel GNG. Five\nkernels, including Gaussian, Laplacian, Cauchy, inverse multiquadric, and log\nkernels, are used in this study. The results of this study show that the\naverage degree and the average clustering coefficient decrease as the kernel\nparameter increases for Gaussian, Laplacian, Cauchy, and IMQ kernels. If we\navoid more edges and a higher clustering coefficient (or more triangles), the\nkernel GNG with a larger value of the parameter will be more appropriate.\n","authors":["Kazuhisa Fujita"],"pdf_url":"https://arxiv.org/pdf/2308.08163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13406v1","updated":"2023-08-25T14:33:59Z","published":"2023-08-25T14:33:59Z","title":"Using Visual and Vehicular Sensors for Driver Behavior Analysis: A\n Survey","summary":" Risky drivers account for 70% of fatal accidents in the United States. With\nrecent advances in sensors and intelligent vehicular systems, there has been\nsignificant research on assessing driver behavior to improve driving\nexperiences and road safety. This paper examines the various techniques used to\nanalyze driver behavior using visual and vehicular data, providing an overview\nof the latest research in this field. The paper also discusses the challenges\nand open problems in the field and offers potential recommendations for future\nresearch. The survey concludes that integrating vision and vehicular\ninformation can significantly enhance the accuracy and effectiveness of driver\nbehavior analysis, leading to improved safety measures and reduced traffic\naccidents.\n","authors":["Bikram Adhikari"],"pdf_url":"https://arxiv.org/pdf/2308.13406v1.pdf","comment":"10 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.06093v2","updated":"2023-08-25T14:30:45Z","published":"2023-08-11T12:05:12Z","title":"Experts Weights Averaging: A New General Training Scheme for Vision\n Transformers","summary":" Structural re-parameterization is a general training scheme for Convolutional\nNeural Networks (CNNs), which achieves performance improvement without\nincreasing inference cost. As Vision Transformers (ViTs) are gradually\nsurpassing CNNs in various visual tasks, one may question: if a training scheme\nspecifically for ViTs exists that can also achieve performance improvement\nwithout increasing inference cost? Recently, Mixture-of-Experts (MoE) has\nattracted increasing attention, as it can efficiently scale up the capacity of\nTransformers at a fixed cost through sparsely activated experts. Considering\nthat MoE can also be viewed as a multi-branch structure, can we utilize MoE to\nimplement a ViT training scheme similar to structural re-parameterization? In\nthis paper, we affirmatively answer these questions, with a new general\ntraining strategy for ViTs. Specifically, we decouple the training and\ninference phases of ViTs. During training, we replace some Feed-Forward\nNetworks (FFNs) of the ViT with specially designed, more efficient MoEs that\nassign tokens to experts by random uniform partition, and perform Experts\nWeights Averaging (EWA) on these MoEs at the end of each iteration. After\ntraining, we convert each MoE into an FFN by averaging the experts,\ntransforming the model back into original ViT for inference. We further provide\na theoretical analysis to show why and how it works. Comprehensive experiments\nacross various 2D and 3D visual tasks, ViT architectures, and datasets validate\nthe effectiveness and generalizability of the proposed training scheme.\nBesides, our training scheme can also be applied to improve performance when\nfine-tuning ViTs. Lastly, but equally important, the proposed EWA technique can\nsignificantly improve the effectiveness of naive MoE in various 2D visual small\ndatasets and 3D visual tasks.\n","authors":["Yongqi Huang","Peng Ye","Xiaoshui Huang","Sheng Li","Tao Chen","Tong He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2308.06093v2.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2302.14833v2","updated":"2023-08-25T14:28:20Z","published":"2023-02-28T18:31:07Z","title":"Learning to Control Autonomous Fleets from Observation via Offline\n Reinforcement Learning","summary":" Autonomous Mobility-on-Demand (AMoD) systems are an evolving mode of\ntransportation in which a centrally coordinated fleet of self-driving vehicles\ndynamically serves travel requests. The control of these systems is typically\nformulated as a large network optimization problem, and reinforcement learning\n(RL) has recently emerged as a promising approach to solve the open challenges\nin this space. Recent centralized RL approaches focus on learning from online\ndata, ignoring the per-sample-cost of interactions within real-world\ntransportation systems. To address these limitations, we propose to formalize\nthe control of AMoD systems through the lens of offline reinforcement learning\nand learn effective control strategies using solely offline data, which is\nreadily available to current mobility operators. We further investigate design\ndecisions and provide empirical evidence based on data from real-world mobility\nsystems showing how offline learning allows to recover AMoD control policies\nthat (i) exhibit performance on par with online methods, (ii) allow for\nsample-efficient online fine-tuning and (iii) eliminate the need for complex\nsimulation environments. Crucially, this paper demonstrates that offline RL is\na promising paradigm for the application of RL-based solutions within\neconomically-critical systems, such as mobility systems.\n","authors":["Carolin Schmidt","Daniele Gammelli","Francisco Camara Pereira","Filipe Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2302.14833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13399v1","updated":"2023-08-25T14:23:40Z","published":"2023-08-25T14:23:40Z","title":"EntropyRank: Unsupervised Keyphrase Extraction via Side-Information\n Optimization for Language Model-based Text Compression","summary":" We propose an unsupervised method to extract keywords and keyphrases from\ntexts based on a pre-trained language model (LM) and Shannon's information\nmaximization. Specifically, our method extracts phrases having the highest\nconditional entropy under the LM. The resulting set of keyphrases turns out to\nsolve a relevant information-theoretic problem: if provided as side\ninformation, it leads to the expected minimal binary code length in compressing\nthe text using the LM and an entropy encoder. Alternately, the resulting set is\nan approximation via a causal LM to the set of phrases that minimize the\nentropy of the text when conditioned upon it. Empirically, the method provides\nresults comparable to the most commonly used methods in various keyphrase\nextraction benchmark challenges.\n","authors":["Alexander Tsvetkov. Alon Kipnis"],"pdf_url":"https://arxiv.org/pdf/2308.13399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13386v1","updated":"2023-08-25T14:01:43Z","published":"2023-08-25T14:01:43Z","title":"TFDNet: Time-Frequency Enhanced Decomposed Network for Long-term Time\n Series Forecasting","summary":" Long-term time series forecasting is a vital task and has a wide range of\nreal applications. Recent methods focus on capturing the underlying patterns\nfrom one single domain (e.g. the time domain or the frequency domain), and have\nnot taken a holistic view to process long-term time series from the\ntime-frequency domains. In this paper, we propose a Time-Frequency Enhanced\nDecomposed Network (TFDNet) to capture both the long-term underlying patterns\nand temporal periodicity from the time-frequency domain. In TFDNet, we devise a\nmulti-scale time-frequency enhanced encoder backbone and develop two separate\ntrend and seasonal time-frequency blocks to capture the distinct patterns\nwithin the decomposed trend and seasonal components in multi-resolutions.\nDiverse kernel learning strategies of the kernel operations in time-frequency\nblocks have been explored, by investigating and incorporating the potential\ndifferent channel-wise correlation patterns of multivariate time series.\nExperimental evaluation of eight datasets from five benchmark domains\ndemonstrated that TFDNet is superior to state-of-the-art approaches in both\neffectiveness and efficiency.\n","authors":["Yuxiao Luo","Ziyu Lyu","Xingyu Huang"],"pdf_url":"https://arxiv.org/pdf/2308.13386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13380v1","updated":"2023-08-25T13:50:17Z","published":"2023-08-25T13:50:17Z","title":"In-context learning for model-free system identification","summary":" In traditional system identification, we estimate a model of an unknown\ndynamical system based on given input/output sequences and available physical\nknowledge. Yet, is it also possible to understand the intricacies of dynamical\nsystems not solely from their input/output patterns, but by observing the\nbehavior of other systems within the same class? This central question drives\nthe study presented in this paper.\n In response to this query, we introduce a novel paradigm for system\nidentification, addressing two primary tasks: one-step-ahead prediction and\nmulti-step simulation. Unlike conventional methods, we do not directly estimate\na model for the specific system. Instead, we pretrain a meta model that\nrepresents a class of dynamical systems. This meta model is trained from a\npotentially infinite stream of synthetic data, generated by systems randomly\nextracted from a certain distribution. At its core, the meta model serves as an\nimplicit representation of the main characteristics of a class of dynamical\nsystems. When provided with a brief context from a new system - specifically, a\nshort input/output sequence - the meta model implicitly discerns its dynamics,\nenabling predictions of its behavior.\n The proposed approach harnesses the power of Transformer architectures,\nrenowned for their in-context learning capabilities in Natural Language\nProcessing tasks. For one-step prediction, a GPT-like decoder-only architecture\nis utilized, whereas the simulation problem employs an encoder-decoder\nstructure.\n Initial experimental results affirmatively answer our foundational question,\nopening doors to fresh research avenues in system identification.\n","authors":["Marco Forgione","Filippo Pura","Dario Piga"],"pdf_url":"https://arxiv.org/pdf/2308.13380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.06074v3","updated":"2023-08-25T13:48:53Z","published":"2021-12-11T21:28:50Z","title":"Early Stopping for Deep Image Prior","summary":" Deep image prior (DIP) and its variants have showed remarkable potential for\nsolving inverse problems in computer vision, without any extra training data.\nPractical DIP models are often substantially overparameterized. During the\nfitting process, these models learn mostly the desired visual content first,\nand then pick up the potential modeling and observational noise, i.e.,\noverfitting. Thus, the practicality of DIP often depends critically on good\nearly stopping (ES) that captures the transition period. In this regard, the\nmajority of DIP works for vision tasks only demonstrates the potential of the\nmodels -- reporting the peak performance against the ground truth, but provides\nno clue about how to operationally obtain near-peak performance without access\nto the groundtruth. In this paper, we set to break this practicality barrier of\nDIP, and propose an efficient ES strategy, which consistently detects near-peak\nperformance across several vision tasks and DIP variants. Based on a simple\nmeasure of dispersion of consecutive DIP reconstructions, our ES method not\nonly outpaces the existing ones -- which only work in very narrow domains, but\nalso remains effective when combined with a number of methods that try to\nmitigate the overfitting. The code is available at\nhttps://github.com/sun-umn/Early_Stopping_for_DIP.\n","authors":["Hengkang Wang","Taihui Li","Zhong Zhuang","Tiancong Chen","Hengyue Liang","Ju Sun"],"pdf_url":"https://arxiv.org/pdf/2112.06074v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13371v1","updated":"2023-08-25T13:32:28Z","published":"2023-08-25T13:32:28Z","title":"EOG Artifact Removal from Single and Multi-channel EEG Recordings\n through the combination of Long Short-Term Memory Networks and Independent\n Component Analysis","summary":" Introduction: Electroencephalogram (EEG) signals have gained significant\npopularity in various applications due to their rich information content.\nHowever, these signals are prone to contamination from various sources of\nartifacts, notably the electrooculogram (EOG) artifacts caused by eye\nmovements. The most effective approach to mitigate EOG artifacts involves\nrecording EOG signals simultaneously with EEG and employing blind source\nseparation techniques, such as independent component analysis (ICA).\nNevertheless, the availability of EOG recordings is not always feasible,\nparticularly in pre-recorded datasets. Objective: In this paper, we present a\nnovel methodology that combines a long short-term memory (LSTM)-based neural\nnetwork with ICA to address the challenge of EOG artifact removal from\ncontaminated EEG signals. Approach: Our approach aims to accomplish two primary\nobjectives: 1) estimate the horizontal and vertical EOG signals from the\ncontaminated EEG data, and 2) employ ICA to eliminate the estimated EOG signals\nfrom the EEG, thereby producing an artifact-free EEG signal. Main results: To\nevaluate the performance of our proposed method, we conducted experiments on a\npublicly available dataset comprising recordings from 27 participants. We\nemployed well-established metrics such as mean squared error, mean absolute\nerror, and mean error to assess the quality of our artifact removal technique.\nSignificance: Furthermore, we compared the performance of our approach with two\nstate-of-the-art deep learning-based methods reported in the literature,\ndemonstrating the superior performance of our proposed methodology.\n","authors":["Behrad TaghiBeyglou","Fatemeh Bagheri"],"pdf_url":"https://arxiv.org/pdf/2308.13371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13850v2","updated":"2023-08-25T13:31:36Z","published":"2023-03-24T08:17:31Z","title":"Towards Learning and Explaining Indirect Causal Effects in Neural\n Networks","summary":" Recently, there has been a growing interest in learning and explaining causal\neffects within Neural Network (NN) models. By virtue of NN architectures,\nprevious approaches consider only direct and total causal effects assuming\nindependence among input variables. We view an NN as a structural causal model\n(SCM) and extend our focus to include indirect causal effects by introducing\nfeedforward connections among input neurons. We propose an ante-hoc method that\ncaptures and maintains direct, indirect, and total causal effects during NN\nmodel training. We also propose an algorithm for quantifying learned causal\neffects in an NN model and efficient approximation strategies for quantifying\ncausal effects in high-dimensional data. Extensive experiments conducted on\nsynthetic and real-world datasets demonstrate that the causal effects learned\nby our ante-hoc method better approximate the ground truth effects compared to\nexisting methods.\n","authors":["Abbaavaram Gowtham Reddy","Saketh Bachu","Harsharaj Pathak","Benin L Godfrey","Vineeth N. Balasubramanian","Varshaneya V","Satya Narayanan Kar"],"pdf_url":"https://arxiv.org/pdf/2303.13850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.12462v4","updated":"2023-08-25T13:31:09Z","published":"2020-09-25T22:41:04Z","title":"Symbolic Relational Deep Reinforcement Learning based on Graph Neural\n Networks and Autoregressive Policy Decomposition","summary":" We focus on reinforcement learning (RL) in relational problems that are\nnaturally defined in terms of objects, their relations, and object-centric\nactions. These problems are characterized by variable state and action spaces,\nand finding a fixed-length representation, required by most existing RL\nmethods, is difficult, if not impossible. We present a deep RL framework based\non graph neural networks and auto-regressive policy decomposition that\nnaturally works with these problems and is completely domain-independent. We\ndemonstrate the framework's broad applicability in three distinct domains and\nshow impressive zero-shot generalization over different problem sizes.\n","authors":["Jaromír Janisch","Tomáš Pevný","Viliam Lisý"],"pdf_url":"https://arxiv.org/pdf/2009.12462v4.pdf","comment":"code available at https://github.com/jaromiru/sr-drl"},{"id":"http://arxiv.org/abs/2301.00666v2","updated":"2023-08-25T13:27:10Z","published":"2022-12-30T00:31:58Z","title":"E-commerce users' preferences for delivery options","summary":" Many e-commerce marketplaces offer their users fast delivery options for free\nto meet the increasing needs of users, imposing an excessive burden on city\nlogistics. Therefore, understanding e-commerce users' preference for delivery\noptions is a key to designing logistics policies. To this end, this study\ndesigns a stated choice survey in which respondents are faced with choice tasks\namong different delivery options and time slots, which was completed by 4,062\nusers from the three major metropolitan areas in Japan. To analyze the data,\nmixed logit models capturing taste heterogeneity as well as flexible\nsubstitution patterns have been estimated. The model estimation results\nindicate that delivery attributes including fee, time, and time slot size are\nsignificant determinants of the delivery option choices. Associations between\nusers' preferences and socio-demographic characteristics, such as age, gender,\nteleworking frequency and the presence of a delivery box, were also suggested.\nMoreover, we analyzed two willingness-to-pay measures for delivery, namely, the\nvalue of delivery time savings (VODT) and the value of time slot shortening\n(VOTS), and applied a non-semiparametric approach to estimate their\ndistributions in a data-oriented manner. Although VODT has a large\nheterogeneity among respondents, the estimated median VODT is 25.6 JPY/day,\nimplying that more than half of the respondents would wait an additional day if\nthe delivery fee were increased by only 26 JPY, that is, they do not\nnecessarily need a fast delivery option but often request it when cheap or\nalmost free. Moreover, VOTS was found to be low, distributed with the median of\n5.0 JPY/hour; that is, users do not highly value the reduction in time slot\nsize in monetary terms. These findings on e-commerce users' preferences can\nhelp in designing levels of service for last-mile delivery to significantly\nimprove its efficiency.\n","authors":["Yuki Oyama","Daisuke Fukuda","Naoto Imura","Katsuhiro Nishinari"],"pdf_url":"https://arxiv.org/pdf/2301.00666v2.pdf","comment":"Section 1 needs to be rewritten"},{"id":"http://arxiv.org/abs/2308.13357v1","updated":"2023-08-25T13:06:13Z","published":"2023-08-25T13:06:13Z","title":"A topological model for partial equivariance in deep learning and data\n analysis","summary":" In this article, we propose a topological model to encode partial\nequivariance in neural networks. To this end, we introduce a class of\noperators, called P-GENEOs, that change data expressed by measurements,\nrespecting the action of certain sets of transformations, in a non-expansive\nway. If the set of transformations acting is a group, then we obtain the\nso-called GENEOs. We then study the spaces of measurements, whose domains are\nsubject to the action of certain self-maps, and the space of P-GENEOs between\nthese spaces. We define pseudo-metrics on them and show some properties of the\nresulting spaces. In particular, we show how such spaces have convenient\napproximation and convexity properties.\n","authors":["Lucia Ferrari","Patrizio Frosini","Nicola Quercioli","Francesca Tombari"],"pdf_url":"https://arxiv.org/pdf/2308.13357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13354v1","updated":"2023-08-25T12:57:59Z","published":"2023-08-25T12:57:59Z","title":"On the Impact of Language Selection for Training and Evaluating\n Programming Language Models","summary":" The recent advancements in Transformer-based Language Models have\ndemonstrated significant potential in enhancing the multilingual capabilities\nof these models. The remarkable progress made in this domain not only applies\nto natural language tasks but also extends to the domain of programming\nlanguages. Despite the ability of these models to learn from multiple\nlanguages, evaluations typically focus on particular combinations of the same\nlanguages. In this study, we evaluate the similarity of programming languages\nby analyzing their representations using a CodeBERT-based model. Our\nexperiments reveal that token representation in languages such as C++, Python,\nand Java exhibit proximity to one another, whereas the same tokens in languages\nsuch as Mathematica and R display significant dissimilarity. Our findings\nsuggest that this phenomenon can potentially result in performance challenges\nwhen dealing with diverse languages. Thus, we recommend using our similarity\nmeasure to select a diverse set of programming languages when training and\nevaluating future models.\n","authors":["Jonathan Katzy","Maliheh Izadi","Arie van Deursen"],"pdf_url":"https://arxiv.org/pdf/2308.13354v1.pdf","comment":"Accepted to 2023 IEEE 23rd International Working Conference on Source\n Code Analysis and Manipulation (SCAM), NIER track"},{"id":"http://arxiv.org/abs/2308.13352v1","updated":"2023-08-25T12:47:59Z","published":"2023-08-25T12:47:59Z","title":"A Generic Machine Learning Framework for Fully-Unsupervised Anomaly\n Detection with Contaminated Data","summary":" Anomaly detection (AD) tasks have been solved using machine learning\nalgorithms in various domains and applications. The great majority of these\nalgorithms use normal data to train a residual-based model, and assign anomaly\nscores to unseen samples based on their dissimilarity with the learned normal\nregime. The underlying assumption of these approaches is that anomaly-free data\nis available for training. This is, however, often not the case in real-world\noperational settings, where the training data may be contaminated with a\ncertain fraction of abnormal samples. Training with contaminated data, in turn,\ninevitably leads to a deteriorated AD performance of the residual-based\nalgorithms.\n In this paper we introduce a framework for a fully unsupervised refinement of\ncontaminated training data for AD tasks. The framework is generic and can be\napplied to any residual-based machine learning model. We demonstrate the\napplication of the framework to two public datasets of multivariate time series\nmachine data from different application fields. We show its clear superiority\nover the naive approach of training with contaminated data without refinement.\nMoreover, we compare it to the ideal, unrealistic reference in which\nanomaly-free data would be available for training. Since the approach exploits\ninformation from the anomalies, and not only from the normal regime, it is\ncomparable and often outperforms the ideal baseline as well.\n","authors":["Markus Ulmer","Jannik Zgraggen","Lilach Goren Huber"],"pdf_url":"https://arxiv.org/pdf/2308.13352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07221v6","updated":"2023-08-25T12:33:22Z","published":"2023-08-14T15:47:25Z","title":"AudioFormer: Audio Transformer learns audio feature representations from\n discrete acoustic codes","summary":" We propose a method named AudioFormer,which learns audio feature\nrepresentations through the acquisition of discrete acoustic codes and\nsubsequently fine-tunes them for audio classification tasks. Initially,we\nintroduce a novel perspective by considering the audio classification task as a\nform of natural language understanding (NLU). Leveraging an existing neural\naudio codec model,we generate discrete acoustic codes and utilize them to train\na masked language model (MLM),thereby obtaining audio feature representations.\nFurthermore,we pioneer the integration of a Multi-Positive sample Contrastive\n(MPC) learning approach. This method enables the learning of joint\nrepresentations among multiple discrete acoustic codes within the same audio\ninput. In our experiments,we treat discrete acoustic codes as textual data and\ntrain a masked language model using a cloze-like methodology,ultimately\nderiving high-quality audio representations. Notably,the MPC learning technique\neffectively captures collaborative representations among distinct positive\nsamples. Our research outcomes demonstrate that AudioFormer attains\nsignificantly improved performance compared to prevailing monomodal audio\nclassification models across multiple datasets,and even outperforms\naudio-visual multimodal classification models on select datasets.\nSpecifically,our approach achieves remarkable results on datasets including\nAudioSet (2M,20K),and FSD50K,with performance scores of 53.9,45.1,and\n65.6,respectively. We have openly shared both the code and models:\nhttps://github.com/LZH-0225/AudioFormer.git.\n","authors":["Zhaohui Li","Haitao Wang","Xinghua Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.07221v6.pdf","comment":"Need to supplement more detailed experiments"},{"id":"http://arxiv.org/abs/2308.13328v1","updated":"2023-08-25T12:02:13Z","published":"2023-08-25T12:02:13Z","title":"Compressor-Based Classification for Atrial Fibrillation Detection","summary":" Atrial fibrillation (AF) is one of the most common arrhythmias with\nchallenging public health implications. Automatic detection of AF episodes is\ntherefore one of the most important tasks in biomedical engineering. In this\npaper, we apply the recently introduced method of compressor-based text\nclassification to the task of AF detection (binary classification between heart\nrhythms). We investigate the normalised compression distance applied to\n$\\Delta$RR and RR-interval sequences, the configuration of the k-Nearest\nNeighbour classifier, and an optimal window length. We achieve good\nclassification results (avg. sensitivity = 97.1%, avg. specificity = 91.7%,\nbest sensitivity of 99.8%, best specificity of 97.6% with 5-fold\ncross-validation). Obtained performance is close to the best specialised AF\ndetection algorithms. Our results suggest that gzip classification, originally\nproposed for texts, is suitable for biomedical data and continuous stochastic\nsequences in general.\n","authors":["Nikita Markov","Konstantin Ushenin","Yakov Bozhko","Olga Solovyova"],"pdf_url":"https://arxiv.org/pdf/2308.13328v1.pdf","comment":"This paper is sent for review at the IEEE conference, 2023"},{"id":"http://arxiv.org/abs/2211.12461v3","updated":"2023-08-25T11:53:01Z","published":"2022-11-22T18:19:10Z","title":"A Neural-Network-Based Convex Regularizer for Inverse Problems","summary":" The emergence of deep-learning-based methods to solve image-reconstruction\nproblems has enabled a significant increase in reconstruction quality.\nUnfortunately, these new methods often lack reliability and explainability, and\nthere is a growing interest to address these shortcomings while retaining the\nboost in performance. In this work, we tackle this issue by revisiting\nregularizers that are the sum of convex-ridge functions. The gradient of such\nregularizers is parameterized by a neural network that has a single hidden\nlayer with increasing and learnable activation functions. This neural network\nis trained within a few minutes as a multistep Gaussian denoiser. The numerical\nexperiments for denoising, CT, and MRI reconstruction show improvements over\nmethods that offer similar reliability guarantees.\n","authors":["Alexis Goujon","Sebastian Neumayer","Pakshal Bohra","Stanislas Ducotterd","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2211.12461v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13320v1","updated":"2023-08-25T11:49:51Z","published":"2023-08-25T11:49:51Z","title":"Fine-tuning can cripple your foundation model; preserving features may\n be the solution","summary":" Pre-trained foundation models, owing primarily to their enormous capacity and\nexposure to vast amount of training data scraped from the internet, enjoy the\nadvantage of storing knowledge about plenty of real-world concepts. Such models\nare typically fine-tuned on downstream datasets to produce remarkable\nstate-of-the-art performances. While various fine-tuning methods have been\ndevised and are shown to be highly effective, we observe that a fine-tuned\nmodel's ability to recognize concepts on tasks $\\textit{different}$ from the\ndownstream one is reduced significantly compared to its pre-trained\ncounterpart. This is clearly undesirable as a huge amount of time and money\nwent into learning those very concepts in the first place. We call this\nundesirable phenomenon \"concept forgetting\" and via experiments show that most\nend-to-end fine-tuning approaches suffer heavily from this side effect. To this\nend, we also propose a rather simple fix to this problem by designing a method\ncalled LDIFS (short for $\\ell_2$ distance in feature space) that simply\npreserves the features of the original foundation model during fine-tuning. We\nshow that LDIFS significantly reduces concept forgetting without having\nnoticeable impact on the downstream task performance.\n","authors":["Jishnu Mukhoti","Yarin Gal","Philip H. S. Torr","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2308.13320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13317v1","updated":"2023-08-25T11:41:05Z","published":"2023-08-25T11:41:05Z","title":"Transforming the Output of Generative Pre-trained Transformer: The\n Influence of the PGI Framework on Attention Dynamics","summary":" This paper presents a novel approach named Persona-Grouping-Intelligence\n(PGI), which has been crafted to tackle the challenges posed by GPT models when\napplied to real-world business issues. PGI leverages the inherent capabilities\nof the GPT model to comprehend intricate language structures and generate\nresponses that are contextually relevant. The experiment occurred in a business\nscenario where human intelligence was being underutilized due to less optimized\nbusiness processes. The primary objective of this approach is to leverage GPT\nmodels to reduce the workload on humans in tasks that are extensive,\nmonotonous, and repetitive. Instead, the focus is redirected toward\ndecision-making activities. Remarkably, the experiment yielded an accuracy rate\nof 93.81% in validating 4,000 responses generated by the model, underscoring\nthe effectiveness of the PGI strategies. Effectively addressing the issue of\nunderutilized human intelligence, this paradigm shift aligns business\nenvironments with dynamic machine intelligence, enabling them to navigate the\nintricacies of real-world challenges. This approach facilitates the practical\nutilization of these models to tackle actual problems. The methodology offers\nan opportunity to reshape the fundamental structure of business processes by\nseamlessly integrating human decision-making with adaptable machine\nintelligence. Consequently, this optimization enhances operational efficiency\nand elevates strategic decision-making across diverse business contexts.\n","authors":["Aline Ioste"],"pdf_url":"https://arxiv.org/pdf/2308.13317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13304v1","updated":"2023-08-25T11:04:35Z","published":"2023-08-25T11:04:35Z","title":"Bang and the Artefacts are Gone! Rapid Artefact Removal and Tissue\n Segmentation in Haematoxylin and Eosin Stained Biopsies","summary":" We present H&E Otsu thresholding, a scheme for rapidly detecting tissue in\nwhole-slide images (WSIs) that eliminates a wide range of undesirable artefacts\nsuch as pen marks and scanning artefacts. Our method involves obtaining a\nbid-modal representation of a low-magnification RGB overview image which\nenables simple Otsu thresholding to separate tissue from background and\nartefacts. We demonstrate our method on WSIs prepared from a wide range of\ninstitutions and WSI digital scanners, each containing substantial artefacts\nthat cause other methods to fail. The beauty of our approach lies in its\nsimplicity: manipulating RGB colour space and using Otsu thresholding allows\nfor the rapid removal of artefacts and segmentation of tissue.\n","authors":["B. A. Schreiber","J. Denholm","F. Jaeckle","M. J. Arends","K. M. Branson","C. -B. Schönlieb","E. J. Soilleux"],"pdf_url":"https://arxiv.org/pdf/2308.13304v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.13300v1","updated":"2023-08-25T10:51:02Z","published":"2023-08-25T10:51:02Z","title":"Learning Compact Neural Networks with Deep Overparameterised Multitask\n Learning","summary":" Compact neural network offers many benefits for real-world applications.\nHowever, it is usually challenging to train the compact neural networks with\nsmall parameter sizes and low computational costs to achieve the same or better\nmodel performance compared to more complex and powerful architecture. This is\nparticularly true for multitask learning, with different tasks competing for\nresources. We present a simple, efficient and effective multitask learning\noverparameterisation neural network design by overparameterising the model\narchitecture in training and sharing the overparameterised model parameters\nmore effectively across tasks, for better optimisation and generalisation.\nExperiments on two challenging multitask datasets (NYUv2 and COCO) demonstrate\nthe effectiveness of the proposed method across various convolutional networks\nand parameter sizes.\n","authors":["Shen Ren","Haosen Shi"],"pdf_url":"https://arxiv.org/pdf/2308.13300v1.pdf","comment":"Accepted for IJCAI2023 workshop, 1st International Workshop on\n Generalizing from Limited Resources in the Open World"},{"id":"http://arxiv.org/abs/2308.13298v1","updated":"2023-08-25T10:47:37Z","published":"2023-08-25T10:47:37Z","title":"Federated Linear Bandit Learning via Over-the-Air Computation","summary":" In this paper, we investigate federated contextual linear bandit learning\nwithin a wireless system that comprises a server and multiple devices. Each\ndevice interacts with the environment, selects an action based on the received\nreward, and sends model updates to the server. The primary objective is to\nminimize cumulative regret across all devices within a finite time horizon. To\nreduce the communication overhead, devices communicate with the server via\nover-the-air computation (AirComp) over noisy fading channels, where the\nchannel noise may distort the signals. In this context, we propose a customized\nfederated linear bandits scheme, where each device transmits an analog signal,\nand the server receives a superposition of these signals distorted by channel\nnoise. A rigorous mathematical analysis is conducted to determine the regret\nbound of the proposed scheme. Both theoretical analysis and numerical\nexperiments demonstrate the competitive performance of our proposed scheme in\nterms of regret bounds in various settings.\n","authors":["Jiali Wang","Yuning Jiang","Xin Liu","Ting Wang","Yuanming Shi"],"pdf_url":"https://arxiv.org/pdf/2308.13298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13294v1","updated":"2023-08-25T10:40:46Z","published":"2023-08-25T10:40:46Z","title":"Training normalizing flows with computationally intensive target\n probability distributions","summary":" Machine learning techniques, in particular the so-called normalizing flows,\nare becoming increasingly popular in the context of Monte Carlo simulations as\nthey can effectively approximate target probability distributions. In the case\nof lattice field theories (LFT) the target distribution is given by the\nexponential of the action. The common loss function's gradient estimator based\non the \"reparametrization trick\" requires the calculation of the derivative of\nthe action with respect to the fields. This can present a significant\ncomputational cost for complicated, non-local actions like e.g. fermionic\naction in QCD. In this contribution, we propose an estimator for normalizing\nflows based on the REINFORCE algorithm that avoids this issue. We apply it to\ntwo dimensional Schwinger model with Wilson fermions at criticality and show\nthat it is up to ten times faster in terms of the wall-clock time as well as\nrequiring up to $30\\%$ less memory than the reparameterization trick estimator.\nIt is also more numerically stable allowing for single precision calculations\nand the use of half-float tensor cores. We present an in-depth analysis of the\norigins of those improvements. We believe that these benefits will appear also\noutside the realm of the LFT, in each case where the target probability\ndistribution is computationally intensive.\n","authors":["Piotr Bialas","Piotr Korcyl","Tomasz Stebel"],"pdf_url":"https://arxiv.org/pdf/2308.13294v1.pdf","comment":"15 pages, 5 figures, 4 tables, 3 listings"},{"id":"http://arxiv.org/abs/2212.07585v2","updated":"2023-08-25T10:39:23Z","published":"2022-12-15T02:25:22Z","title":"Rethinking the Role of Pre-Trained Networks in Source-Free Domain\n Adaptation","summary":" Source-free domain adaptation (SFDA) aims to adapt a source model trained on\na fully-labeled source domain to an unlabeled target domain. Large-data\npre-trained networks are used to initialize source models during source\ntraining, and subsequently discarded. However, source training can cause the\nmodel to overfit to source data distribution and lose applicable target domain\nknowledge. We propose to integrate the pre-trained network into the target\nadaptation process as it has diversified features important for generalization\nand provides an alternate view of features and classification decisions\ndifferent from the source model. We propose to distil useful target domain\ninformation through a co-learning strategy to improve target pseudolabel\nquality for finetuning the source model. Evaluation on 4 benchmark datasets\nshow that our proposed strategy improves adaptation performance and can be\nsuccessfully integrated with existing SFDA methods. Leveraging modern\npre-trained networks that have stronger representation learning ability in the\nco-learning strategy further boosts performance.\n","authors":["Wenyu Zhang","Li Shen","Chuan-Sheng Foo"],"pdf_url":"https://arxiv.org/pdf/2212.07585v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13292v1","updated":"2023-08-25T10:33:44Z","published":"2023-08-25T10:33:44Z","title":"A Bayesian Active Learning Approach to Comparative Judgement","summary":" Assessment is a crucial part of education. Traditional marking is a source of\ninconsistencies and unconscious bias, placing a high cognitive load on the\nassessors. An approach to address these issues is comparative judgement (CJ).\nIn CJ, the assessor is presented with a pair of items and is asked to select\nthe better one. Following a series of comparisons, a rank is derived using a\nranking model, for example, the BTM, based on the results. While CJ is\nconsidered a reliable method for marking, there are concerns around\ntransparency, and the ideal number of pairwise comparisons to generate a\nreliable estimation of the rank order is not known. Additionally, there have\nbeen attempts to generate a method of selecting pairs that should be compared\nnext in an informative manner, but some existing methods are known to have\ncreated their own bias within results inflating the reliability metric used. As\na result, a random selection approach is usually deployed.\n We propose a novel Bayesian approach to CJ (BCJ) for determining the ranks of\ncompared items alongside a new way to select the pairs to present to the\nmarker(s) using active learning (AL), addressing the key shortcomings of\ntraditional CJ. Furthermore, we demonstrate how the entire approach may provide\ntransparency by providing the user insights into how it is making its decisions\nand, at the same time, being more efficient. Results from our experiments\nconfirm that the proposed BCJ combined with entropy-driven AL pair-selection\nmethod is superior to other alternatives. We also find that the more\ncomparisons done, the more accurate BCJ becomes, which solves the issue the\ncurrent method has of the model deteriorating if too many comparisons are\nperformed. As our approach can generate the complete predicted rank\ndistribution for an item, we also show how this can be utilised in devising a\npredicted grade, guided by the assessor.\n","authors":["Andy Gray","Alma Rahat","Tom Crick","Stephen Lindsay","Darren Wallace"],"pdf_url":"https://arxiv.org/pdf/2308.13292v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.13289v1","updated":"2023-08-25T10:26:43Z","published":"2023-08-25T10:26:43Z","title":"JAX-LOB: A GPU-Accelerated limit order book simulator to unlock large\n scale reinforcement learning for trading","summary":" Financial exchanges across the world use limit order books (LOBs) to process\norders and match trades. For research purposes it is important to have large\nscale efficient simulators of LOB dynamics. LOB simulators have previously been\nimplemented in the context of agent-based models (ABMs), reinforcement learning\n(RL) environments, and generative models, processing order flows from\nhistorical data sets and hand-crafted agents alike. For many applications,\nthere is a requirement for processing multiple books, either for the\ncalibration of ABMs or for the training of RL agents. We showcase the first\nGPU-enabled LOB simulator designed to process thousands of books in parallel,\nwith a notably reduced per-message processing time. The implementation of our\nsimulator - JAX-LOB - is based on design choices that aim to best exploit the\npowers of JAX without compromising on the realism of LOB-related mechanisms. We\nintegrate JAX-LOB with other JAX packages, to provide an example of how one may\naddress an optimal execution problem with reinforcement learning, and to share\nsome preliminary results from end-to-end RL training on GPUs.\n","authors":["Sascha Frey","Kang Li","Peer Nagy","Silvia Sapora","Chris Lu","Stefan Zohren","Jakob Foerster","Anisoara Calinescu"],"pdf_url":"https://arxiv.org/pdf/2308.13289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13280v1","updated":"2023-08-25T10:02:26Z","published":"2023-08-25T10:02:26Z","title":"AtmoRep: A stochastic model of atmosphere dynamics using large scale\n representation learning","summary":" The atmosphere affects humans in a multitude of ways, from loss of life due\nto adverse weather effects to long-term social and economic impacts on\nsocieties. Computer simulations of atmospheric dynamics are, therefore, of\ngreat importance for the well-being of our and future generations. Here, we\npropose AtmoRep, a novel, task-independent stochastic computer model of\natmospheric dynamics that can provide skillful results for a wide range of\napplications. AtmoRep uses large-scale representation learning from artificial\nintelligence to determine a general description of the highly complex,\nstochastic dynamics of the atmosphere from the best available estimate of the\nsystem's historical trajectory as constrained by observations. This is enabled\nby a novel self-supervised learning objective and a unique ensemble that\nsamples from the stochastic model with a variability informed by the one in the\nhistorical record. The task-independent nature of AtmoRep enables skillful\nresults for a diverse set of applications without specifically training for\nthem and we demonstrate this for nowcasting, temporal interpolation, model\ncorrection, and counterfactuals. We also show that AtmoRep can be improved with\nadditional data, for example radar observations, and that it can be extended to\ntasks such as downscaling. Our work establishes that large-scale neural\nnetworks can provide skillful, task-independent models of atmospheric dynamics.\nWith this, they provide a novel means to make the large record of atmospheric\nobservations accessible for applications and for scientific inquiry,\ncomplementing existing simulations based on first principles.\n","authors":["Christian Lessig","Ilaria Luise","Bing Gong","Michael Langguth","Scarlet Stadler","Martin Schultz"],"pdf_url":"https://arxiv.org/pdf/2308.13280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13279v1","updated":"2023-08-25T10:01:53Z","published":"2023-08-25T10:01:53Z","title":"Hyperbolic Random Forests","summary":" Hyperbolic space is becoming a popular choice for representing data due to\nthe hierarchical structure - whether implicit or explicit - of many real-world\ndatasets. Along with it comes a need for algorithms capable of solving\nfundamental tasks, such as classification, in hyperbolic space. Recently,\nmultiple papers have investigated hyperbolic alternatives to hyperplane-based\nclassifiers, such as logistic regression and SVMs. While effective, these\napproaches struggle with more complex hierarchical data. We, therefore, propose\nto generalize the well-known random forests to hyperbolic space. We do this by\nredefining the notion of a split using horospheres. Since finding the globally\noptimal split is computationally intractable, we find candidate horospheres\nthrough a large-margin classifier. To make hyperbolic random forests work on\nmulti-class data and imbalanced experiments, we furthermore outline a new\nmethod for combining classes based on their lowest common ancestor and a\nclass-balanced version of the large-margin loss. Experiments on standard and\nnew benchmarks show that our approach outperforms both conventional random\nforest algorithms and recent hyperbolic classifiers.\n","authors":["Lars Doorenbos","Pablo Márquez-Neila","Raphael Sznitman","Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2308.13279v1.pdf","comment":"Code available at https://github.com/LarsDoorenbos/HoroRF"},{"id":"http://arxiv.org/abs/2308.13278v1","updated":"2023-08-25T10:00:06Z","published":"2023-08-25T10:00:06Z","title":"Integrating LLMs and Decision Transformers for Language Grounded\n Generative Quality-Diversity","summary":" Quality-Diversity is a branch of stochastic optimization that is often\napplied to problems from the Reinforcement Learning and control domains in\norder to construct repertoires of well-performing policies/skills that exhibit\ndiversity with respect to a behavior space. Such archives are usually composed\nof a finite number of reactive agents which are each associated to a unique\nbehavior descriptor, and instantiating behavior descriptors outside of that\ncoarsely discretized space is not straight-forward. While a few recent works\nsuggest solutions to that issue, the trajectory that is generated is not easily\ncustomizable beyond the specification of a target behavior descriptor. We\npropose to jointly solve those problems in environments where semantic\ninformation about static scene elements is available by leveraging a Large\nLanguage Model to augment the repertoire with natural language descriptions of\ntrajectories, and training a policy conditioned on those descriptions. Thus,\nour method allows a user to not only specify an arbitrary target behavior\ndescriptor, but also provide the model with a high-level textual prompt to\nshape the generated trajectory. We also propose an LLM-based approach to\nevaluating the performance of such generative agents. Furthermore, we develop a\nbenchmark based on simulated robot navigation in a 2d maze that we use for\nexperimental validation.\n","authors":["Achkan Salehi","Stephane Doncieux"],"pdf_url":"https://arxiv.org/pdf/2308.13278v1.pdf","comment":"16 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.13269v1","updated":"2023-08-25T09:42:54Z","published":"2023-08-25T09:42:54Z","title":"Heterogeneous Decentralized Machine Unlearning with Seed Model\n Distillation","summary":" As some recent information security legislation endowed users with\nunconditional rights to be forgotten by any trained machine learning model,\npersonalized IoT service providers have to put unlearning functionality into\ntheir consideration. The most straightforward method to unlearn users'\ncontribution is to retrain the model from the initial state, which is not\nrealistic in high throughput applications with frequent unlearning requests.\nThough some machine unlearning frameworks have been proposed to speed up the\nretraining process, they fail to match decentralized learning scenarios. In\nthis paper, we design a decentralized unlearning framework called HDUS, which\nuses distilled seed models to construct erasable ensembles for all clients.\nMoreover, the framework is compatible with heterogeneous on-device models,\nrepresenting stronger scalability in real-world applications. Extensive\nexperiments on three real-world datasets show that our HDUS achieves\nstate-of-the-art performance.\n","authors":["Guanhua Ye","Guanhua Ye","Quoc Viet Hung Nguyen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2308.13269v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13265v1","updated":"2023-08-25T09:37:02Z","published":"2023-08-25T09:37:02Z","title":"Heterogeneous Federated Learning via Personalized Generative Networks","summary":" Federated Learning (FL) allows several clients to construct a common global\nmachine-learning model without having to share their data. FL, however, faces\nthe challenge of statistical heterogeneity between the client's data, which\ndegrades performance and slows down the convergence toward the global model. In\nthis paper, we provide theoretical proof that minimizing heterogeneity between\nclients facilitates the convergence of a global model for every single client.\nThis becomes particularly important under empirical concept shifts among\nclients, rather than merely considering imbalanced classes, which have been\nstudied until now. Therefore, we propose a method for knowledge transfer\nbetween clients where the server trains client-specific generators. Each\ngenerator generates samples for the corresponding client to remove the conflict\nwith other clients' models. Experiments conducted on synthetic and real data,\nalong with a theoretical study, support the effectiveness of our method in\nconstructing a well-generalizable global model by reducing the conflict between\nlocal models.\n","authors":["Zahra Taghiyarrenani","Abdallah Abdallah","Slawomir Nowaczyk","Sepideh Pashami"],"pdf_url":"https://arxiv.org/pdf/2308.13265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11789v4","updated":"2023-08-25T09:35:30Z","published":"2023-03-20T08:37:08Z","title":"Random Inverse Problems Over Graphs: Decentralized Online Learning","summary":" We establish a framework of distributed random inverse problems over network\ngraphs with online measurements, and propose a decentralized online learning\nalgorithm. This unifies the distributed parameter estimation in Hilbert spaces\nand the least mean square problem in reproducing kernel Hilbert spaces\n(RKHS-LMS). We transform the convergence of the algorithm into the asymptotic\nstability of a class of inhomogeneous random difference equations in Hilbert\nspaces with L2-bounded martingale difference terms and develop the L2\n-asymptotic stability theory in Hilbert spaces. It is shown that if the network\ngraph is connected and the sequence of forward operators satisfies the\ninfinite-dimensional spatio-temporal persistence of excitation condition, then\nthe estimates of all nodes are mean square and almost surely strongly\nconsistent. Moreover, we propose a decentralized online learning algorithm in\nRKHS based on non-stationary and non-independent online data streams, and prove\nthat the algorithm is mean square and almost surely strongly consistent if the\noperators induced by the random input data satisfy the infinite-dimensional\nspatio-temporal persistence of excitation condition.\n","authors":["Tao Li","Xiwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.11789v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12686v2","updated":"2023-08-25T09:12:02Z","published":"2023-08-24T09:57:11Z","title":"Match-And-Deform: Time Series Domain Adaptation through Optimal\n Transport and Temporal Alignment","summary":" While large volumes of unlabeled data are usually available, associated\nlabels are often scarce. The unsupervised domain adaptation problem aims at\nexploiting labels from a source domain to classify data from a related, yet\ndifferent, target domain. When time series are at stake, new difficulties arise\nas temporal shifts may appear in addition to the standard feature distribution\nshift. In this paper, we introduce the Match-And-Deform (MAD) approach that\naims at finding correspondences between the source and target time series while\nallowing temporal distortions. The associated optimization problem\nsimultaneously aligns the series thanks to an optimal transport loss and the\ntime stamps through dynamic time warping. When embedded into a deep neural\nnetwork, MAD helps learning new representations of time series that both align\nthe domains and maximize the discriminative power of the network. Empirical\nstudies on benchmark datasets and remote sensing data demonstrate that MAD\nmakes meaningful sample-to-sample pairing and time shift estimation, reaching\nsimilar or better classification performance than state-of-the-art deep time\nseries domain adaptation strategies.\n","authors":["François Painblanc","Laetitia Chapel","Nicolas Courty","Chloé Friguet","Charlotte Pelletier","Romain Tavenard"],"pdf_url":"https://arxiv.org/pdf/2308.12686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00015v2","updated":"2023-08-25T09:06:36Z","published":"2023-03-30T10:35:00Z","title":"DRIP: Deep Regularizers for Inverse Problems","summary":" In this paper we consider inverse problems that are mathematically ill-posed.\nThat is, given some (noisy) data, there is more than one solution that\napproximately fits the data. In recent years, deep neural techniques that find\nthe most appropriate solution, in the sense that it contains a-priori\ninformation, were developed. However, they suffer from several shortcomings.\nFirst, most techniques cannot guarantee that the solution fits the data at\ninference. Second, while the derivation of the techniques is inspired by the\nexistence of a valid scalar regularization function, such techniques do not in\npractice rely on such a function, and therefore veer away from classical\nvariational techniques. In this work we introduce a new family of neural\nregularizers for the solution of inverse problems. These regularizers are based\non a variational formulation and are guaranteed to fit the data. We demonstrate\ntheir use on a number of highly ill-posed problems, from image deblurring to\nlimited angle tomography.\n","authors":["Moshe Eliasof","Eldad Haber","Eran Treister"],"pdf_url":"https://arxiv.org/pdf/2304.00015v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08915v2","updated":"2023-08-25T09:01:23Z","published":"2023-08-17T11:00:01Z","title":"Beyond Sharing: Conflict-Aware Multivariate Time Series Anomaly\n Detection","summary":" Massive key performance indicators (KPIs) are monitored as multivariate time\nseries data (MTS) to ensure the reliability of the software applications and\nservice system. Accurately detecting the abnormality of MTS is very critical\nfor subsequent fault elimination. The scarcity of anomalies and manual labeling\nhas led to the development of various self-supervised MTS anomaly detection\n(AD) methods, which optimize an overall objective/loss encompassing all\nmetrics' regression objectives/losses. However, our empirical study uncovers\nthe prevalence of conflicts among metrics' regression objectives, causing MTS\nmodels to grapple with different losses. This critical aspect significantly\nimpacts detection performance but has been overlooked in existing approaches.\nTo address this problem, by mimicking the design of multi-gate\nmixture-of-experts (MMoE), we introduce CAD, a Conflict-aware multivariate KPI\nAnomaly Detection algorithm. CAD offers an exclusive structure for each metric\nto mitigate potential conflicts while fostering inter-metric promotions. Upon\nthorough investigation, we find that the poor performance of vanilla MMoE\nmainly comes from the input-output misalignment settings of MTS formulation and\nconvergence issues arising from expansive tasks. To address these challenges,\nwe propose a straightforward yet effective task-oriented metric selection and\np&s (personalized and shared) gating mechanism, which establishes CAD as the\nfirst practicable multi-task learning (MTL) based MTS AD model. Evaluations on\nmultiple public datasets reveal that CAD obtains an average F1-score of 0.943\nacross three public datasets, notably outperforming state-of-the-art methods.\nOur code is accessible at https://github.com/dawnvince/MTS_CAD.\n","authors":["Haotian Si","Changhua Pei","Zhihan Li","Yadong Zhao","Jingjing Li","Haiming Zhang","Zulong Diao","Jianhui Li","Gaogang Xie","Dan Pei"],"pdf_url":"https://arxiv.org/pdf/2308.08915v2.pdf","comment":"11 pages, ESEC/FSE industry track 2023"},{"id":"http://arxiv.org/abs/2308.13252v1","updated":"2023-08-25T08:59:03Z","published":"2023-08-25T08:59:03Z","title":"Kissing to Find a Match: Efficient Low-Rank Permutation Representation","summary":" Permutation matrices play a key role in matching and assignment problems\nacross the fields, especially in computer vision and robotics. However, memory\nfor explicitly representing permutation matrices grows quadratically with the\nsize of the problem, prohibiting large problem instances. In this work, we\npropose to tackle the curse of dimensionality of large permutation matrices by\napproximating them using low-rank matrix factorization, followed by a\nnonlinearity. To this end, we rely on the Kissing number theory to infer the\nminimal rank required for representing a permutation matrix of a given size,\nwhich is significantly smaller than the problem size. This leads to a drastic\nreduction in computation and memory costs, e.g., up to $3$ orders of magnitude\nless memory for a problem of size $n=20000$, represented using $8.4\\times10^5$\nelements in two small matrices instead of using a single huge matrix with\n$4\\times 10^8$ elements. The proposed representation allows for accurate\nrepresentations of large permutation matrices, which in turn enables handling\nlarge problems that would have been infeasible otherwise. We demonstrate the\napplicability and merits of the proposed approach through a series of\nexperiments on a range of problems that involve predicting permutation\nmatrices, from linear and quadratic assignment to shape matching problems.\n","authors":["Hannah Dröge","Zorah Lähner","Yuval Bahat","Onofre Martorell","Felix Heide","Michael Möller"],"pdf_url":"https://arxiv.org/pdf/2308.13252v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.13246v1","updated":"2023-08-25T08:42:45Z","published":"2023-08-25T08:42:45Z","title":"Model-free Reinforcement Learning with Stochastic Reward Stabilization\n for Recommender Systems","summary":" Model-free RL-based recommender systems have recently received increasing\nresearch attention due to their capability to handle partial feedback and\nlong-term rewards. However, most existing research has ignored a critical\nfeature in recommender systems: one user's feedback on the same item at\ndifferent times is random. The stochastic rewards property essentially differs\nfrom that in classic RL scenarios with deterministic rewards, which makes\nRL-based recommender systems much more challenging. In this paper, we first\ndemonstrate in a simulator environment where using direct stochastic feedback\nresults in a significant drop in performance. Then to handle the stochastic\nfeedback more efficiently, we design two stochastic reward stabilization\nframeworks that replace the direct stochastic feedback with that learned by a\nsupervised model. Both frameworks are model-agnostic, i.e., they can\neffectively utilize various supervised models. We demonstrate the superiority\nof the proposed frameworks over different RL-based recommendation baselines\nwith extensive experiments on a recommendation simulator as well as an\nindustrial-level recommender system.\n","authors":["Tianchi Cai","Shenliao Bao","Jiyan Jiang","Shiji Zhou","Wenpeng Zhang","Lihong Gu","Jinjie Gu","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.13246v1.pdf","comment":"SIGIR '23"},{"id":"http://arxiv.org/abs/2307.06698v3","updated":"2023-08-25T08:37:10Z","published":"2023-07-13T11:54:32Z","title":"IntelliGraphs: Datasets for Benchmarking Knowledge Graph Generation","summary":" Knowledge Graph Embedding (KGE) models are used to learn continuous\nrepresentations of entities and relations. A key task in the literature is\npredicting missing links between entities. However, Knowledge Graphs are not\njust sets of links but also have semantics underlying their structure.\nSemantics is crucial in several downstream tasks, such as query answering or\nreasoning. We introduce the subgraph inference task, where a model has to\ngenerate likely and semantically valid subgraphs. We propose IntelliGraphs, a\nset of five new Knowledge Graph datasets. The IntelliGraphs datasets contain\nsubgraphs with semantics expressed in logical rules for evaluating subgraph\ninference. We also present the dataset generator that produced the synthetic\ndatasets. We designed four novel baseline models, which include three models\nbased on traditional KGEs. We evaluate their expressiveness and show that these\nmodels cannot capture the semantics. We believe this benchmark will encourage\nthe development of machine learning models that emphasize semantic\nunderstanding.\n","authors":["Thiviyan Thanapalasingam","Emile van Krieken","Peter Bloem","Paul Groth"],"pdf_url":"https://arxiv.org/pdf/2307.06698v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13242v1","updated":"2023-08-25T08:27:43Z","published":"2023-08-25T08:27:43Z","title":"Optimizing Group-Fair Plackett-Luce Ranking Models for Relevance and\n Ex-Post Fairness","summary":" In learning-to-rank (LTR), optimizing only the relevance (or the expected\nranking utility) can cause representational harm to certain categories of\nitems. Moreover, if there is implicit bias in the relevance scores, LTR models\nmay fail to optimize for true relevance. Previous works have proposed efficient\nalgorithms to train stochastic ranking models that achieve fairness of exposure\nto the groups ex-ante (or, in expectation), which may not guarantee\nrepresentation fairness to the groups ex-post, that is, after realizing a\nranking from the stochastic ranking model. Typically, ex-post fairness is\nachieved by post-processing, but previous work does not train stochastic\nranking models that are aware of this post-processing.\n In this paper, we propose a novel objective that maximizes expected relevance\nonly over those rankings that satisfy given representation constraints to\nensure ex-post fairness. Building upon recent work on an efficient sampler for\nex-post group-fair rankings, we propose a group-fair Plackett-Luce model and\nshow that it can be efficiently optimized for our objective in the LTR\nframework.\n Experiments on three real-world datasets show that our group-fair algorithm\nguarantees fairness alongside usually having better relevance compared to the\nLTR baselines. In addition, our algorithm also achieves better relevance than\npost-processing baselines, which also ensures ex-post fairness. Further, when\nimplicit bias is injected into the training data, our algorithm typically\noutperforms existing LTR baselines in relevance.\n","authors":["Sruthi Gorantla","Eshaan Bhansali","Amit Deshpande","Anand Louis"],"pdf_url":"https://arxiv.org/pdf/2308.13242v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.10199v2","updated":"2023-08-25T08:25:46Z","published":"2023-08-20T08:16:36Z","title":"Deep Reinforcement Learning for Artificial Upwelling Energy Management","summary":" The potential of artificial upwelling (AU) as a means of lifting\nnutrient-rich bottom water to the surface, stimulating seaweed growth, and\nconsequently enhancing ocean carbon sequestration, has been gaining increasing\nattention in recent years. This has led to the development of the first\nsolar-powered and air-lifted AU system (AUS) in China. However, efficient\nscheduling of air injection systems in complex marine environments remains a\ncrucial challenge in operating AUS, as it holds the potential to significantly\nimprove energy efficiency. To tackle this challenge, we propose a novel energy\nmanagement approach that utilizes deep reinforcement learning (DRL) algorithm\nto develop efficient strategies for operating AUS. Specifically, we formulate\nthe problem of maximizing the energy efficiency of AUS as a Markov decision\nprocess and integrate the quantile network in distributional reinforcement\nlearning (QR-DQN) with the deep dueling network to solve it. Through extensive\nsimulations, we evaluate the performance of our algorithm and demonstrate its\nsuperior effectiveness over traditional rule-based approaches and other DRL\nalgorithms in reducing energy wastage while ensuring the stable and efficient\noperation of AUS. Our findings suggest that a DRL-based approach offers a\npromising way to improve the energy efficiency of AUS and enhance the\nsustainability of seaweed cultivation and carbon sequestration in the ocean.\n","authors":["Yiyuan Zhang","Wei Fan"],"pdf_url":"https://arxiv.org/pdf/2308.10199v2.pdf","comment":"31 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.12673v2","updated":"2023-08-25T08:03:28Z","published":"2023-08-24T09:31:02Z","title":"Masked Feature Modelling: Feature Masking for the Unsupervised\n Pre-training of a Graph Attention Network Block for Bottom-up Video Event\n Recognition","summary":" In this paper, we introduce Masked Feature Modelling (MFM), a novel approach\nfor the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM\nutilizes a pretrained Visual Tokenizer to reconstruct masked features of\nobjects within a video, leveraging the MiniKinetics dataset. We then\nincorporate the pre-trained GAT block into a state-of-the-art bottom-up\nsupervised video-event recognition architecture, ViGAT, to improve the model's\nstarting point and overall accuracy. Experimental evaluations on the YLI-MED\ndataset demonstrate the effectiveness of MFM in improving event recognition\nperformance.\n","authors":["Dimitrios Daskalakis","Nikolaos Gkalelis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2308.12673v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.13222v1","updated":"2023-08-25T07:38:50Z","published":"2023-08-25T07:38:50Z","title":"Bayesian Reasoning for Physics Informed Neural Networks","summary":" Physics informed neural network (PINN) approach in Bayesian formulation is\npresented. We adopt the Bayesian neural network framework formulated by MacKay\n(Neural Computation 4 (3) (1992) 448). The posterior densities are obtained\nfrom Laplace approximation. For each model (fit), the so-called evidence is\ncomputed. It is a measure that classifies the hypothesis. The most optimal\nsolution has the maximal value of the evidence. The Bayesian framework allows\nus to control the impact of the boundary contribution to the total loss.\nIndeed, the relative weights of loss components are fine-tuned by the Bayesian\nalgorithm. We solve heat, wave, and Burger's equations. The obtained results\nare in good agreement with the exact solutions. All solutions are provided with\nthe uncertainties computed within the Bayesian framework.\n","authors":["Krzysztof M. Graczyk","Kornel Witkowski"],"pdf_url":"https://arxiv.org/pdf/2308.13222v1.pdf","comment":"19 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.13217v1","updated":"2023-08-25T07:30:18Z","published":"2023-08-25T07:30:18Z","title":"GEMTrans: A General, Echocardiography-based, Multi-Level Transformer\n Framework for Cardiovascular Diagnosis","summary":" Echocardiography (echo) is an ultrasound imaging modality that is widely used\nfor various cardiovascular diagnosis tasks. Due to inter-observer variability\nin echo-based diagnosis, which arises from the variability in echo image\nacquisition and the interpretation of echo images based on clinical experience,\nvision-based machine learning (ML) methods have gained popularity to act as\nsecondary layers of verification. For such safety-critical applications, it is\nessential for any proposed ML method to present a level of explainability along\nwith good accuracy. In addition, such methods must be able to process several\necho videos obtained from various heart views and the interactions among them\nto properly produce predictions for a variety of cardiovascular measurements or\ninterpretation tasks. Prior work lacks explainability or is limited in scope by\nfocusing on a single cardiovascular task. To remedy this, we propose a General,\nEcho-based, Multi-Level Transformer (GEMTrans) framework that provides\nexplainability, while simultaneously enabling multi-video training where the\ninter-play among echo image patches in the same frame, all frames in the same\nvideo, and inter-video relationships are captured based on a downstream task.\nWe show the flexibility of our framework by considering two critical tasks\nincluding ejection fraction (EF) and aortic stenosis (AS) severity detection.\nOur model achieves mean absolute errors of 4.15 and 4.84 for single and\ndual-video EF estimation and an accuracy of 96.5 % for AS detection, while\nproviding informative task-specific attention maps and prototypical\nexplainability.\n","authors":["Masoud Mokhtari","Neda Ahmadi","Teresa S. M. Tsang","Purang Abolmaesumi","Renjie Liao"],"pdf_url":"https://arxiv.org/pdf/2308.13217v1.pdf","comment":"To be published in MLMI 2023"},{"id":"http://arxiv.org/abs/2211.00216v2","updated":"2023-08-25T07:26:35Z","published":"2022-11-01T01:57:00Z","title":"Distributed Graph Neural Network Training: A Survey","summary":" Graph neural networks (GNNs) are a type of deep learning models that are\ntrained on graphs and have been successfully applied in various domains.\nDespite the effectiveness of GNNs, it is still challenging for GNNs to\nefficiently scale to large graphs. As a remedy, distributed computing becomes a\npromising solution of training large-scale GNNs, since it is able to provide\nabundant computing resources. However, the dependency of graph structure\nincreases the difficulty of achieving high-efficiency distributed GNN training,\nwhich suffers from the massive communication and workload imbalance. In recent\nyears, many efforts have been made on distributed GNN training, and an array of\ntraining algorithms and systems have been proposed. Yet, there is a lack of\nsystematic review on the optimization techniques for the distributed execution\nof GNN training. In this survey, we analyze three major challenges in\ndistributed GNN training that are massive feature communication, the loss of\nmodel accuracy and workload imbalance. Then we introduce a new taxonomy for the\noptimization techniques in distributed GNN training that address the above\nchallenges. The new taxonomy classifies existing techniques into four\ncategories that are GNN data partition, GNN batch generation, GNN execution\nmodel, and GNN communication protocol. We carefully discuss the techniques in\neach category. In the end, we summarize existing distributed GNN systems for\nmulti-GPUs, GPU-clusters and CPU-clusters, respectively, and give a discussion\nabout the future direction on distributed GNN training.\n","authors":["Yingxia Shao","Hongzheng Li","Xizhi Gu","Hongbo Yin","Yawen Li","Xupeng Miao","Wentao Zhang","Bin Cui","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2211.00216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.10072v5","updated":"2023-08-25T07:19:16Z","published":"2021-09-21T10:22:29Z","title":"Scenario generation for market risk models using generative neural\n networks","summary":" In this research, we show how to expand existing approaches of using\ngenerative adversarial networks (GANs) as economic scenario generators (ESG) to\na whole internal market risk model - with enough risk factors to model the full\nband-width of investments for an insurance company and for a one year time\nhorizon as required in Solvency 2. We demonstrate that the results of a\nGAN-based internal model are similar to regulatory approved internal models in\nEurope. Therefore, GAN-based models can be seen as a data-driven alternative\nway of market risk modeling.\n","authors":["Solveig Flaig","Gero Junike"],"pdf_url":"https://arxiv.org/pdf/2109.10072v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13212v1","updated":"2023-08-25T07:15:58Z","published":"2023-08-25T07:15:58Z","title":"Physics-Inspired Neural Graph ODE for Long-term Dynamical Simulation","summary":" Simulating and modeling the long-term dynamics of multi-object physical\nsystems is an essential and challenging task. Current studies model the\nphysical systems utilizing Graph Neural Networks (GNNs) with equivariant\nproperties. Specifically, they model the dynamics as a sequence of discrete\nstates with a fixed time interval and learn a direct mapping for all the two\nadjacent states. However, this direct mapping overlooks the continuous nature\nbetween the two states. Namely, we have verified that there are countless\npossible trajectories between two discrete dynamic states in current GNN-based\ndirect mapping models. This issue greatly hinders the model generalization\nability, leading to poor performance of the long-term simulation. In this\npaper, to better model the latent trajectory through discrete supervision\nsignals, we propose a Physics-Inspired Neural Graph ODE (PINGO) algorithm. In\nPINGO, to ensure the uniqueness of the trajectory, we construct a\nPhysics-Inspired Neural ODE framework to update the latent trajectory.\nMeanwhile, to effectively capture intricate interactions among objects, we use\na GNN-based model to parameterize Neural ODE in a plug-and-play manner.\nFurthermore, we prove that the discrepancy between the learned trajectory of\nPIGNO and the true trajectory can be theoretically bounded. Extensive\nexperiments verify our theoretical findings and demonstrate that our model\nyields an order-of-magnitude improvement over the state-of-the-art baselines,\nespecially on long-term predictions and roll-out errors.\n","authors":["Yang Liu","Jiashun Cheng","Haihong Zhao","Tingyang Xu","Peilin Zhao","Fugee Tsung","Jia Li","Yu Rong"],"pdf_url":"https://arxiv.org/pdf/2308.13212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13208v1","updated":"2023-08-25T07:04:16Z","published":"2023-08-25T07:04:16Z","title":"Physics-inspired Equivariant Descriptors of Non-bonded Interactions","summary":" Most of the existing machine-learning schemes applied to atomic-scale\nsimulations rely on a local description of the geometry of a structure, and\nstruggle to model effects that are driven by long-range physical interactions.\nEfforts to overcome these limitations have focused on the direct incorporation\nof electrostatics, which is the most prominent effect, often relying on\narchitectures that mirror the functional form of explicit physical models.\nIncluding other forms of non-bonded interactions, or predicting properties\nother than the interatomic potential, requires ad hoc modifications. We propose\nan alternative approach that extends the long-distance equivariant (LODE)\nframework to generate local descriptors of an atomic environment that resemble\nnon-bonded potentials with arbitrary asymptotic behaviors, ranging from\npoint-charge electrostatics to dispersion forces. We show that the LODE\nformalism is amenable to a direct physical interpretation in terms of a\ngeneralized multipole expansion, that simplifies its implementation and reduces\nthe number of descriptors needed to capture a given asymptotic behavior. These\ngeneralized LODE features provide improved extrapolation capabilities when\ntrained on structures dominated by a given asymptotic behavior, but do not help\nin capturing the wildly different energy scales that are relevant for a more\nheterogeneous data set. This approach provides a practical scheme to\nincorporate different types of non-bonded interactions, and a framework to\ninvestigate the interplay of physical and data-related considerations that\nunderlie this challenging modeling problem.\n","authors":["Kevin K. Huguenin-Dumittan","Philip Loche","Ni Haoran","Michele Ceriotti"],"pdf_url":"https://arxiv.org/pdf/2308.13208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1911.00567v6","updated":"2023-08-25T07:02:15Z","published":"2019-11-01T19:48:57Z","title":"Frequentist Regret Bounds for Randomized Least-Squares Value Iteration","summary":" We consider the exploration-exploitation dilemma in finite-horizon\nreinforcement learning (RL). When the state space is large or continuous,\ntraditional tabular approaches are unfeasible and some form of function\napproximation is mandatory. In this paper, we introduce an\noptimistically-initialized variant of the popular randomized least-squares\nvalue iteration (RLSVI), a model-free algorithm where exploration is induced by\nperturbing the least-squares approximation of the action-value function. Under\nthe assumption that the Markov decision process has low-rank transition\ndynamics, we prove that the frequentist regret of RLSVI is upper-bounded by\n$\\widetilde O(d^2 H^2 \\sqrt{T})$ where $ d $ are the feature dimension, $ H $\nis the horizon, and $ T $ is the total number of steps. To the best of our\nknowledge, this is the first frequentist regret analysis for randomized\nexploration with function approximation.\n","authors":["Andrea Zanette","David Brandfonbrener","Emma Brunskill","Matteo Pirotta","Alessandro Lazaric"],"pdf_url":"https://arxiv.org/pdf/1911.00567v6.pdf","comment":"Minor bug fixes"},{"id":"http://arxiv.org/abs/2302.02261v2","updated":"2023-08-25T06:36:08Z","published":"2023-02-04T23:42:07Z","title":"NeuRI: Diversifying DNN Generation via Inductive Rule Inference","summary":" Deep Learning (DL) is prevalently used in various industries to improve\ndecision-making and automate processes, driven by the ever-evolving DL\nlibraries and compilers. The correctness of DL systems is crucial for trust in\nDL applications. As such, the recent wave of research has been studying the\nautomated synthesis of test-cases (i.e., DNN models and their inputs) for\nfuzzing DL systems. However, existing model generators only subsume a limited\nnumber of operators, lacking the ability to pervasively model operator\nconstraints. To address this challenge, we propose NeuRI, a fully automated\napproach for generating valid and diverse DL models composed of hundreds of\ntypes of operators. NeuRI adopts a three-step process: (i) collecting valid and\ninvalid API traces from various sources; (ii) applying inductive program\nsynthesis over the traces to infer the constraints for constructing valid\nmodels; and (iii) performing hybrid model generation by incorporating both\nsymbolic and concrete operators concolically. Our evaluation shows that NeuRI\nimproves branch coverage of TensorFlow and PyTorch by 24% and 15% over the\nstate-of-the-art model-level fuzzers. NeuRI finds 100 new bugs for PyTorch and\nTensorFlow in four months, with 81 already fixed or confirmed, and 8\nhigh-priority bugs labeled by PyTorch, constituting 10% of all high-priority\nbugs of the period. Additionally, open-source developers regard error-inducing\nmodels reported by us as \"high-quality\" and \"common in practice\".\n","authors":["Jiawei Liu","Jinjun Peng","Yuyao Wang","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.02261v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05699v3","updated":"2023-08-25T06:34:14Z","published":"2023-03-10T04:49:01Z","title":"Feature Unlearning for Pre-trained GANs and VAEs","summary":" We tackle the problem of feature unlearning from a pre-trained image\ngenerative model: GANs and VAEs. Unlike a common unlearning task where an\nunlearning target is a subset of the training set, we aim to unlearn a specific\nfeature, such as hairstyle from facial images, from the pre-trained generative\nmodels. As the target feature is only presented in a local region of an image,\nunlearning the entire image from the pre-trained model may result in losing\nother details in the remaining region of the image. To specify which features\nto unlearn, we collect randomly generated images that contain the target\nfeatures. We then identify a latent representation corresponding to the target\nfeature and then use the representation to fine-tune the pre-trained model.\nThrough experiments on MNIST and CelebA datasets, we show that target features\nare successfully removed while keeping the fidelity of the original models.\nFurther experiments with an adversarial attack show that the unlearned model is\nmore robust under the presence of malicious parties.\n","authors":["Saemi Moon","Seunghyuk Cho","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2303.05699v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07873v4","updated":"2023-08-25T06:01:01Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding and Improving Adversarial\n Transferability from Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v4.pdf","comment":"Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21\n pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2305.10947v2","updated":"2023-08-25T05:57:08Z","published":"2023-05-18T13:09:45Z","title":"Comparative Study: Standalone IEEE 16-bit Floating-Point for Image\n Classification","summary":" Reducing the number of bits needed to encode the weights and activations of\nneural networks is highly desirable as it speeds up their training and\ninference time while reducing memory consumption. It is unsurprising that\nconsiderable attention has been drawn to developing neural networks that employ\nlower-precision computation. This includes IEEE 16-bit, Google bfloat16, 8-bit,\n4-bit floating-point or fixed-point, 2-bit, and various mixed-precision\nalgorithms. Out of these low-precision formats, IEEE 16-bit stands out due to\nits universal compatibility with contemporary GPUs. This accessibility\ncontrasts with bfloat16, which needs high-end GPUs, or other non-standard\nfewer-bit designs, which typically require software simulation. This study\nfocuses on the widely accessible IEEE 16-bit format for comparative analysis.\nThis analysis involves an in-depth theoretical investigation of the factors\nthat lead to discrepancies between 16-bit and 32-bit models, including a\nformalization of the concepts of floating-point error and tolerance to\nunderstand the conditions under which a 16-bit model can approximate 32-bit\nresults. Contrary to literature that credits the success of noise-tolerated\nneural networks to regularization effects, our study-supported by a series of\nrigorous experiments-provides a quantitative explanation of why standalone IEEE\n16-bit floating-point neural networks can perform on par with 32-bit and\nmixed-precision networks in various image classification tasks. Because no\nprior research has studied IEEE 16-bit as a standalone floating-point precision\nin neural networks, we believe our findings will have significant impacts,\nencouraging the adoption of standalone IEEE 16-bit networks in future neural\nnetwork applications.\n","authors":["Juyoung Yun","Byungkon Kang","Francois Rameau","Zhoulai Fu"],"pdf_url":"https://arxiv.org/pdf/2305.10947v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12263v3","updated":"2023-08-25T05:41:23Z","published":"2022-08-24T08:05:18Z","title":"Augmenting Reinforcement Learning with Transformer-based Scene\n Representation Learning for Decision-making of Autonomous Driving","summary":" Decision-making for urban autonomous driving is challenging due to the\nstochastic nature of interactive traffic participants and the complexity of\nroad structures. Although reinforcement learning (RL)-based decision-making\nscheme is promising to handle urban driving scenarios, it suffers from low\nsample efficiency and poor adaptability. In this paper, we propose Scene-Rep\nTransformer to improve the RL decision-making capabilities with better scene\nrepresentation encoding and sequential predictive latent distillation.\nSpecifically, a multi-stage Transformer (MST) encoder is constructed to model\nnot only the interaction awareness between the ego vehicle and its neighbors\nbut also intention awareness between the agents and their candidate routes. A\nsequential latent Transformer (SLT) with self-supervised learning objectives is\nemployed to distill the future predictive information into the latent scene\nrepresentation, in order to reduce the exploration space and speed up training.\nThe final decision-making module based on soft actor-critic (SAC) takes as\ninput the refined latent scene representation from the Scene-Rep Transformer\nand outputs driving actions. The framework is validated in five challenging\nsimulated urban scenarios with dense traffic, and its performance is manifested\nquantitatively by the substantial improvements in data efficiency and\nperformance in terms of success rate, safety, and efficiency. The qualitative\nresults reveal that our framework is able to extract the intentions of neighbor\nagents to help make decisions and deliver more diversified driving behaviors.\n","authors":["Haochen Liu","Zhiyu Huang","Xiaoyu Mo","Chen Lv"],"pdf_url":"https://arxiv.org/pdf/2208.12263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02691v3","updated":"2023-08-25T05:24:59Z","published":"2023-05-04T10:09:08Z","title":"PGB: A PubMed Graph Benchmark for Heterogeneous Network Representation\n Learning","summary":" There has been rapid growth in biomedical literature, yet capturing the\nheterogeneity of the bibliographic information of these articles remains\nrelatively understudied. Although graph mining research via heterogeneous graph\nneural networks has taken center stage, it remains unclear whether these\napproaches capture the heterogeneity of the PubMed database, a vast digital\nrepository containing over 33 million articles. We introduce PubMed Graph\nBenchmark (PGB), a new benchmark dataset for evaluating heterogeneous graph\nembeddings for biomedical literature. The benchmark contains rich metadata\nincluding abstract, authors, citations, MeSH terms, MeSH hierarchy, and some\nother information. The benchmark contains three different evaluation tasks\nencompassing systematic reviews, node classification, and node clustering. In\nPGB, we aggregate the metadata associated with the biomedical articles from\nPubMed into a unified source and make the benchmark publicly available for any\nfuture works.\n","authors":["Eric W Lee","Joyce C Ho"],"pdf_url":"https://arxiv.org/pdf/2305.02691v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15749v2","updated":"2023-08-25T05:24:48Z","published":"2023-06-27T19:04:00Z","title":"To Spike or Not To Spike: A Digital Hardware Perspective on Deep\n Learning Acceleration","summary":" As deep learning models scale, they become increasingly competitive from\ndomains spanning computer vision to natural language processing; however, this\nhappens at the expense of efficiency since they require increasingly more\nmemory and computing power. The power efficiency of the biological brain\noutperforms the one of any large-scale deep learning (DL) model; thus,\nneuromorphic computing tries to mimic the brain operations, such as spike-based\ninformation processing, to improve the efficiency of DL models. Despite the\nbenefits of the brain, such as efficient information transmission, dense\nneuronal interconnects, and the co-location of computation and memory, the\navailable biological substrate has severely constrained the evolution of\nbiological brains. Electronic hardware does not have the same constraints;\ntherefore, while modeling spiking neural networks (SNNs) might uncover one\npiece of the puzzle, the design of efficient hardware backends for SNNs needs\nfurther investigation, potentially taking inspiration from the available work\ndone on the artificial neural networks (ANN s) side. As such, when is it wise\nto look at the brain while designing new hardware, and when should it be\nignored? To answer this question, we quantitatively compare the digital\nhardware acceleration techniques and platforms of ANN s and SNNs.\n","authors":["Fabrizio Ottati","Chang Gao","Qinyu Chen","Giovanni Brignone","Mario R. Casu","Jason K. Eshraghian","Luciano Lavagno"],"pdf_url":"https://arxiv.org/pdf/2306.15749v2.pdf","comment":"Replace with reviewed version. Submitted to JETCAS"},{"id":"http://arxiv.org/abs/2308.13182v1","updated":"2023-08-25T05:24:23Z","published":"2023-08-25T05:24:23Z","title":"Structural Cycle GAN for Virtual Immunohistochemistry Staining of Gland\n Markers in the Colon","summary":" With the advent of digital scanners and deep learning, diagnostic operations\nmay move from a microscope to a desktop. Hematoxylin and Eosin (H&E) staining\nis one of the most frequently used stains for disease analysis, diagnosis, and\ngrading, but pathologists do need different immunohistochemical (IHC) stains to\nanalyze specific structures or cells. Obtaining all of these stains (H&E and\ndifferent IHCs) on a single specimen is a tedious and time-consuming task.\nConsequently, virtual staining has emerged as an essential research direction.\nHere, we propose a novel generative model, Structural Cycle-GAN (SC-GAN), for\nsynthesizing IHC stains from H&E images, and vice versa. Our method expressly\nincorporates structural information in the form of edges (in addition to color\ndata) and employs attention modules exclusively in the decoder of the proposed\ngenerator model. This integration enhances feature localization and preserves\ncontextual information during the generation process. In addition, a structural\nloss is incorporated to ensure accurate structure alignment between the\ngenerated and input markers. To demonstrate the efficacy of the proposed model,\nexperiments are conducted with two IHC markers emphasizing distinct structures\nof glands in the colon: the nucleus of epithelial cells (CDX2) and the\ncytoplasm (CK818). Quantitative metrics such as FID and SSIM are frequently\nused for the analysis of generative models, but they do not correlate\nexplicitly with higher-quality virtual staining results. Therefore, we propose\ntwo new quantitative metrics that correlate directly with the virtual staining\nspecificity of IHC markers.\n","authors":["Shikha Dubey","Tushar Kataria","Beatrice Knudsen","Shireen Y. Elhabian"],"pdf_url":"https://arxiv.org/pdf/2308.13182v1.pdf","comment":"Accepted to MICCAI Workshop 2023"},{"id":"http://arxiv.org/abs/2304.06931v2","updated":"2023-08-25T05:07:43Z","published":"2023-04-14T05:32:01Z","title":"Scale Federated Learning for Label Set Mismatch in Medical Image\n Classification","summary":" Federated learning (FL) has been introduced to the healthcare domain as a\ndecentralized learning paradigm that allows multiple parties to train a model\ncollaboratively without privacy leakage. However, most previous studies have\nassumed that every client holds an identical label set. In reality, medical\nspecialists tend to annotate only diseases within their area of expertise or\ninterest. This implies that label sets in each client can be different and even\ndisjoint. In this paper, we propose the framework FedLSM to solve the problem\nof Label Set Mismatch. FedLSM adopts different training strategies on data with\ndifferent uncertainty levels to efficiently utilize unlabeled or partially\nlabeled data as well as class-wise adaptive aggregation in the classification\nlayer to avoid inaccurate aggregation when clients have missing labels. We\nevaluated FedLSM on two public real-world medical image datasets, including\nchest X-ray (CXR) diagnosis with 112,120 CXR images and skin lesion diagnosis\nwith 10,015 dermoscopy images, and showed that it significantly outperformed\nother state-of-the-art FL algorithms. The code can be found at\nhttps://github.com/dzp2095/FedLSM.\n","authors":["Zhipeng Deng","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2304.06931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10892v2","updated":"2023-08-25T05:05:06Z","published":"2023-08-17T05:42:29Z","title":"Bayesian polynomial neural networks and polynomial neural ordinary\n differential equations","summary":" Symbolic regression with polynomial neural networks and polynomial neural\nordinary differential equations (ODEs) are two recent and powerful approaches\nfor equation recovery of many science and engineering problems. However, these\nmethods provide point estimates for the model parameters and are currently\nunable to accommodate noisy data. We address this challenge by developing and\nvalidating the following Bayesian inference methods: the Laplace approximation,\nMarkov Chain Monte Carlo (MCMC) sampling methods, and variational inference. We\nhave found the Laplace approximation to be the best method for this class of\nproblems. Our work can be easily extended to the broader class of symbolic\nneural networks to which the polynomial neural network belongs.\n","authors":["Colby Fronk","Jaewoong Yun","Prashant Singh","Linda Petzold"],"pdf_url":"https://arxiv.org/pdf/2308.10892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08128v2","updated":"2023-08-25T04:53:19Z","published":"2023-08-16T03:35:52Z","title":"How to Mask in Error Correction Code Transformer: Systematic and Double\n Masking","summary":" In communication and storage systems, error correction codes (ECCs) are\npivotal in ensuring data reliability. As deep learning's applicability has\nbroadened across diverse domains, there is a growing research focus on neural\nnetwork-based decoders that outperform traditional decoding algorithms. Among\nthese neural decoders, Error Correction Code Transformer (ECCT) has achieved\nthe state-of-the-art performance, outperforming other methods by large margins.\nTo further enhance the performance of ECCT, we propose two novel methods.\nFirst, leveraging the systematic encoding technique of ECCs, we introduce a new\nmasking matrix for ECCT, aiming to improve the performance and reduce the\ncomputational complexity. Second, we propose a novel transformer architecture\nof ECCT called a double-masked ECCT. This architecture employs two different\nmask matrices in a parallel manner to learn more diverse features of the\nrelationship between codeword bits in the masked self-attention blocks.\nExtensive simulation results show that the proposed double-masked ECCT\noutperforms the conventional ECCT, achieving the state-of-the-art decoding\nperformance with significant margins.\n","authors":["Seong-Joon Park","Hee-Youl Kwak","Sang-Hyo Kim","Sunghwan Kim","Yongjune Kim","Jong-Seon No"],"pdf_url":"https://arxiv.org/pdf/2308.08128v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.13176v1","updated":"2023-08-25T04:52:54Z","published":"2023-08-25T04:52:54Z","title":"Using Adamic-Adar Index Algorithm to Predict Volunteer Collaboration:\n Less is More","summary":" Social networks exhibit a complex graph-like structure due to the uncertainty\nsurrounding potential collaborations among participants. Machine learning\nalgorithms possess generic outstanding performance in multiple real-world\nprediction tasks. However, whether machine learning algorithms outperform\nspecific algorithms designed for graph link prediction remains unknown to us.\nTo address this issue, the Adamic-Adar Index (AAI), Jaccard Coefficient (JC)\nand common neighbour centrality (CNC) as representatives of graph-specific\nalgorithms were applied to predict potential collaborations, utilizing data\nfrom volunteer activities during the Covid-19 pandemic in Shenzhen city, along\nwith the classical machine learning algorithms such as random forest, support\nvector machine, and gradient boosting as single predictors and components of\nensemble learning. This paper introduces that the AAI algorithm outperformed\nthe traditional JC and CNC, and other machine learning algorithms in analyzing\ngraph node attributes for this task.\n","authors":["Chao Wu","Peng Chen","Baiqiao Yin","Zijuan Lin","Chen Jiang","Di Yu","Changhong Zou","Chunwang Lui"],"pdf_url":"https://arxiv.org/pdf/2308.13176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13168v1","updated":"2023-08-25T04:14:02Z","published":"2023-08-25T04:14:02Z","title":"IOMatch: Simplifying Open-Set Semi-Supervised Learning with Joint\n Inliers and Outliers Utilization","summary":" Semi-supervised learning (SSL) aims to leverage massive unlabeled data when\nlabels are expensive to obtain. Unfortunately, in many real-world applications,\nthe collected unlabeled data will inevitably contain unseen-class outliers not\nbelonging to any of the labeled classes. To deal with the challenging open-set\nSSL task, the mainstream methods tend to first detect outliers and then filter\nthem out. However, we observe a surprising fact that such approach could result\nin more severe performance degradation when labels are extremely scarce, as the\nunreliable outlier detector may wrongly exclude a considerable portion of\nvaluable inliers. To tackle with this issue, we introduce a novel open-set SSL\nframework, IOMatch, which can jointly utilize inliers and outliers, even when\nit is difficult to distinguish exactly between them. Specifically, we propose\nto employ a multi-binary classifier in combination with the standard closed-set\nclassifier for producing unified open-set classification targets, which regard\nall outliers as a single new class. By adopting these targets as open-set\npseudo-labels, we optimize an open-set classifier with all unlabeled samples\nincluding both inliers and outliers. Extensive experiments have shown that\nIOMatch significantly outperforms the baseline methods across different\nbenchmark datasets and different settings despite its remarkable simplicity.\nOur code and models are available at https://github.com/nukezil/IOMatch.\n","authors":["Zekun Li","Lei Qi","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2308.13168v1.pdf","comment":"Accepted by ICCV 2023, selected for an Oral presentation"},{"id":"http://arxiv.org/abs/2108.06808v5","updated":"2023-08-25T03:48:04Z","published":"2021-08-15T20:12:58Z","title":"Implicit Regularization of Bregman Proximal Point Algorithm and Mirror\n Descent on Separable Data","summary":" Bregman proximal point algorithm (BPPA) has witnessed emerging machine\nlearning applications, yet its theoretical understanding has been largely\nunexplored. We study the computational properties of BPPA through learning\nlinear classifiers with separable data, and demonstrate provable algorithmic\nregularization of BPPA. For any BPPA instantiated with a fixed Bregman\ndivergence, we provide a lower bound of the margin obtained by BPPA with\nrespect to an arbitrarily chosen norm. The obtained margin lower bound differs\nfrom the maximal margin by a multiplicative factor, which inversely depends on\nthe condition number of the distance-generating function measured in the dual\nnorm. We show that the dependence on the condition number is tight, thus\ndemonstrating the importance of divergence in affecting the quality of the\nlearned classifiers. We then extend our findings to mirror descent, for which\nwe establish similar connections between the margin and Bregman divergence,\ntogether with a non-asymptotic analysis. Numerical experiments on both\nsynthetic and real-world datasets are provided to support our theoretical\nfindings. To the best of our knowledge, the aforementioned findings appear to\nbe new in the literature of algorithmic regularization.\n","authors":["Yan Li","Caleb Ju","Ethan X. Fang","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2108.06808v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13158v1","updated":"2023-08-25T03:35:29Z","published":"2023-08-25T03:35:29Z","title":"DAG-ACFL: Asynchronous Clustered Federated Learning based on DAG-DLT","summary":" Federated learning (FL) aims to collaboratively train a global model while\nensuring client data privacy. However, FL faces challenges from the non-IID\ndata distribution among clients. Clustered FL (CFL) has emerged as a promising\nsolution, but most existing CFL frameworks adopt synchronous frameworks lacking\nasynchrony. An asynchronous CFL framework called SDAGFL based on directed\nacyclic graph distributed ledger techniques (DAG-DLT) was proposed, but its\ncomplete decentralization leads to high communication and storage costs. We\npropose DAG-ACFL, an asynchronous clustered FL framework based on directed\nacyclic graph distributed ledger techniques (DAG-DLT). We first detail the\ncomponents of DAG-ACFL. A tip selection algorithm based on the cosine\nsimilarity of model parameters is then designed to aggregate models from\nclients with similar distributions. An adaptive tip selection algorithm\nleveraging change-point detection dynamically determines the number of selected\ntips. We evaluate the clustering and training performance of DAG-ACFL on\nmultiple datasets and analyze its communication and storage costs. Experiments\nshow the superiority of DAG-ACFL in asynchronous clustered FL. By combining\nDAG-DLT with clustered FL, DAG-ACFL realizes robust, decentralized and private\nmodel training with efficient performance.\n","authors":["Xiaofeng Xue","Haokun Mao","Qiong Li"],"pdf_url":"https://arxiv.org/pdf/2308.13158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13157v1","updated":"2023-08-25T03:31:22Z","published":"2023-08-25T03:31:22Z","title":"Federated Learning in IoT: a Survey from a Resource-Constrained\n Perspective","summary":" The IoT ecosystem is able to leverage vast amounts of data for intelligent\ndecision-making. Federated Learning (FL), a decentralized machine learning\ntechnique, is widely used to collect and train machine learning models from a\nvariety of distributed data sources. Both IoT and FL systems can be\ncomplementary and used together. However, the resource-constrained nature of\nIoT devices prevents the widescale deployment FL in the real world. This\nresearch paper presents a comprehensive survey of the challenges and solutions\nassociated with implementing Federated Learning (FL) in resource-constrained\nInternet of Things (IoT) environments, viewed from 2 levels, client and server.\nWe focus on solutions regarding limited client resources, presence of\nheterogeneous client data, server capacity, and high communication costs, and\nassess their effectiveness in various scenarios. Furthermore, we categorize the\nsolutions based on the location of their application, i.e., the IoT client, and\nthe FL server. In addition to a comprehensive review of existing research and\npotential future directions, this paper also presents new evaluation metrics\nthat would allow researchers to evaluate their solutions on\nresource-constrained IoT devices.\n","authors":["Ishmeet Kaur andAdwaita Janardhan Jadhav"],"pdf_url":"https://arxiv.org/pdf/2308.13157v1.pdf","comment":"Presented and accepted at The IEEE 2023 International Conference on\n Artificial Intelligence, Robotics, Signal and Image Processing (AIRoSIP)"},{"id":"http://arxiv.org/abs/2307.14623v2","updated":"2023-08-25T03:17:29Z","published":"2023-07-27T04:47:05Z","title":"BubbleML: A Multi-Physics Dataset and Benchmarks for Machine Learning","summary":" In the field of phase change phenomena, the lack of accessible and diverse\ndatasets suitable for machine learning (ML) training poses a significant\nchallenge. Existing experimental datasets are often restricted, with limited\navailability and sparse ground truth data, impeding our understanding of this\ncomplex multiphysics phenomena. To bridge this gap, we present the BubbleML\nDataset\n\\footnote{\\label{git_dataset}\\url{https://github.com/HPCForge/BubbleML}} which\nleverages physics-driven simulations to provide accurate ground truth\ninformation for various boiling scenarios, encompassing nucleate pool boiling,\nflow boiling, and sub-cooled boiling. This extensive dataset covers a wide\nrange of parameters, including varying gravity conditions, flow rates,\nsub-cooling levels, and wall superheat, comprising 79 simulations. BubbleML is\nvalidated against experimental observations and trends, establishing it as an\ninvaluable resource for ML research. Furthermore, we showcase its potential to\nfacilitate exploration of diverse downstream tasks by introducing two\nbenchmarks: (a) optical flow analysis to capture bubble dynamics, and (b)\noperator networks for learning temperature dynamics. The BubbleML dataset and\nits benchmarks serve as a catalyst for advancements in ML-driven research on\nmultiphysics phase change phenomena, enabling the development and comparison of\nstate-of-the-art techniques and models.\n","authors":["Sheikh Md Shakeel Hassan","Arthur Feeney","Akash Dhruv","Jihoon Kim","Youngjoon Suh","Jaiyoung Ryu","Yoonjin Won","Aparna Chandramowlishwaran"],"pdf_url":"https://arxiv.org/pdf/2307.14623v2.pdf","comment":"Submitted to Neurips Datasets and Benchmarks Track 2023"},{"id":"http://arxiv.org/abs/2306.09297v2","updated":"2023-08-25T03:15:04Z","published":"2023-06-15T17:25:15Z","title":"Fix Fairness, Don't Ruin Accuracy: Performance Aware Fairness Repair\n using AutoML","summary":" Machine learning (ML) is increasingly being used in critical decision-making\nsoftware, but incidents have raised questions about the fairness of ML\npredictions. To address this issue, new tools and methods are needed to\nmitigate bias in ML-based software. Previous studies have proposed bias\nmitigation algorithms that only work in specific situations and often result in\na loss of accuracy. Our proposed solution is a novel approach that utilizes\nautomated machine learning (AutoML) techniques to mitigate bias. Our approach\nincludes two key innovations: a novel optimization function and a\nfairness-aware search space. By improving the default optimization function of\nAutoML and incorporating fairness objectives, we are able to mitigate bias with\nlittle to no loss of accuracy. Additionally, we propose a fairness-aware search\nspace pruning method for AutoML to reduce computational cost and repair time.\nOur approach, built on the state-of-the-art Auto-Sklearn tool, is designed to\nreduce bias in real-world scenarios. In order to demonstrate the effectiveness\nof our approach, we evaluated our approach on four fairness problems and 16\ndifferent ML models, and our results show a significant improvement over the\nbaseline and existing bias mitigation techniques. Our approach, Fair-AutoML,\nsuccessfully repaired 60 out of 64 buggy cases, while existing bias mitigation\ntechniques only repaired up to 44 out of 64 cases.\n","authors":["Giang Nguyen","Sumon Biswas","Hridesh Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.09297v2.pdf","comment":"In Proceedings of The 31st ACM Joint European Software Engineering\n Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE\n 2023)"},{"id":"http://arxiv.org/abs/2307.06886v3","updated":"2023-08-25T03:14:32Z","published":"2023-07-13T16:39:01Z","title":"Min-Max Optimization under Delays","summary":" Delays and asynchrony are inevitable in large-scale machine-learning problems\nwhere communication plays a key role. As such, several works have extensively\nanalyzed stochastic optimization with delayed gradients. However, as far as we\nare aware, no analogous theory is available for min-max optimization, a topic\nthat has gained recent popularity due to applications in adversarial\nrobustness, game theory, and reinforcement learning. Motivated by this gap, we\nexamine the performance of standard min-max optimization algorithms with\ndelayed gradient updates. First, we show (empirically) that even small delays\ncan cause prominent algorithms like Extra-gradient (\\texttt{EG}) to diverge on\nsimple instances for which \\texttt{EG} guarantees convergence in the absence of\ndelays. Our empirical study thus suggests the need for a careful analysis of\ndelayed versions of min-max optimization algorithms. Accordingly, under\nsuitable technical assumptions, we prove that Gradient Descent-Ascent\n(\\texttt{GDA}) and \\texttt{EG} with delayed updates continue to guarantee\nconvergence to saddle points for convex-concave and strongly convex-strongly\nconcave settings. Our complexity bounds reveal, in a transparent manner, the\nslow-down in convergence caused by delays.\n","authors":["Arman Adibi","Aritra Mitra","Hamed Hassani"],"pdf_url":"https://arxiv.org/pdf/2307.06886v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13150v1","updated":"2023-08-25T03:08:41Z","published":"2023-08-25T03:08:41Z","title":"Enhancing Breast Cancer Classification Using Transfer ResNet with\n Lightweight Attention Mechanism","summary":" Deep learning models have revolutionized image classification by learning\ncomplex feature hierarchies in raw pixel data. This paper introduces an image\nclassification method based on the ResNet model, and introduces a lightweight\nattention mechanism framework to improve performance. The framework optimizes\nfeature representation, enhances classification capabilities, and improves\nfeature discriminativeness. We verified the effectiveness of the algorithm on\nthe Breakhis dataset, showing its superior performance in many aspects. Not\nonly in terms of conventional models, our method also shows advantages on\nstate-of-the-art methods such as contemporary visual transformers. Significant\nimprovements have been achieved in metrics such as precision, accuracy, recall,\nF1-score, and G-means, while also performing well in terms of convergence time.\nThese results strengthen the performance of the algorithm and solidify its\napplication prospects in practical image classification tasks. Keywords: ResNet\nmodel, Lightweight attention mechanism\n","authors":["Suxing Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13150v1.pdf","comment":"6 pages, 4 figures,6 tables"},{"id":"http://arxiv.org/abs/2308.13139v1","updated":"2023-08-25T02:32:36Z","published":"2023-08-25T02:32:36Z","title":"MatchXML: An Efficient Text-label Matching Framework for Extreme\n Multi-label Text Classification","summary":" The eXtreme Multi-label text Classification(XMC) refers to training a\nclassifier that assigns a text sample with relevant labels from an extremely\nlarge-scale label set (e.g., millions of labels). We propose MatchXML, an\nefficient text-label matching framework for XMC. We observe that the label\nembeddings generated from the sparse Term Frequency-Inverse Document\nFrequency(TF-IDF) features have several limitations. We thus propose label2vec\nto effectively train the semantic dense label embeddings by the Skip-gram\nmodel. The dense label embeddings are then used to build a Hierarchical Label\nTree by clustering. In fine-tuning the pre-trained encoder Transformer, we\nformulate the multi-label text classification as a text-label matching problem\nin a bipartite graph. We then extract the dense text representations from the\nfine-tuned Transformer. Besides the fine-tuned dense text embeddings, we also\nextract the static dense sentence embeddings from a pre-trained Sentence\nTransformer. Finally, a linear ranker is trained by utilizing the sparse TF-IDF\nfeatures, the fine-tuned dense text representations and static dense sentence\nfeatures. Experimental results demonstrate that MatchXML achieves\nstate-of-the-art accuracy on five out of six datasets. As for the speed,\nMatchXML outperforms the competing methods on all the six datasets. Our source\ncode is publicly available at https://github.com/huiyegit/MatchXML.\n","authors":["Hui Ye","Rajshekhar Sunderraman","Shihao Ji"],"pdf_url":"https://arxiv.org/pdf/2308.13139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13137v1","updated":"2023-08-25T02:28:35Z","published":"2023-08-25T02:28:35Z","title":"OmniQuant: Omnidirectionally Calibrated Quantization for Large Language\n Models","summary":" Large language models (LLMs) have revolutionized natural language processing\ntasks. However, their practical deployment is hindered by their immense memory\nand computation requirements. Although recent post-training quantization (PTQ)\nmethods are effective in reducing memory footprint and improving the\ncomputational efficiency of LLM, they hand-craft quantization parameters, which\nleads to low performance and fails to deal with extremely low-bit quantization.\nTo tackle this issue, we introduce an Omnidirectionally calibrated Quantization\n(OmniQuant) technique for LLMs, which achieves good performance in diverse\nquantization settings while maintaining the computational efficiency of PTQ by\nefficiently optimizing various quantization parameters. OmniQuant comprises two\ninnovative components including Learnable Weight Clipping (LWC) and Learnable\nEquivalent Transformation (LET). LWC modulates the extreme values of weights by\noptimizing the clipping threshold. Meanwhile, LET tackles activation outliers\nby shifting the challenge of quantization from activations to weights through a\nlearnable equivalent transformation. Operating within a differentiable\nframework using block-wise error minimization, OmniQuant can optimize the\nquantization process efficiently for both weight-only and weight-activation\nquantization. For instance, the LLaMA-2 model family with the size of 7-70B can\nbe processed with OmniQuant on a single A100-40G GPU within 1-16 hours using\n128 samples. Extensive experiments validate OmniQuant's superior performance\nacross diverse quantization configurations such as W4A4, W6A6, W4A16, W3A16,\nand W2A16. Additionally, OmniQuant demonstrates effectiveness in\ninstruction-tuned models and delivers notable improvements in inference speed\nand memory reduction on real devices. Codes and models are available at\n\\url{https://github.com/OpenGVLab/OmniQuant}.\n","authors":["Wenqi Shao","Mengzhao Chen","Zhaoyang Zhang","Peng Xu","Lirui Zhao","Zhiqian Li","Kaipeng Zhang","Peng Gao","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.13137v1.pdf","comment":"A differentiable quantization method for LLM"},{"id":"http://arxiv.org/abs/2308.13135v1","updated":"2023-08-25T02:05:51Z","published":"2023-08-25T02:05:51Z","title":"Nonparametric Additive Value Functions: Interpretable Reinforcement\n Learning with an Application to Surgical Recovery","summary":" We propose a nonparametric additive model for estimating interpretable value\nfunctions in reinforcement learning. Learning effective adaptive clinical\ninterventions that rely on digital phenotyping features is a major for concern\nmedical practitioners. With respect to spine surgery, different post-operative\nrecovery recommendations concerning patient mobilization can lead to\nsignificant variation in patient recovery. While reinforcement learning has\nachieved widespread success in domains such as games, recent methods heavily\nrely on black-box methods, such neural networks. Unfortunately, these methods\nhinder the ability of examining the contribution each feature makes in\nproducing the final suggested decision. While such interpretations are easily\nprovided in classical algorithms such as Least Squares Policy Iteration, basic\nlinearity assumptions prevent learning higher-order flexible interactions\nbetween features. In this paper, we present a novel method that offers a\nflexible technique for estimating action-value functions without making\nexplicit parametric assumptions regarding their additive functional form. This\nnonparametric estimation strategy relies on incorporating local kernel\nregression and basis expansion to obtain a sparse, additive representation of\nthe action-value function. Under this approach, we are able to locally\napproximate the action-value function and retrieve the nonlinear, independent\ncontribution of select features as well as joint feature pairs. We validate the\nproposed approach with a simulation study, and, in an application to spine\ndisease, uncover recovery recommendations that are inline with related clinical\nknowledge.\n","authors":["Patrick Emedom-Nnamdi","Timothy R. Smith","Jukka-Pekka Onnela","Junwei Lu"],"pdf_url":"https://arxiv.org/pdf/2308.13135v1.pdf","comment":"28 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.11521v2","updated":"2023-08-25T00:25:06Z","published":"2023-08-16T09:04:36Z","title":"Self-Deception: Reverse Penetrating the Semantic Firewall of Large\n Language Models","summary":" Large language models (LLMs), such as ChatGPT, have emerged with astonishing\ncapabilities approaching artificial general intelligence. While providing\nconvenience for various societal needs, LLMs have also lowered the cost of\ngenerating harmful content. Consequently, LLM developers have deployed\nsemantic-level defenses to recognize and reject prompts that may lead to\ninappropriate content. Unfortunately, these defenses are not foolproof, and\nsome attackers have crafted \"jailbreak\" prompts that temporarily hypnotize the\nLLM into forgetting content defense rules and answering any improper questions.\nTo date, there is no clear explanation of the principles behind these\nsemantic-level attacks and defenses in both industry and academia.\n This paper investigates the LLM jailbreak problem and proposes an automatic\njailbreak method for the first time. We propose the concept of a semantic\nfirewall and provide three technical implementation approaches. Inspired by the\nattack that penetrates traditional firewalls through reverse tunnels, we\nintroduce a \"self-deception\" attack that can bypass the semantic firewall by\ninducing LLM to generate prompts that facilitate jailbreak. We generated a\ntotal of 2,520 attack payloads in six languages (English, Russian, French,\nSpanish, Chinese, and Arabic) across seven virtual scenarios, targeting the\nthree most common types of violations: violence, hate, and pornography. The\nexperiment was conducted on two models, namely the GPT-3.5-Turbo and GPT-4. The\nsuccess rates on the two models were 86.2% and 67%, while the failure rates\nwere 4.7% and 2.2%, respectively. This highlighted the effectiveness of the\nproposed attack method. All experimental code and raw data will be released as\nopen-source to inspire future research. We believe that manipulating AI\nbehavior through carefully crafted prompts will become an important research\ndirection in the future.\n","authors":["Zhenhua Wang","Wei Xie","Kai Chen","Baosheng Wang","Zhiwen Gui","Enze Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11521v2.pdf","comment":"Serious errors were found in the experiment, which may lead to the\n overturning of the overall conclusions of the paper"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.13421v1","updated":"2023-08-25T15:06:14Z","published":"2023-08-25T15:06:14Z","title":"Exploiting Diverse Feature for Multimodal Sentiment Analysis","summary":" In this paper, we present our solution to the MuSe-Personalisation\nsub-challenge in the MuSe 2023 Multimodal Sentiment Analysis Challenge. The\ntask of MuSe-Personalisation aims to predict the continuous arousal and valence\nvalues of a participant based on their audio-visual, language, and\nphysiological signal modalities data. Considering different people have\npersonal characteristics, the main challenge of this task is how to build\nrobustness feature presentation for sentiment prediction. To address this\nissue, we propose exploiting diverse features. Specifically, we proposed a\nseries of feature extraction methods to build a robust representation and model\nensemble. We empirically evaluate the performance of the utilized method on the\nofficially provided dataset. \\textbf{As a result, we achieved 3rd place in the\nMuSe-Personalisation sub-challenge.} Specifically, we achieve the results of\n0.8492 and 0.8439 for MuSe-Personalisation in terms of arousal and valence CCC.\n","authors":["Jia Li","Wei Qian","Kun Li","Qi Li","Dan Guo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12898v2","updated":"2023-08-25T12:22:53Z","published":"2023-08-24T16:17:40Z","title":"Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language\n Pretraining?","summary":" The multimedia community has shown a significant interest in perceiving and\nrepresenting the physical world with multimodal pretrained neural network\nmodels, and among them, the visual-language pertaining (VLP) is, currently, the\nmost captivating topic. However, there have been few endeavors dedicated to the\nexploration of 1) whether essential linguistic knowledge (e.g., semantics and\nsyntax) can be extracted during VLP, and 2) how such linguistic knowledge\nimpact or enhance the multimodal alignment. In response, here we aim to\nelucidate the impact of comprehensive linguistic knowledge, including semantic\nexpression and syntactic structure, on multimodal alignment. Specifically, we\ndesign and release the SNARE, the first large-scale multimodal alignment\nprobing benchmark, to detect the vital linguistic components, e.g., lexical,\nsemantic, and syntax knowledge, containing four tasks: Semantic structure,\nNegation logic, Attribute ownership, and Relationship composition. Based on our\nproposed probing benchmarks, our holistic analyses of five advanced VLP models\nillustrate that the VLP model: i) shows insensitivity towards complex syntax\nstructures and relies on content words for sentence comprehension; ii)\ndemonstrates limited comprehension of combinations between sentences and\nnegations; iii) faces challenges in determining the presence of actions or\nspatial relationships within visual information and struggles with verifying\nthe correctness of triple combinations. We make our benchmark and code\navailable at \\url{https://github.com/WangFei-2019/SNARE/}.\n","authors":["Fei Wang","Liang Ding","Jun Rao","Ye Liu","Li Shen","Changxing Ding"],"pdf_url":"https://arxiv.org/pdf/2308.12898v2.pdf","comment":"[TL;DR] we design and release the SNARE, the first large-scale\n multimodal alignment probing benchmark for current vision-language pretrained\n models"},{"id":"http://arxiv.org/abs/2308.13273v1","updated":"2023-08-25T09:51:03Z","published":"2023-08-25T09:51:03Z","title":"Bridging the Gap: Fine-to-Coarse Sketch Interpolation Network for\n High-Quality Animation Sketch Inbetweening","summary":" The 2D animation workflow is typically initiated with the creation of\nkeyframes using sketch-based drawing. Subsequent inbetweens (i.e., intermediate\nsketch frames) are crafted through manual interpolation for smooth animations,\nwhich is a labor-intensive process. Thus, the prospect of automatic animation\nsketch interpolation has become highly appealing. However, existing video\ninterpolation methods are generally hindered by two key issues for sketch\ninbetweening: 1) limited texture and colour details in sketches, and 2)\nexaggerated alterations between two sketch keyframes. To overcome these issues,\nwe propose a novel deep learning method, namely Fine-to-Coarse Sketch\nInterpolation Network (FC-SIN). This approach incorporates multi-level guidance\nthat formulates region-level correspondence, sketch-level correspondence and\npixel-level dynamics. A multi-stream U-Transformer is then devised to\ncharacterize sketch inbewteening patterns using these multi-level guides\nthrough the integration of both self-attention and cross-attention mechanisms.\nAdditionally, to facilitate future research on animation sketch inbetweening,\nwe constructed a large-scale dataset - STD-12K, comprising 30 sketch animation\nseries in diverse artistic styles. Comprehensive experiments on this dataset\nconvincingly show that our proposed FC-SIN surpasses the state-of-the-art\ninterpolation methods. Our code and dataset will be publicly available.\n","authors":["Jiaming Shen","Kun Hu","Wei Bao","Chang Wen Chen","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13273v1.pdf","comment":"7pages,6figures"},{"id":"http://arxiv.org/abs/2308.12673v2","updated":"2023-08-25T08:03:28Z","published":"2023-08-24T09:31:02Z","title":"Masked Feature Modelling: Feature Masking for the Unsupervised\n Pre-training of a Graph Attention Network Block for Bottom-up Video Event\n Recognition","summary":" In this paper, we introduce Masked Feature Modelling (MFM), a novel approach\nfor the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM\nutilizes a pretrained Visual Tokenizer to reconstruct masked features of\nobjects within a video, leveraging the MiniKinetics dataset. We then\nincorporate the pre-trained GAT block into a state-of-the-art bottom-up\nsupervised video-event recognition architecture, ViGAT, to improve the model's\nstarting point and overall accuracy. Experimental evaluations on the YLI-MED\ndataset demonstrate the effectiveness of MFM in improving event recognition\nperformance.\n","authors":["Dimitrios Daskalakis","Nikolaos Gkalelis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2308.12673v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.11681v2","updated":"2023-08-25T06:55:14Z","published":"2023-08-22T14:58:36Z","title":"VadCLIP: Adapting Vision-Language Models for Weakly Supervised Video\n Anomaly Detection","summary":" The recent contrastive language-image pre-training (CLIP) model has shown\ngreat success in a wide range of image-level tasks, revealing remarkable\nability for learning powerful visual representations with rich semantics. An\nopen and worthwhile problem is efficiently adapting such a strong model to the\nvideo domain and designing a robust video anomaly detector. In this work, we\npropose VadCLIP, a new paradigm for weakly supervised video anomaly detection\n(WSVAD) by leveraging the frozen CLIP model directly without any pre-training\nand fine-tuning process. Unlike current works that directly feed extracted\nfeatures into the weakly supervised classifier for frame-level binary\nclassification, VadCLIP makes full use of fine-grained associations between\nvision and language on the strength of CLIP and involves dual branch. One\nbranch simply utilizes visual features for coarse-grained binary\nclassification, while the other fully leverages the fine-grained language-image\nalignment. With the benefit of dual branch, VadCLIP achieves both\ncoarse-grained and fine-grained video anomaly detection by transferring\npre-trained knowledge from CLIP to WSVAD task. We conduct extensive experiments\non two commonly-used benchmarks, demonstrating that VadCLIP achieves the best\nperformance on both coarse-grained and fine-grained WSVAD, surpassing the\nstate-of-the-art methods by a large margin. Specifically, VadCLIP achieves\n84.51% AP and 88.02% AUC on XD-Violence and UCF-Crime, respectively. Code and\nfeatures will be released to facilitate future VAD research.\n","authors":["Peng Wu","Xuerong Zhou","Guansong Pang","Lingru Zhou","Qingsen Yan","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11681v2.pdf","comment":"Submitted"}]},"2023-08-28T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2209.06767v3","updated":"2023-08-28T17:59:49Z","published":"2022-09-14T16:45:13Z","title":"Parameter-Efficient Finetuning for Robust Continual Multilingual\n Learning","summary":" We introduce and study the problem of Continual Multilingual Learning (CML)\nwhere a previously trained multilingual model is periodically updated using new\ndata arriving in stages. If the new data is present only in a subset of\nlanguages, we find that the resulting model shows improved performance only on\nthe languages included in the latest update (and a few closely related\nlanguages) while its performance on all the remaining languages degrade\nsignificantly. We address this challenge by proposing LAFT-URIEL, a\nparameter-efficient finetuning strategy which aims to increase the number of\nlanguages on which the model improves after an update, while reducing the\nmagnitude of loss in performance for the remaining languages. LAFT-URIEL uses\nlinguistic knowledge to balance overfitting and knowledge sharing across\nlanguages, allowing for an additional 25% of task languages to see an\nimprovement in performance after an update, while also reducing the average\nmagnitude of losses on the remaining languages by 78% relative.\n","authors":["Kartikeya Badola","Shachi Dave","Partha Talukdar"],"pdf_url":"https://arxiv.org/pdf/2209.06767v3.pdf","comment":"Published at ACL Findings 2023"},{"id":"http://arxiv.org/abs/2308.13506v2","updated":"2023-08-28T17:46:59Z","published":"2023-08-25T17:31:46Z","title":"Training and Meta-Evaluating Machine Translation Evaluation Metrics at\n the Paragraph Level","summary":" As research on machine translation moves to translating text beyond the\nsentence level, it remains unclear how effective automatic evaluation metrics\nare at scoring longer translations. In this work, we first propose a method for\ncreating paragraph-level data for training and meta-evaluating metrics from\nexisting sentence-level data. Then, we use these new datasets to benchmark\nexisting sentence-level metrics as well as train learned metrics at the\nparagraph level. Interestingly, our experimental results demonstrate that using\nsentence-level metrics to score entire paragraphs is equally as effective as\nusing a metric designed to work at the paragraph level. We speculate this\nresult can be attributed to properties of the task of reference-based\nevaluation as well as limitations of our datasets with respect to capturing all\ntypes of phenomena that occur in paragraph-level translations.\n","authors":["Daniel Deutsch","Juraj Juraska","Mara Finkelstein","Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2308.13506v2.pdf","comment":"Removing extra \"and\" from author list"},{"id":"http://arxiv.org/abs/2308.14683v1","updated":"2023-08-28T16:18:50Z","published":"2023-08-28T16:18:50Z","title":"Fine-Tuning Llama 2 Large Language Models for Detecting Online Sexual\n Predatory Chats and Abusive Texts","summary":" Detecting online sexual predatory behaviours and abusive language on social\nmedia platforms has become a critical area of research due to the growing\nconcerns about online safety, especially for vulnerable populations such as\nchildren and adolescents. Researchers have been exploring various techniques\nand approaches to develop effective detection systems that can identify and\nmitigate these risks. Recent development of large language models (LLMs) has\nopened a new opportunity to address this problem more effectively. This paper\nproposes an approach to detection of online sexual predatory chats and abusive\nlanguage using the open-source pretrained Llama 2 7B-parameter model, recently\nreleased by Meta GenAI. We fine-tune the LLM using datasets with different\nsizes, imbalance degrees, and languages (i.e., English, Roman Urdu and Urdu).\nBased on the power of LLMs, our approach is generic and automated without a\nmanual search for a synergy between feature extraction and classifier design\nsteps like conventional methods in this domain. Experimental results show a\nstrong performance of the proposed approach, which performs proficiently and\nconsistently across three distinct datasets with five sets of experiments. This\nstudy's outcomes indicate that the proposed method can be implemented in\nreal-world applications (even with non-English languages) for flagging sexual\npredators, offensive or toxic content, hate speech, and discriminatory language\nin online discussions and comments to maintain respectful internet or digital\ncommunities. Furthermore, it can be employed for solving text classification\nproblems with other potential applications such as sentiment analysis, spam and\nphishing detection, sorting legal documents, fake news detection, language\nidentification, user intent recognition, text-based product categorization,\nmedical record analysis, and resume screening.\n","authors":["Thanh Thi Nguyen","Campbell Wilson","Janis Dalins"],"pdf_url":"https://arxiv.org/pdf/2308.14683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12421v3","updated":"2023-08-28T16:15:21Z","published":"2023-05-21T10:40:55Z","title":"Evaluating Open-QA Evaluation","summary":" This study focuses on the evaluation of the Open Question Answering (Open-QA)\ntask, which can directly estimate the factuality of large language models\n(LLMs). Current automatic evaluation methods have shown limitations, indicating\nthat human evaluation still remains the most reliable approach. We introduce a\nnew task, Evaluating QA Evaluation (QA-Eval) and the corresponding dataset\nEVOUNA, designed to assess the accuracy of AI-generated answers in relation to\nstandard answers within Open-QA. Our evaluation of these methods utilizes\nhuman-annotated results to measure their performance. Specifically, the work\ninvestigates methods that show high correlation with human evaluations, deeming\nthem more reliable. We also discuss the pitfalls of current methods and methods\nto improve LLM-based evaluators. We believe this new QA-Eval task and\ncorresponding dataset EVOUNA will facilitate the development of more effective\nautomatic evaluation tools and prove valuable for future research in this area.\nAll resources are available at \\url{https://github.com/wangcunxiang/QA-Eval}\nand it is under the Apache-2.0 License.\n","authors":["Cunxiang Wang","Sirui Cheng","Qipeng Guo","Zhikun Xu","Bowen Ding","Yidong Wang","Xiangkun Hu","Zheng Zhang","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.12421v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14669v1","updated":"2023-08-28T15:54:48Z","published":"2023-08-28T15:54:48Z","title":"ANER: Arabic and Arabizi Named Entity Recognition using\n Transformer-Based Approach","summary":" One of the main tasks of Natural Language Processing (NLP), is Named Entity\nRecognition (NER). It is used in many applications and also can be used as an\nintermediate step for other tasks. We present ANER, a web-based named entity\nrecognizer for the Arabic, and Arabizi languages. The model is built upon BERT,\nwhich is a transformer-based encoder. It can recognize 50 different entity\nclasses, covering various fields. We trained our model on the WikiFANE\\_Gold\ndataset which consists of Wikipedia articles. We achieved an F1 score of\n88.7\\%, which beats CAMeL Tools' F1 score of 83\\% on the ANERcorp dataset,\nwhich has only 4 classes. We also got an F1 score of 77.7\\% on the\nNewsFANE\\_Gold dataset which contains out-of-domain data from News articles.\nThe system is deployed on a user-friendly web interface that accepts users'\ninputs in Arabic, or Arabizi. It allows users to explore the entities in the\ntext by highlighting them. It can also direct users to get information about\nentities through Wikipedia directly. We added the ability to do NER using our\nmodel, or CAMeL Tools' model through our website. ANER is publicly accessible\nat \\url{http://www.aner.online}. We also deployed our model on HuggingFace at\nhttps://huggingface.co/boda/ANER, to allow developers to test and use it.\n","authors":["Abdelrahman \"Boda\" Sadallah","Omar Ahmed","Shimaa Mohamed","Omar Hatem","Doaa Hesham","Ahmed H. Yousef"],"pdf_url":"https://arxiv.org/pdf/2308.14669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14654v1","updated":"2023-08-28T15:36:33Z","published":"2023-08-28T15:36:33Z","title":"Joint Multiple Intent Detection and Slot Filling with Supervised\n Contrastive Learning and Self-Distillation","summary":" Multiple intent detection and slot filling are two fundamental and crucial\ntasks in spoken language understanding. Motivated by the fact that the two\ntasks are closely related, joint models that can detect intents and extract\nslots simultaneously are preferred to individual models that perform each task\nindependently. The accuracy of a joint model depends heavily on the ability of\nthe model to transfer information between the two tasks so that the result of\none task can correct the result of the other. In addition, since a joint model\nhas multiple outputs, how to train the model effectively is also challenging.\nIn this paper, we present a method for multiple intent detection and slot\nfilling by addressing these challenges. First, we propose a bidirectional joint\nmodel that explicitly employs intent information to recognize slots and slot\nfeatures to detect intents. Second, we introduce a novel method for training\nthe proposed joint model using supervised contrastive learning and\nself-distillation. Experimental results on two benchmark datasets MixATIS and\nMixSNIPS show that our method outperforms state-of-the-art models in both\ntasks. The results also demonstrate the contributions of both bidirectional\ndesign and the training method to the accuracy improvement. Our source code is\navailable at https://github.com/anhtunguyen98/BiSLU\n","authors":["Nguyen Anh Tu","Hoang Thi Thu Uyen","Tu Minh Phuong","Ngo Xuan Bach"],"pdf_url":"https://arxiv.org/pdf/2308.14654v1.pdf","comment":"Accepted at ECAI 2023"},{"id":"http://arxiv.org/abs/2306.11167v2","updated":"2023-08-28T15:34:27Z","published":"2023-06-19T21:14:57Z","title":"Large Language Models are Fixated by Red Herrings: Exploring Creative\n Problem Solving and Einstellung Effect using the Only Connect Wall Dataset","summary":" The quest for human imitative AI has been an enduring topic in AI research\nsince its inception. The technical evolution and emerging capabilities of the\nlatest cohort of large language models (LLMs) have reinvigorated the subject\nbeyond academia to the cultural zeitgeist. While recent NLP evaluation\nbenchmark tasks test some aspects of human-imitative behaviour (e.g.,\nBIG-bench's 'human-like behavior' tasks), few, if not none, examine creative\nproblem solving abilities. Creative problem solving in humans is a well-studied\ntopic in cognitive neuroscience with standardized tests that predominantly use\nthe ability to associate (heterogeneous) connections among clue words as a\nmetric for creativity. Exposure to misleading stimuli - distractors dubbed red\nherrings - impede human performance in such tasks via the fixation effect and\nEinstellung paradigm. In cognitive neuroscience studies, such fixations are\nexperimentally induced by pre-exposing participants to orthographically similar\nincorrect words to subsequent word-fragments or clues. The popular British quiz\nshow Only Connect's Connecting Wall segment essentially mimics Mednick's Remote\nAssociates Test (RAT) formulation with built-in, deliberate red herrings, which\nmakes it an ideal proxy dataset to explore and study fixation effect and\nEinstellung paradigm from cognitive neuroscience in LLMs. In this paper we\npresent the novel Only Connect Wall (OCW) dataset and report results from our\nevaluation of selected pre-trained language models and LLMs on creative problem\nsolving tasks like grouping clue words by heterogeneous connections, and\nidentifying correct open knowledge domain connections in respective groups. We\nsynthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to\nfurther analyze our red-herrings hypothesis in language models. The code and\nlink to the dataset are available at https://github.com/TaatiTeam/OCW.\n","authors":["Saeid Naeini","Raeid Saqur","Mozhgan Saeidi","John Giorgi","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2306.11167v2.pdf","comment":"V2: with added OCW-Randomized and OCW-WordNet results in Section 4.3\n (added). 22 pages with Appendix"},{"id":"http://arxiv.org/abs/2308.14641v1","updated":"2023-08-28T15:12:34Z","published":"2023-08-28T15:12:34Z","title":"Challenges of GPT-3-based Conversational Agents for Healthca","summary":" The potential to provide patients with faster information access while\nallowing medical specialists to concentrate on critical tasks makes medical\ndomain dialog agents appealing. However, the integration of large-language\nmodels (LLMs) into these agents presents certain limitations that may result in\nserious consequences. This paper investigates the challenges and risks of using\nGPT-3-based models for medical question-answering (MedQA). We perform several\nevaluations contextualized in terms of standard medical principles. We provide\na procedure for manually designing patient queries to stress-test high-risk\nlimitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to\nrespond adequately to these queries, generating erroneous medical information,\nunsafe recommendations, and content that may be considered offensive.\n","authors":["Fabian Lechner","Allison Lahnala","Charles Welch","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.14641v1.pdf","comment":"12 pages, 9 Tables, accepted to RANLP 2023"},{"id":"http://arxiv.org/abs/2308.14634v1","updated":"2023-08-28T15:04:16Z","published":"2023-08-28T15:04:16Z","title":"Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance","summary":" We propose the use of conversational GPT models for easy and quick few-shot\ntext classification in the financial domain using the Banking77 dataset. Our\napproach involves in-context learning with GPT-3.5 and GPT-4, which minimizes\nthe technical expertise required and eliminates the need for expensive GPU\ncomputing while yielding quick and accurate results. Additionally, we fine-tune\nother pre-trained, masked language models with SetFit, a recent contrastive\nlearning technique, to achieve state-of-the-art results both in full-data and\nfew-shot settings. Our findings show that querying GPT-3.5 and GPT-4 can\noutperform fine-tuned, non-generative models even with fewer examples. However,\nsubscription fees associated with these solutions may be considered costly for\nsmall organizations. Lastly, we find that generative models perform better on\nthe given task when shown representative samples selected by a human expert\nrather than when shown random ones. We conclude that a) our proposed methods\noffer a practical solution for few-shot tasks in datasets with limited label\navailability, and b) our state-of-the-art results can inspire future work in\nthe area.\n","authors":["Lefteris Loukas","Ilias Stogiannidis","Prodromos Malakasiotis","Stavros Vassos"],"pdf_url":"https://arxiv.org/pdf/2308.14634v1.pdf","comment":"Early pre-print; Accepted at the 5th FinNLP workshop @ IJCAI-2023"},{"id":"http://arxiv.org/abs/2207.01964v3","updated":"2023-08-28T15:00:24Z","published":"2022-07-05T11:21:09Z","title":"Quantum Circuit Compiler for a Shuttling-Based Trapped-Ion Quantum\n Computer","summary":" The increasing capabilities of quantum computing hardware and the challenge\nof realizing deep quantum circuits require fully automated and efficient tools\nfor compiling quantum circuits. To express arbitrary circuits in a sequence of\nnative gates specific to the quantum computer architecture, it is necessary to\nmake algorithms portable across the landscape of quantum hardware providers. In\nthis work, we present a compiler capable of transforming and optimizing a\nquantum circuit targeting a shuttling-based trapped-ion quantum processor. It\nconsists of custom algorithms set on top of the quantum circuit framework\nPytket. The performance was evaluated for a wide range of quantum circuits and\nthe results show that the gate counts can be reduced by factors up to 5.1\ncompared to standard Pytket and up to 2.2 compared to standard Qiskit\ncompilation.\n","authors":["Fabian Kreppel","Christian Melzer","Diego Olvera Millán","Janis Wagner","Janine Hilder","Ulrich Poschinger","Ferdinand Schmidt-Kaler","André Brinkmann"],"pdf_url":"https://arxiv.org/pdf/2207.01964v3.pdf","comment":"35 pages, 25 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.14608v1","updated":"2023-08-28T14:23:04Z","published":"2023-08-28T14:23:04Z","title":"AI in the Gray: Exploring Moderation Policies in Dialogic Large Language\n Models vs. Human Answers in Controversial Topics","summary":" The introduction of ChatGPT and the subsequent improvement of Large Language\nModels (LLMs) have prompted more and more individuals to turn to the use of\nChatBots, both for information and assistance with decision-making. However,\nthe information the user is after is often not formulated by these ChatBots\nobjectively enough to be provided with a definite, globally accepted answer.\n Controversial topics, such as \"religion\", \"gender identity\", \"freedom of\nspeech\", and \"equality\", among others, can be a source of conflict as partisan\nor biased answers can reinforce preconceived notions or promote disinformation.\nBy exposing ChatGPT to such debatable questions, we aim to understand its level\nof awareness and if existing models are subject to socio-political and/or\neconomic biases. We also aim to explore how AI-generated answers compare to\nhuman ones. For exploring this, we use a dataset of a social media platform\ncreated for the purpose of debating human-generated claims on polemic subjects\namong users, dubbed Kialo.\n Our results show that while previous versions of ChatGPT have had important\nissues with controversial topics, more recent versions of ChatGPT\n(gpt-3.5-turbo) are no longer manifesting significant explicit biases in\nseveral knowledge areas. In particular, it is well-moderated regarding economic\naspects. However, it still maintains degrees of implicit libertarian leaning\ntoward right-winged ideals which suggest the need for increased moderation from\nthe socio-political point of view. In terms of domain knowledge on\ncontroversial topics, with the exception of the \"Philosophical\" category,\nChatGPT is performing well in keeping up with the collective human level of\nknowledge. Finally, we see that sources of Bing AI have slightly more tendency\nto the center when compared to human answers. All the analyses we make are\ngeneralizable to other types of biases and domains.\n","authors":["Vahid Ghafouri","Vibhor Agarwal","Yong Zhang","Nishanth Sastry","Jose Such","Guillermo Suarez-Tangil"],"pdf_url":"https://arxiv.org/pdf/2308.14608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12890v2","updated":"2023-08-28T14:16:42Z","published":"2023-08-24T16:09:13Z","title":"Large Language Models Vote: Prompting for Rare Disease Identification","summary":" The emergence of generative Large Language Models (LLMs) emphasizes the need\nfor accurate and efficient prompting approaches. LLMs are often applied in\nFew-Shot Learning (FSL) contexts, where tasks are executed with minimal\ntraining data. FSL has become popular in many Artificial Intelligence (AI)\nsubdomains, including AI for health. Rare diseases affect a small fraction of\nthe population. Rare disease identification from clinical notes inherently\nrequires FSL techniques due to limited data availability. Manual data\ncollection and annotation is both expensive and time-consuming. In this paper,\nwe propose Models-Vote Prompting (MVP), a flexible prompting approach for\nimproving the performance of LLM queries in FSL settings. MVP works by\nprompting numerous LLMs to perform the same tasks and then conducting a\nmajority vote on the resulting outputs. This method achieves improved results\nto any one model in the ensemble on one-shot rare disease identification and\nclassification tasks. We also release a novel rare disease dataset for FSL,\navailable to those who signed the MIMIC-IV Data Use Agreement (DUA).\nFurthermore, in using MVP, each model is prompted multiple times, substantially\nincreasing the time needed for manual annotation, and to address this, we\nassess the feasibility of using JSON for automating generative LLM evaluation.\n","authors":["David Oniani","Jordan Hilsman","Hang Dong","Fengyi Gao","Shiven Verma","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11891v2","updated":"2023-08-28T14:07:12Z","published":"2023-08-23T03:38:21Z","title":"Bridging the Gap: Deciphering Tabular Data Using Large Language Model","summary":" In the realm of natural language processing, the understanding of tabular\ndata has perpetually stood as a focal point of scholarly inquiry. The emergence\nof expansive language models, exemplified by the likes of ChatGPT, has ushered\nin a wave of endeavors wherein researchers aim to harness these models for\ntasks related to table-based question answering. Central to our investigative\npursuits is the elucidation of methodologies that amplify the aptitude of such\nlarge language models in discerning both the structural intricacies and\ninherent content of tables, ultimately facilitating their capacity to provide\ninformed responses to pertinent queries. To this end, we have architected a\ndistinctive module dedicated to the serialization of tables for seamless\nintegration with expansive language models. Additionally, we've instituted a\ncorrective mechanism within the model to rectify potential inaccuracies.\nExperimental results indicate that, although our proposed method trails the\nSOTA by approximately 11.7% in overall metrics, it surpasses the SOTA by about\n1.2% in tests on specific datasets. This research marks the first application\nof large language models to table-based question answering tasks, enhancing the\nmodel's comprehension of both table structures and content.\n","authors":["Hengyuan Zhang","Peng Chang","Zongcheng Ji"],"pdf_url":"https://arxiv.org/pdf/2308.11891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14536v1","updated":"2023-08-28T12:47:41Z","published":"2023-08-28T12:47:41Z","title":"Spoken Language Intelligence of Large Language Models for Language\n Learning","summary":" People have long hoped for a conversational system that can assist in\nreal-life situations, and recent progress on large language models (LLMs) is\nbringing this idea closer to reality. While LLMs are often impressive in\nperformance, their efficacy in real-world scenarios that demand expert\nknowledge remains unclear. LLMs are believed to hold the most potential and\nvalue in education, especially in the development of Artificial intelligence\n(AI) based virtual teachers capable of facilitating language learning. Our\nfocus is centered on evaluating the efficacy of LLMs in the realm of education,\nspecifically in the areas of spoken language learning which encompass\nphonetics, phonology, and second language acquisition. We introduce a new\nmultiple-choice question dataset to evaluate the effectiveness of LLMs in the\naforementioned scenarios, including understanding and application of spoken\nlanguage knowledge. In addition, we investigate the influence of various\nprompting techniques such as zero- and few-shot method (prepending the question\nwith question-answer exemplars), chain-of-thought (CoT, think step-by-step),\nin-domain exampler and external tools (Google, Wikipedia). We conducted\nlarge-scale evaluation on popular LLMs (20 distinct models) using these\nmethods. We achieved significant performance improvements compared to the\nzero-shot baseline in the practical questions reasoning (GPT-3.5, 49.1% ->\n63.1%; LLaMA2-70B-Chat, 42.2% -> 48.6%). We found that models of different\nsizes have good understanding of concepts in phonetics, phonology, and second\nlanguage acquisition, but show limitations in reasoning for real-world\nproblems. Additionally, we also explore preliminary findings on conversational\ncommunication.\n","authors":["Linkai Peng","Baorian Nuchged","Yingming Gao"],"pdf_url":"https://arxiv.org/pdf/2308.14536v1.pdf","comment":"28 pages, 7 figures, Preprint"},{"id":"http://arxiv.org/abs/2308.14533v1","updated":"2023-08-28T12:46:21Z","published":"2023-08-28T12:46:21Z","title":"A Multi-Task Semantic Decomposition Framework with Task-specific\n Pre-training for Few-Shot NER","summary":" The objective of few-shot named entity recognition is to identify named\nentities with limited labeled instances. Previous works have primarily focused\non optimizing the traditional token-wise classification framework, while\nneglecting the exploration of information based on NER data characteristics. To\naddress this issue, we propose a Multi-Task Semantic Decomposition Framework\nvia Joint Task-specific Pre-training (MSDP) for few-shot NER. Drawing\ninspiration from demonstration-based and contrastive learning, we introduce two\nnovel pre-training tasks: Demonstration-based Masked Language Modeling (MLM)\nand Class Contrastive Discrimination. These tasks effectively incorporate\nentity boundary information and enhance entity representation in Pre-trained\nLanguage Models (PLMs). In the downstream main task, we introduce a multi-task\njoint optimization framework with the semantic decomposing method, which\nfacilitates the model to integrate two different semantic information for\nentity classification. Experimental results of two few-shot NER benchmarks\ndemonstrate that MSDP consistently outperforms strong baselines by a large\nmargin. Extensive analyses validate the effectiveness and generalization of\nMSDP.\n","authors":["Guanting Dong","Zechen Wang","Jinxu Zhao","Gang Zhao","Daichi Guo","Dayuan Fu","Tingfeng Hui","Chen Zeng","Keqing He","Xuefeng Li","Liwen Wang","Xinyue Cui","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14533v1.pdf","comment":"Accepted by CIKM 2023 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2308.14508v1","updated":"2023-08-28T11:53:40Z","published":"2023-08-28T11:53:40Z","title":"LongBench: A Bilingual, Multitask Benchmark for Long Context\n Understanding","summary":" Although large language models (LLMs) demonstrate impressive performance for\nmany language tasks, most of them can only handle texts a few thousand tokens\nlong, limiting their applications on longer sequence inputs, such as books,\nreports, and codebases. Recent works have proposed methods to improve LLMs'\nlong context capabilities by extending context windows and more sophisticated\nmemory mechanisms. However, comprehensive benchmarks tailored for evaluating\nlong context understanding are lacking. In this paper, we introduce LongBench,\nthe first bilingual, multi-task benchmark for long context understanding,\nenabling a more rigorous evaluation of long context understanding. LongBench\ncomprises 21 datasets across 6 task categories in both English and Chinese,\nwith an average length of 6,711 words (English) and 13,386 characters\n(Chinese). These tasks cover key long-text application areas including\nsingle-doc QA, multi-doc QA, summarization, few-shot learning, synthetic tasks,\nand code completion. All datasets in LongBench are standardized into a unified\nformat, allowing for effortless automatic evaluation of LLMs. Upon\ncomprehensive evaluation of 8 LLMs on LongBench, we find that: (1) Commercial\nmodel (GPT-3.5-Turbo-16k) outperforms other open-sourced models, but still\nstruggles on longer contexts. (2) Scaled position embedding and fine-tuning on\nlonger sequences lead to substantial improvement on long context understanding.\n(3) Context compression technique such as retrieval brings improvement for\nmodel with weak ability on long contexts, but the performance still lags behind\nmodels that have strong long context understanding capability. The code and\ndatasets are available at https://github.com/THUDM/LongBench.\n","authors":["Yushi Bai","Xin Lv","Jiajie Zhang","Hongchang Lyu","Jiankai Tang","Zhidian Huang","Zhengxiao Du","Xiao Liu","Aohan Zeng","Lei Hou","Yuxiao Dong","Jie Tang","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2308.14508v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2206.08955v4","updated":"2023-08-28T11:19:57Z","published":"2022-06-17T18:11:34Z","title":"Making first order linear logic a generating grammar","summary":" It is known that different categorial grammars have surface representation in\na fragment of first order multiplicative linear logic (MLL1). We show that the\nfragment of interest is equivalent to the recently introduced extended tensor\ntype calculus (ETTC). ETTC is a calculus of specific typed terms, which\nrepresent tuples of strings, more precisely bipartite graphs decorated with\nstrings. Types are derived from linear logic formulas, and rules correspond to\nconcrete operations on these string-labeled graphs, so that they can be\nconveniently visualized. This provides the above mentioned fragment of MLL1\nthat is relevant for language modeling not only with some alternative syntax\nand intuitive geometric representation, but also with an intrinsic deductive\nsystem, which has been absent.\n In this work we consider a non-trivial notationally enriched variation of the\npreviously introduced {\\bf ETTC}, which allows more concise and transparent\ncomputations. We present both a cut-free sequent calculus and a natural\ndeduction formalism.\n","authors":["Sergey Slavnov"],"pdf_url":"https://arxiv.org/pdf/2206.08955v4.pdf","comment":"Revised and extended version with detailed proofs. arXiv admin note:\n substantial text overlap with arXiv:2112.15253"},{"id":"http://arxiv.org/abs/2305.07358v2","updated":"2023-08-28T11:07:56Z","published":"2023-05-12T10:08:46Z","title":"Towards Versatile and Efficient Visual Knowledge Integration into\n Pre-trained Language Models with Cross-Modal Adapters","summary":" Humans learn language via multi-modal knowledge. However, due to the\ntext-only pre-training scheme, most existing pre-trained language models (PLMs)\nare hindered from the multi-modal information.\n To inject visual knowledge into PLMs, existing methods incorporate either the\ntext or image encoder of vision-language models (VLMs) to encode the visual\ninformation and update all the original parameters of PLMs for knowledge\nfusion.\n In this paper, we propose a new plug-and-play module, X-adapter, to flexibly\nleverage the aligned visual and textual knowledge learned in pre-trained VLMs\nand efficiently inject them into PLMs.\n Specifically, we insert X-adapters into PLMs, and only the added parameters\nare updated during adaptation.\n To fully exploit the potential in VLMs, X-adapters consist of two\nsub-modules, V-expert and T-expert, to fuse VLMs' image and text\nrepresentations, respectively.\n We can opt for activating different sub-modules depending on the downstream\ntasks.\n Experimental results show that our method can significantly improve the\nperformance on object-color reasoning and natural language understanding (NLU)\ntasks compared with PLM baselines.\n","authors":["Xinyun Zhang","Haochen Tan","Han Wu","Mingjie Zhan","Ding Liang","Bei Yu"],"pdf_url":"https://arxiv.org/pdf/2305.07358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14484v1","updated":"2023-08-28T10:51:11Z","published":"2023-08-28T10:51:11Z","title":"Multimodal Detection of Social Spambots in Twitter using Transformers","summary":" Although not all bots are malicious, the vast majority of them are\nresponsible for spreading misinformation and manipulating the public opinion\nabout several issues, i.e., elections and many more. Therefore, the early\ndetection of social spambots is crucial. Although there have been proposed\nmethods for detecting bots in social media, there are still substantial\nlimitations. For instance, existing research initiatives still extract a large\nnumber of features and train traditional machine learning algorithms or use\nGloVe embeddings and train LSTMs. However, feature extraction is a tedious\nprocedure demanding domain expertise. Also, language models based on\ntransformers have been proved to be better than LSTMs. Other approaches create\nlarge graphs and train graph neural networks requiring in this way many hours\nfor training and access to computational resources. To tackle these\nlimitations, this is the first study employing only the user description field\nand images of three channels denoting the type and content of tweets posted by\nthe users. Firstly, we create digital DNA sequences, transform them to 3d\nimages, and apply pretrained models of the vision domain, including\nEfficientNet, AlexNet, VGG16, etc. Next, we propose a multimodal approach,\nwhere we use TwHIN-BERT for getting the textual representation of the user\ndescription field and employ VGG16 for acquiring the visual representation for\nthe image modality. We propose three different fusion methods, namely\nconcatenation, gated multimodal unit, and crossmodal attention, for fusing the\ndifferent modalities and compare their performances. Extensive experiments\nconducted on the Cresci '17 dataset demonstrate valuable advantages of our\nintroduced approaches over state-of-the-art ones reaching Accuracy up to\n99.98%.\n","authors":["Loukas Ilias","Ioannis Michail Kazelidis","Dimitris Askounis"],"pdf_url":"https://arxiv.org/pdf/2308.14484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14482v1","updated":"2023-08-28T10:44:18Z","published":"2023-08-28T10:44:18Z","title":"An Empirical Study of Consistency Regularization for End-to-End\n Speech-to-Text Translation","summary":" Consistency regularization methods, such as R-Drop (Liang et al., 2021) and\nCrossConST (Gao et al., 2023), have achieved impressive supervised and\nzero-shot performance in the neural machine translation (NMT) field. Can we\nalso boost end-to-end (E2E) speech-to-text translation (ST) by leveraging\nconsistency regularization? In this paper, we conduct empirical studies on\nintra-modal and cross-modal consistency and propose two training strategies,\nSimRegCR and SimZeroCR, for E2E ST in regular and zero-shot scenarios.\nExperiments on the MuST-C benchmark show that our approaches achieve\nstate-of-the-art (SOTA) performance in most translation directions. The\nanalyses prove that regularization brought by the intra-modal consistency,\ninstead of modality gap, is crucial for the regular E2E ST, and the cross-modal\nconsistency could close the modality gap and boost the zero-shot E2E ST\nperformance.\n","authors":["Pengzhi Gao","Ruiqing Zhang","Zhongjun He","Hua Wu","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.05090v3","updated":"2023-08-28T10:22:14Z","published":"2021-09-10T20:06:27Z","title":"Enhancing Self-Disclosure In Neural Dialog Models By Candidate\n Re-ranking","summary":" Neural language modelling has progressed the state-of-the-art in different\ndownstream Natural Language Processing (NLP) tasks. One such area is of\nopen-domain dialog modelling, neural dialog models based on GPT-2 such as\nDialoGPT have shown promising performance in single-turn conversation. However,\nsuch (neural) dialog models have been criticized for generating responses which\nalthough may have relevance to the previous human response, tend to quickly\ndissipate human interest and descend into trivial conversation. One reason for\nsuch performance is the lack of explicit conversation strategy being employed\nin human-machine conversation. Humans employ a range of conversation strategies\nwhile engaging in a conversation, one such key social strategies is\nSelf-disclosure(SD). A phenomenon of revealing information about one-self to\nothers. Social penetration theory (SPT) proposes that communication between two\npeople moves from shallow to deeper levels as the relationship progresses\nprimarily through self-disclosure. Disclosure helps in creating rapport among\nthe participants engaged in a conversation. In this paper, Self-disclosure\nenhancement architecture (SDEA) is introduced utilizing Self-disclosure Topic\nModel (SDTM) during inference stage of a neural dialog model to re-rank\nresponse candidates to enhance self-disclosure in single-turn responses from\nfrom the model.\n","authors":["Mayank Soni","Benjamin Cowan","Vincent Wade"],"pdf_url":"https://arxiv.org/pdf/2109.05090v3.pdf","comment":"10 pages, 3 figures, 2 table"},{"id":"http://arxiv.org/abs/2308.12086v2","updated":"2023-08-28T09:42:59Z","published":"2023-08-23T12:11:27Z","title":"Out of the Cage: How Stochastic Parrots Win in Cyber Security\n Environments","summary":" Large Language Models (LLMs) have gained widespread popularity across diverse\ndomains involving text generation, summarization, and various natural language\nprocessing tasks. Despite their inherent limitations, LLM-based designs have\nshown promising capabilities in planning and navigating open-world scenarios.\nThis paper introduces a novel application of pre-trained LLMs as agents within\ncybersecurity network environments, focusing on their utility for sequential\ndecision-making processes.\n We present an approach wherein pre-trained LLMs are leveraged as attacking\nagents in two reinforcement learning environments. Our proposed agents\ndemonstrate similar or better performance against state-of-the-art agents\ntrained for thousands of episodes in most scenarios and configurations. In\naddition, the best LLM agents perform similarly to human testers of the\nenvironment without any additional training process. This design highlights the\npotential of LLMs to efficiently address complex decision-making tasks within\ncybersecurity.\n Furthermore, we introduce a new network security environment named\nNetSecGame. The environment is designed to eventually support complex\nmulti-agent scenarios within the network security domain. The proposed\nenvironment mimics real network attacks and is designed to be highly modular\nand adaptable for various scenarios.\n","authors":["Maria Rigaki","Ondřej Lukáš","Carlos A. Catania","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2308.12086v2.pdf","comment":"Under review. 10 pages plus appendices, 7 figures, 4 tables. Edit:\n fix e-mails and code repository"},{"id":"http://arxiv.org/abs/2303.17650v3","updated":"2023-08-28T09:34:59Z","published":"2023-03-30T18:28:33Z","title":"Comparing Abstractive Summaries Generated by ChatGPT to Real Summaries\n Through Blinded Reviewers and Text Classification Algorithms","summary":" Large Language Models (LLMs) have gathered significant attention due to their\nimpressive performance on a variety of tasks. ChatGPT, developed by OpenAI, is\na recent addition to the family of language models and is being called a\ndisruptive technology by a few, owing to its human-like text-generation\ncapabilities. Although, many anecdotal examples across the internet have\nevaluated ChatGPT's strength and weakness, only a few systematic research\nstudies exist. To contribute to the body of literature of systematic research\non ChatGPT, we evaluate the performance of ChatGPT on Abstractive Summarization\nby the means of automated metrics and blinded human reviewers. We also build\nautomatic text classifiers to detect ChatGPT generated summaries. We found that\nwhile text classification algorithms can distinguish between real and generated\nsummaries, humans are unable to distinguish between real summaries and those\nproduced by ChatGPT.\n","authors":["Mayank Soni","Vincent Wade"],"pdf_url":"https://arxiv.org/pdf/2303.17650v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18004v2","updated":"2023-08-28T09:34:39Z","published":"2023-05-29T10:33:08Z","title":"The Effects of Political Martyrdom on Election Results: The\n Assassination of Abe","summary":" In developed nations assassinations are rare and thus the impact of such acts\non the electoral and political landscape is understudied. In this paper, we\nfocus on Twitter data to examine the effects of Japan's former Primer Minister\nAbe's assassination on the Japanese House of Councillors elections in 2022. We\nutilize sentiment analysis and emotion detection together with topic modeling\non over 2 million tweets and compare them against tweets during previous\nelection cycles. Our findings indicate that Twitter sentiments were negatively\nimpacted by the event in the short term and that social media attention span\nhas shortened. We also discuss how \"necropolitics\" affected the outcome of the\nelections in favor of the deceased's party meaning that there seems to have\nbeen an effect of Abe's death on the election outcome though the findings\nwarrant further investigation for conclusive results.\n","authors":["Miu Nicole Takagi"],"pdf_url":"https://arxiv.org/pdf/2305.18004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14436v1","updated":"2023-08-28T09:22:02Z","published":"2023-08-28T09:22:02Z","title":"Bridging the KB-Text Gap: Leveraging Structured Knowledge-aware\n Pre-training for KBQA","summary":" Knowledge Base Question Answering (KBQA) aims to answer natural language\nquestions with factual information such as entities and relations in KBs.\nHowever, traditional Pre-trained Language Models (PLMs) are directly\npre-trained on large-scale natural language corpus, which poses challenges for\nthem in understanding and representing complex subgraphs in structured KBs. To\nbridge the gap between texts and structured KBs, we propose a Structured\nKnowledge-aware Pre-training method (SKP). In the pre-training stage, we\nintroduce two novel structured knowledge-aware tasks, guiding the model to\neffectively learn the implicit relationship and better representations of\ncomplex subgraphs. In downstream KBQA task, we further design an efficient\nlinearization strategy and an interval attention mechanism, which assist the\nmodel to better encode complex subgraphs and shield the interference of\nirrelevant subgraphs during reasoning respectively. Detailed experiments and\nanalyses on WebQSP verify the effectiveness of SKP, especially the significant\nimprovement in subgraph retrieval (+4.08% H@10).\n","authors":["Guanting Dong","Rumei Li","Sirui Wang","Yupeng Zhang","Yunsen Xian","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14436v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.14429v1","updated":"2023-08-28T09:06:28Z","published":"2023-08-28T09:06:28Z","title":"Biomedical Entity Linking with Triple-aware Pre-Training","summary":" Linking biomedical entities is an essential aspect in biomedical natural\nlanguage processing tasks, such as text mining and question answering. However,\na difficulty of linking the biomedical entities using current large language\nmodels (LLM) trained on a general corpus is that biomedical entities are\nscarcely distributed in texts and therefore have been rarely seen during\ntraining by the LLM. At the same time, those LLMs are not aware of high level\nsemantic connection between different biomedical entities, which are useful in\nidentifying similar concepts in different textual contexts. To cope with\naforementioned problems, some recent works focused on injecting knowledge graph\ninformation into LLMs. However, former methods either ignore the relational\nknowledge of the entities or lead to catastrophic forgetting. Therefore, we\npropose a novel framework to pre-train the powerful generative LLM by a corpus\nsynthesized from a KG. In the evaluations we are unable to confirm the benefit\nof including synonym, description or relational information.\n","authors":["Xi Yan","Cedric Möller","Ricardo Usbeck"],"pdf_url":"https://arxiv.org/pdf/2308.14429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14423v1","updated":"2023-08-28T09:04:03Z","published":"2023-08-28T09:04:03Z","title":"GADePo: Graph-Assisted Declarative Pooling Transformers for\n Document-Level Relation Extraction","summary":" Document-level relation extraction aims to identify relationships between\nentities within a document. Current methods rely on text-based encoders and\nemploy various hand-coded pooling heuristics to aggregate information from\nentity mentions and associated contexts. In this paper, we replace these rigid\npooling functions with explicit graph relations by leveraging the intrinsic\ngraph processing capabilities of the Transformer model. We propose a joint\ntext-graph Transformer model, and a graph-assisted declarative pooling (GADePo)\nspecification of the input which provides explicit and high-level instructions\nfor information aggregation. This allows the pooling process to be guided by\ndomain-specific knowledge or desired outcomes but still learned by the\nTransformer, leading to more flexible and customizable pooling strategies. We\nextensively evaluate our method across diverse datasets and models, and show\nthat our approach yields promising results that are comparable to those\nachieved by the hand-coded pooling functions.\n","authors":["Andrei C. Coman","Christos Theodoropoulos","Marie-Francine Moens","James Henderson"],"pdf_url":"https://arxiv.org/pdf/2308.14423v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11117v3","updated":"2023-08-28T08:44:09Z","published":"2023-03-20T13:58:35Z","title":"EmotionIC: Emotional Inertia and Contagion-Driven Dependency Modeling\n for Emotion Recognition in Conversation","summary":" Emotion Recognition in Conversation (ERC) has attracted growing attention in\nrecent years as a result of the advancement and implementation of\nhuman-computer interface technologies. In this paper, we propose a novel\napproach to dependency modeling driven by Emotional Inertia and Contagion\n(EmotionIC) for ERC task. Our EmotionIC consists of three main components,\ni.e., Identity Masked Multi-Head Attention (IMMHA), Dialogue-based Gated\nRecurrent Unit (DiaGRU), and Skip-chain Conditional Random Field (SkipCRF).\nCompared to previous ERC models, EmotionIC can model a conversation more\nthoroughly at both the feature-extraction and classification levels. The\nproposed model attempts to integrate the advantages of attention- and\nrecurrence-based methods at the feature-extraction level. Specifically, IMMHA\nis applied to capture identity-based global contextual dependencies, while\nDiaGRU is utilized to extract speaker- and temporal-aware local contextual\ninformation. At the classification level, SkipCRF can explicitly mine complex\nemotional flows from higher-order neighboring utterances in the conversation.\nExperimental results show that our method can significantly outperform the\nstate-of-the-art models on four benchmark datasets. The ablation studies\nconfirm that our modules can effectively model emotional inertia and contagion.\n","authors":["Yingjian Liu","Jiang Li","Xiaoping Wang","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2303.11117v3.pdf","comment":"19 pages,10 figures"},{"id":"http://arxiv.org/abs/2307.07924v3","updated":"2023-08-28T08:38:38Z","published":"2023-07-16T02:11:34Z","title":"Communicative Agents for Software Development","summary":" Software engineering is a domain characterized by intricate decision-making\nprocesses, often relying on nuanced intuition and consultation. Recent\nadvancements in deep learning have started to revolutionize software\nengineering practices through elaborate designs implemented at various stages\nof software development. In this paper, we present an innovative paradigm that\nleverages large language models (LLMs) throughout the entire software\ndevelopment process, streamlining and unifying key processes through natural\nlanguage communication, thereby eliminating the need for specialized models at\neach phase. At the core of this paradigm lies ChatDev, a virtual chat-powered\nsoftware development company that mirrors the established waterfall model,\nmeticulously dividing the development process into four distinct chronological\nstages: designing, coding, testing, and documenting. Each stage engages a team\nof agents, such as programmers, code reviewers, and test engineers, fostering\ncollaborative dialogue and facilitating a seamless workflow. The chat chain\nacts as a facilitator, breaking down each stage into atomic subtasks. This\nenables dual roles, allowing for proposing and validating solutions through\ncontext-aware communication, leading to efficient resolution of specific\nsubtasks. The instrumental analysis of ChatDev highlights its remarkable\nefficacy in software generation, enabling the completion of the entire software\ndevelopment process in under seven minutes at a cost of less than one dollar.\nIt not only identifies and alleviates potential vulnerabilities but also\nrectifies potential hallucinations while maintaining commendable efficiency and\ncost-effectiveness. The potential of ChatDev unveils fresh possibilities for\nintegrating LLMs into the realm of software development.\n","authors":["Chen Qian","Xin Cong","Wei Liu","Cheng Yang","Weize Chen","Yusheng Su","Yufan Dang","Jiahao Li","Juyuan Xu","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2307.07924v3.pdf","comment":"https://github.com/OpenBMB/ChatDev"},{"id":"http://arxiv.org/abs/2307.08487v3","updated":"2023-08-28T08:35:28Z","published":"2023-07-17T13:49:52Z","title":"Latent Jailbreak: A Benchmark for Evaluating Text Safety and Output\n Robustness of Large Language Models","summary":" Considerable research efforts have been devoted to ensuring that large\nlanguage models (LLMs) align with human values and generate safe text. However,\nan excessive focus on sensitivity to certain topics can compromise the model's\nrobustness in following instructions, thereby impacting its overall performance\nin completing tasks. Previous benchmarks for jailbreaking LLMs have primarily\nfocused on evaluating the safety of the models without considering their\nrobustness. In this paper, we propose a benchmark that assesses both the safety\nand robustness of LLMs, emphasizing the need for a balanced approach. To\ncomprehensively study text safety and output robustness, we introduce a latent\njailbreak prompt dataset, each involving malicious instruction embedding.\nSpecifically, we instruct the model to complete a regular task, such as\ntranslation, with the text to be translated containing malicious instructions.\nTo further analyze safety and robustness, we design a hierarchical annotation\nframework. We present a systematic analysis of the safety and robustness of\nLLMs regarding the position of explicit normal instructions, word replacements\n(verbs in explicit normal instructions, target groups in malicious\ninstructions, cue words for explicit normal instructions), and instruction\nreplacements (different explicit normal instructions). Our results demonstrate\nthat current LLMs not only prioritize certain instruction verbs but also\nexhibit varying jailbreak rates for different instruction verbs in explicit\nnormal instructions. Code and data are available at\nhttps://github.com/qiuhuachuan/latent-jailbreak.\n","authors":["Huachuan Qiu","Shuai Zhang","Anqi Li","Hongliang He","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2307.08487v3.pdf","comment":"Code and data are available at\n https://github.com/qiuhuachuan/latent-jailbreak"},{"id":"http://arxiv.org/abs/2308.14391v1","updated":"2023-08-28T08:14:20Z","published":"2023-08-28T08:14:20Z","title":"FIRE: Food Image to REcipe generation","summary":" Food computing has emerged as a prominent multidisciplinary field of research\nin recent years. An ambitious goal of food computing is to develop end-to-end\nintelligent systems capable of autonomously producing recipe information for a\nfood image. Current image-to-recipe methods are retrieval-based and their\nsuccess depends heavily on the dataset size and diversity, as well as the\nquality of learned embeddings. Meanwhile, the emergence of powerful\nattention-based vision and language models presents a promising avenue for\naccurate and generalizable recipe generation, which has yet to be extensively\nexplored. This paper proposes FIRE, a novel multimodal methodology tailored to\nrecipe generation in the food computing domain, which generates the food title,\ningredients, and cooking instructions based on input food images. FIRE\nleverages the BLIP model to generate titles, utilizes a Vision Transformer with\na decoder for ingredient extraction, and employs the T5 model to generate\nrecipes incorporating titles and ingredients as inputs. We showcase two\npractical applications that can benefit from integrating FIRE with large\nlanguage model prompting: recipe customization to fit recipes to user\npreferences and recipe-to-code transformation to enable automated cooking\nprocesses. Our experimental findings validate the efficacy of our proposed\napproach, underscoring its potential for future advancements and widespread\nadoption in food computing.\n","authors":["Prateek Chhikara","Dhiraj Chaurasia","Yifan Jiang","Omkar Masur","Filip Ilievski"],"pdf_url":"https://arxiv.org/pdf/2308.14391v1.pdf","comment":"5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.03104v5","updated":"2023-08-28T08:11:12Z","published":"2023-07-06T16:26:34Z","title":"Efficient Domain Adaptation of Sentence Embeddings Using Adapters","summary":" Sentence embeddings enable us to capture the semantic similarity of short\ntexts. Most sentence embedding models are trained for general semantic textual\nsimilarity tasks. Therefore, to use sentence embeddings in a particular domain,\nthe model must be adapted to it in order to achieve good results. Usually, this\nis done by fine-tuning the entire sentence embedding model for the domain of\ninterest. While this approach yields state-of-the-art results, all of the\nmodel's weights are updated during fine-tuning, making this method\nresource-intensive. Therefore, instead of fine-tuning entire sentence embedding\nmodels for each target domain individually, we propose to train lightweight\nadapters. These domain-specific adapters do not require fine-tuning all\nunderlying sentence embedding model parameters. Instead, we only train a small\nnumber of additional parameters while keeping the weights of the underlying\nsentence embedding model fixed. Training domain-specific adapters allows always\nusing the same base model and only exchanging the domain-specific adapters to\nadapt sentence embeddings to a specific domain. We show that using adapters for\nparameter-efficient domain adaptation of sentence embeddings yields competitive\nperformance within 1% of a domain-adapted, entirely fine-tuned sentence\nembedding model while only training approximately 3.6% of the parameters.\n","authors":["Tim Schopf","Dennis N. Schneider","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.03104v5.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2302.04391v5","updated":"2023-08-28T08:02:47Z","published":"2023-02-09T01:09:57Z","title":"The Re-Label Method For Data-Centric Machine Learning","summary":" In industry deep learning application, our manually labeled data has a\ncertain number of noisy data. To solve this problem and achieve more than 90\nscore in dev dataset, we present a simple method to find the noisy data and\nre-label the noisy data by human, given the model predictions as references in\nhuman labeling. In this paper, we illustrate our idea for a broad set of deep\nlearning tasks, includes classification, sequence tagging, object detection,\nsequence generation, click-through rate prediction. The experimental results\nand human evaluation results verify our idea.\n","authors":["Tong Guo"],"pdf_url":"https://arxiv.org/pdf/2302.04391v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09729v3","updated":"2023-08-28T07:37:36Z","published":"2023-08-17T16:59:50Z","title":"MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large\n Language Models","summary":" LLMs usually exhibit limitations in their ability to incorporate new\nknowledge, the generation of hallucinations, and the transparency of their\ndecision-making process. In this paper, we explore how to prompt LLMs with\nknowledge graphs (KG), working as a remedy to engage LLMs with up-to-date\nknowledge and elicit the reasoning pathways from LLMs. Specifically, we build a\nprompting pipeline that endows LLMs with the capability of comprehending KG\ninputs and inferring with a combined implicit knowledge and the retrieved\nexternal knowledge. In addition, we investigate eliciting the mind map on which\nLLMs perform the reasoning and generate the answers. It is identified that the\nproduced mind map exhibits the reasoning pathways of LLMs grounded on the\nontology of knowledge, hence bringing the prospects of probing and gauging LLM\ninference in production. The experiments on three question & answering datasets\nalso show that MindMap prompting leads to a striking empirical gain. For\ninstance, prompting a GPT-3.5 with MindMap yields an overwhelming performance\nover GPT-4 consistently. We also demonstrate that with structured facts\nretrieved from KG, MindMap can outperform a series of\nprompting-with-document-retrieval methods, benefiting from more accurate,\nconcise, and comprehensive knowledge from KGs.\n","authors":["Yilin Wen","Zifeng Wang","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09729v3.pdf","comment":"7 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2305.07011v4","updated":"2023-08-28T07:29:03Z","published":"2023-05-11T17:53:29Z","title":"Region-Aware Pretraining for Open-Vocabulary Object Detection with\n Vision Transformers","summary":" We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a\ncontrastive image-text pretraining recipe to bridge the gap between image-level\npretraining and open-vocabulary object detection. At the pretraining phase, we\npropose to randomly crop and resize regions of positional embeddings instead of\nusing the whole image positional embeddings. This better matches the use of\npositional embeddings at region-level in the detection finetuning phase. In\naddition, we replace the common softmax cross entropy loss in contrastive\nlearning with focal loss to better learn the informative yet difficult\nexamples. Finally, we leverage recent advances in novel object proposals to\nimprove open-vocabulary detection finetuning. We evaluate our full model on the\nLVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer.\nRO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best\nexisting approach by +7.8 points in addition to competitive zero-shot transfer\ndetection. Surprisingly, RO-ViT improves the image-level representation as well\nand achieves the state of the art on 9 out of 12 metrics on COCO and Flickr\nimage-text retrieval benchmarks, outperforming competitive approaches with\nlarger models.\n","authors":["Dahun Kim","Anelia Angelova","Weicheng Kuo"],"pdf_url":"https://arxiv.org/pdf/2305.07011v4.pdf","comment":"CVPR 2023 Highlight - https://github.com/mcahny/rovit ; adds LAION-2B\n result"},{"id":"http://arxiv.org/abs/2209.02552v2","updated":"2023-08-28T07:15:51Z","published":"2022-09-06T15:01:06Z","title":"Explaining Machine Learning Models in Natural Conversations: Towards a\n Conversational XAI Agent","summary":" The goal of Explainable AI (XAI) is to design methods to provide insights\ninto the reasoning process of black-box models, such as deep neural networks,\nin order to explain them to humans. Social science research states that such\nexplanations should be conversational, similar to human-to-human explanations.\nIn this work, we show how to incorporate XAI in a conversational agent, using a\nstandard design for the agent comprising natural language understanding and\ngeneration components. We build upon an XAI question bank which we extend by\nquality-controlled paraphrases to understand the user's information needs. We\nfurther systematically survey the literature for suitable explanation methods\nthat provide the information to answer those questions, and present a\ncomprehensive list of suggestions. Our work is the first step towards truly\nnatural conversations about machine learning models with an explanation agent.\nThe comprehensive list of XAI questions and the corresponding explanation\nmethods may support other researchers in providing the necessary information to\naddress users' demands.\n","authors":["Van Bach Nguyen","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2209.02552v2.pdf","comment":"Accepted at The World Conference on eXplainable Artificial\n Intelligence 2023 (XAI-2023)"},{"id":"http://arxiv.org/abs/2308.14359v1","updated":"2023-08-28T07:11:27Z","published":"2023-08-28T07:11:27Z","title":"Effect of Attention and Self-Supervised Speech Embeddings on\n Non-Semantic Speech Tasks","summary":" Human emotion understanding is pivotal in making conversational technology\nmainstream. We view speech emotion understanding as a perception task which is\na more realistic setting. With varying contexts (languages, demographics, etc.)\ndifferent share of people perceive the same speech segment as a non-unanimous\nemotion. As part of the ACM Multimedia 2023 Computational Paralinguistics\nChallengE (ComParE) in the EMotion Share track, we leverage their rich dataset\nof multilingual speakers and multi-label regression target of 'emotion share'\nor perception of that emotion. We demonstrate that the training scheme of\ndifferent foundation models dictates their effectiveness for tasks beyond\nspeech recognition, especially for non-semantic speech tasks like emotion\n understanding. This is a very complex task due to multilingual speakers,\nvariability in the target labels, and inherent imbalance in the regression\ndataset. Our results show that HuBERT-Large with a self-attention-based\nlight-weight sequence model provides 4.6% improvement over the reported\nbaseline.\n","authors":["Payal Mohapatra","Akash Pandey","Yueyuan Sui","Qi Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14359v1.pdf","comment":"Accepted to appear at ACM Multimedia 2023 Multimedia Grand Challenges\n Track"},{"id":"http://arxiv.org/abs/2308.14353v1","updated":"2023-08-28T06:56:44Z","published":"2023-08-28T06:56:44Z","title":"ZhuJiu: A Multi-dimensional, Multi-faceted Chinese Benchmark for Large\n Language Models","summary":" The unprecedented performance of large language models (LLMs) requires\ncomprehensive and accurate evaluation. We argue that for LLMs evaluation,\nbenchmarks need to be comprehensive and systematic. To this end, we propose the\nZhuJiu benchmark, which has the following strengths: (1) Multi-dimensional\nability coverage: We comprehensively evaluate LLMs across 7 ability dimensions\ncovering 51 tasks. Especially, we also propose a new benchmark that focuses on\nknowledge ability of LLMs. (2) Multi-faceted evaluation methods collaboration:\nWe use 3 different yet complementary evaluation methods to comprehensively\nevaluate LLMs, which can ensure the authority and accuracy of the evaluation\nresults. (3) Comprehensive Chinese benchmark: ZhuJiu is the pioneering\nbenchmark that fully assesses LLMs in Chinese, while also providing equally\nrobust evaluation abilities in English. (4) Avoiding potential data leakage: To\navoid data leakage, we construct evaluation data specifically for 37 tasks. We\nevaluate 10 current mainstream LLMs and conduct an in-depth discussion and\nanalysis of their results. The ZhuJiu benchmark and open-participation\nleaderboard are publicly released at http://www.zhujiu-benchmark.com/ and we\nalso provide a demo video at https://youtu.be/qypkJ89L1Ic.\n","authors":["Baoli Zhang","Haining Xie","Pengfan Du","Junhao Chen","Pengfei Cao","Yubo Chen","Shengping Liu","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.14353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14352v1","updated":"2023-08-28T06:56:08Z","published":"2023-08-28T06:56:08Z","title":"EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models","summary":" Large Language Models (LLMs) such as GPTs and LLaMa have ushered in a\nrevolution in machine intelligence, owing to their exceptional capabilities in\na wide range of machine learning tasks. However, the transition of LLMs from\ndata centers to edge devices presents a set of challenges and opportunities.\nWhile this shift can enhance privacy and availability, it is hampered by the\nenormous parameter sizes of these models, leading to impractical runtime costs.\nIn light of these considerations, we introduce EdgeMoE, the first on-device\ninference engine tailored for mixture-of-expert (MoE) LLMs, a popular variant\nof sparse LLMs that exhibit nearly constant computational complexity as their\nparameter size scales. EdgeMoE achieves both memory and computational\nefficiency by strategically partitioning the model across the storage\nhierarchy. Specifically, non-expert weights are stored in the device's memory,\nwhile expert weights are kept in external storage and are fetched into memory\nonly when they are activated. This design is underpinned by a crucial insight\nthat expert weights, though voluminous, are infrequently accessed due to sparse\nactivation patterns. To further mitigate the overhead associated with expert\nI/O swapping, EdgeMoE incorporates two innovative techniques: (1) Expert-wise\nbitwidth adaptation: This method reduces the size of expert weights with an\nacceptable level of accuracy loss. (2) Expert management: It predicts the\nexperts that will be activated in advance and preloads them into the\ncompute-I/O pipeline, thus further optimizing the process. In empirical\nevaluations conducted on well-established MoE LLMs and various edge devices,\nEdgeMoE demonstrates substantial memory savings and performance improvements\nwhen compared to competitive baseline solutions.\n","authors":["Rongjie Yi","Liwei Guo","Shiyun Wei","Ao Zhou","Shangguang Wang","Mengwei Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14346v1","updated":"2023-08-28T06:41:49Z","published":"2023-08-28T06:41:49Z","title":"DISC-MedLLM: Bridging General Large Language Models and Real-World\n Medical Consultation","summary":" We propose DISC-MedLLM, a comprehensive solution that leverages Large\nLanguage Models (LLMs) to provide accurate and truthful medical response in\nend-to-end conversational healthcare services. To construct high-quality\nSupervised Fine-Tuning (SFT) datasets, we employ three strategies: utilizing\nmedical knowledge-graphs, reconstructing real-world dialogues, and\nincorporating human-guided preference rephrasing. These datasets are\ninstrumental in training DISC-MedLLM, surpassing existing medical LLMs in both\nsingle-turn and multi-turn consultation scenarios. Extensive experimental\nresults demonstrate the effectiveness of the proposed model in bridging the gap\nbetween general language models and real-world medical consultation.\nAdditionally, we release the constructed dataset and model weights to further\ncontribute to research and development. Further details and resources can be\nfound at https://github.com/FudanDISC/DISC-MedLLM\n","authors":["Zhijie Bao","Wei Chen","Shengze Xiao","Kuang Ren","Jiaao Wu","Cheng Zhong","Jiajie Peng","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2308.14346v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.14337v1","updated":"2023-08-28T06:30:33Z","published":"2023-08-28T06:30:33Z","title":"Cognitive Effects in Large Language Models","summary":" Large Language Models (LLMs) such as ChatGPT have received enormous attention\nover the past year and are now used by hundreds of millions of people every\nday. The rapid adoption of this technology naturally raises questions about the\npossible biases such models might exhibit. In this work, we tested one of these\nmodels (GPT-3) on a range of cognitive effects, which are systematic patterns\nthat are usually found in human cognitive tasks. We found that LLMs are indeed\nprone to several human cognitive effects. Specifically, we show that the\npriming, distance, SNARC, and size congruity effects were presented with GPT-3,\nwhile the anchoring effect is absent. We describe our methodology, and\nspecifically the way we converted real-world experiments to text-based\nexperiments. Finally, we speculate on the possible reasons why GPT-3 exhibits\nthese effects and discuss whether they are imitated or reinvented.\n","authors":["Jonathan Shaki","Sarit Kraus","Michael Wooldridge"],"pdf_url":"https://arxiv.org/pdf/2308.14337v1.pdf","comment":"Accepted and will be published in the ECAI conference"},{"id":"http://arxiv.org/abs/2308.14321v1","updated":"2023-08-28T06:05:18Z","published":"2023-08-28T06:05:18Z","title":"Leveraging A Medical Knowledge Graph into Large Language Models for\n Diagnosis Prediction","summary":" Electronic Health Records (EHRs) and routine documentation practices play a\nvital role in patients' daily care, providing a holistic record of health,\ndiagnoses, and treatment. However, complex and verbose EHR narratives overload\nhealthcare providers, risking diagnostic inaccuracies. While Large Language\nModels (LLMs) have showcased their potential in diverse language tasks, their\napplication in the healthcare arena needs to ensure the minimization of\ndiagnostic errors and the prevention of patient harm. In this paper, we outline\nan innovative approach for augmenting the proficiency of LLMs in the realm of\nautomated diagnosis generation, achieved through the incorporation of a medical\nknowledge graph (KG) and a novel graph model: Dr.Knows, inspired by the\nclinical diagnostic reasoning process. We derive the KG from the National\nLibrary of Medicine's Unified Medical Language System (UMLS), a robust\nrepository of biomedical knowledge. Our method negates the need for\npre-training and instead leverages the KG as an auxiliary instrument aiding in\nthe interpretation and summarization of complex medical concepts. Using\nreal-world hospital datasets, our experimental results demonstrate that the\nproposed approach of combining LLMs with KG has the potential to improve the\naccuracy of automated diagnosis generation. More importantly, our approach\noffers an explainable diagnostic pathway, edging us closer to the realization\nof AI-augmented diagnostic decision support systems.\n","authors":["Yanjun Gao","Ruizhe Li","John Caskey","Dmitriy Dligach","Timothy Miller","Matthew M. Churpek","Majid Afshar"],"pdf_url":"https://arxiv.org/pdf/2308.14321v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2307.03109v7","updated":"2023-08-28T05:50:53Z","published":"2023-07-06T16:28:35Z","title":"A Survey on Evaluation of Large Language Models","summary":" Large language models (LLMs) are gaining increasing popularity in both\nacademia and industry, owing to their unprecedented performance in various\napplications. As LLMs continue to play a vital role in both research and daily\nuse, their evaluation becomes increasingly critical, not only at the task\nlevel, but also at the society level for better understanding of their\npotential risks. Over the past years, significant efforts have been made to\nexamine LLMs from various perspectives. This paper presents a comprehensive\nreview of these evaluation methods for LLMs, focusing on three key dimensions:\nwhat to evaluate, where to evaluate, and how to evaluate. Firstly, we provide\nan overview from the perspective of evaluation tasks, encompassing general\nnatural language processing tasks, reasoning, medical usage, ethics,\neducations, natural and social sciences, agent applications, and other areas.\nSecondly, we answer the `where' and `how' questions by diving into the\nevaluation methods and benchmarks, which serve as crucial components in\nassessing performance of LLMs. Then, we summarize the success and failure cases\nof LLMs in different tasks. Finally, we shed light on several future challenges\nthat lie ahead in LLMs evaluation. Our aim is to offer invaluable insights to\nresearchers in the realm of LLMs evaluation, thereby aiding the development of\nmore proficient LLMs. Our key point is that evaluation should be treated as an\nessential discipline to better assist the development of LLMs. We consistently\nmaintain the related open-source materials at:\nhttps://github.com/MLGroupJLU/LLM-eval-survey.\n","authors":["Yupeng Chang","Xu Wang","Jindong Wang","Yuan Wu","Linyi Yang","Kaijie Zhu","Hao Chen","Xiaoyuan Yi","Cunxiang Wang","Yidong Wang","Wei Ye","Yue Zhang","Yi Chang","Philip S. Yu","Qiang Yang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2307.03109v7.pdf","comment":"26 pages; a major update to include more recent works;\n https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2308.14306v1","updated":"2023-08-28T04:57:07Z","published":"2023-08-28T04:57:07Z","title":"Evaluating the Robustness to Instructions of Large Language Models","summary":" Recently, Instruction fine-tuning has risen to prominence as a potential\nmethod for enhancing the zero-shot capabilities of Large Language Models (LLMs)\non novel tasks. This technique has shown an exceptional ability to boost the\nperformance of moderately sized LLMs, sometimes even reaching performance\nlevels comparable to those of much larger model variants. The focus is on the\nrobustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an\nexploration of six models including Alpaca, Vicuna, WizardLM, and Traditional\nTask-oriented Models(Flan-T5-XL/XXL, T0++) using real-world relation extraction\ndatasets as case studies. We carried out a comprehensive evaluation of these\ninstruction-following LLMs which have been tuned based on open-domain\ninstructions and task-oriented instructions. The main discussion is their\nperformance and robustness towards instructions. We have observed that in most\ncases, the model's performance in dealing with unfamiliar instructions tends to\nworsen significantly, and the robustness of the model for RE instructions\ndeteriorates compared to QA. Further, we discovered that up until a certain\nparameter size threshold (3B), the performance of the FLAN-T5 model improves as\nthe parameter count increases. The robustness of different scales of FLAN-T5\nmodels to RE instruction is worse than the robustness to QA instruction.\n","authors":["Yuansheng Ni","Sichao Jiang","Xinyu wu","Hui Shen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.14306v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2308.06966v2","updated":"2023-08-28T04:12:30Z","published":"2023-08-14T06:49:53Z","title":"EcomGPT: Instruction-tuning Large Language Models with Chain-of-Task\n Tasks for E-commerce","summary":" Recently, instruction-following Large Language Models (LLMs) , represented by\nChatGPT, have exhibited exceptional performance in general Natural Language\nProcessing (NLP) tasks. However, the unique characteristics of E-commerce data\npose significant challenges to general LLMs. An LLM tailored specifically for\nE-commerce scenarios, possessing robust cross-dataset/task generalization\ncapabilities, is a pressing necessity. To solve this issue, in this work, we\nproposed the first e-commerce instruction dataset EcomInstruct, with a total of\n2.5 million instruction data. EcomInstruct scales up the data size and task\ndiversity by constructing atomic tasks with E-commerce basic data types, such\nas product information, user reviews. Atomic tasks are defined as intermediate\ntasks implicitly involved in solving a final task, which we also call\nChain-of-Task tasks. We developed EcomGPT with different parameter scales by\ntraining the backbone model BLOOMZ with the EcomInstruct. Benefiting from the\nfundamental semantic understanding capabilities acquired from the Chain-of-Task\ntasks, EcomGPT exhibits excellent zero-shot generalization capabilities.\nExtensive experiments and human evaluations demonstrate that EcomGPT\noutperforms ChatGPT in term of cross-dataset/task generalization on E-commerce\ntasks.\n","authors":["Yangning Li","Shirong Ma","Xiaobin Wang","Shen Huang","Chengyue Jiang","Hai-Tao Zheng","Pengjun Xie","Fei Huang","Yong Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.06966v2.pdf","comment":"Initial version of EcomGPT"},{"id":"http://arxiv.org/abs/2308.14280v1","updated":"2023-08-28T03:26:21Z","published":"2023-08-28T03:26:21Z","title":"FonMTL: Towards Multitask Learning for the Fon Language","summary":" The Fon language, spoken by an average 2 million of people, is a truly\nlow-resourced African language, with a limited online presence, and existing\ndatasets (just to name but a few). Multitask learning is a learning paradigm\nthat aims to improve the generalization capacity of a model by sharing\nknowledge across different but related tasks: this could be prevalent in very\ndata-scarce scenarios. In this paper, we present the first explorative approach\nto multitask learning, for model capabilities enhancement in Natural Language\nProcessing for the Fon language. Specifically, we explore the tasks of Named\nEntity Recognition (NER) and Part of Speech Tagging (POS) for Fon. We leverage\ntwo language model heads as encoders to build shared representations for the\ninputs, and we use linear layers blocks for classification relative to each\ntask. Our results on the NER and POS tasks for Fon, show competitive (or\nbetter) performances compared to several multilingual pretrained language\nmodels finetuned on single tasks. Additionally, we perform a few ablation\nstudies to leverage the efficiency of two different loss combination strategies\nand find out that the equal loss weighting approach works best in our case. Our\ncode is open-sourced at https://github.com/bonaventuredossou/multitask_fon.\n","authors":["Bonaventure F. P. Dossou","Iffanice Houndayi","Pamely Zantou","Gilles Hacheme"],"pdf_url":"https://arxiv.org/pdf/2308.14280v1.pdf","comment":"Accepted at WiNLP workshop, co-located at EMNLP 2023"},{"id":"http://arxiv.org/abs/2308.14272v1","updated":"2023-08-28T03:03:03Z","published":"2023-08-28T03:03:03Z","title":"Goodhart's Law Applies to NLP's Explanation Benchmarks","summary":" Despite the rising popularity of saliency-based explanations, the research\ncommunity remains at an impasse, facing doubts concerning their purpose,\nefficacy, and tendency to contradict each other. Seeking to unite the\ncommunity's efforts around common goals, several recent works have proposed\nevaluation metrics. In this paper, we critically examine two sets of metrics:\nthe ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics,\nfocusing our inquiry on natural language processing. First, we show that we can\ninflate a model's comprehensiveness and sufficiency scores dramatically without\naltering its predictions or explanations on in-distribution test inputs. Our\nstrategy exploits the tendency for extracted explanations and their complements\nto be \"out-of-support\" relative to each other and in-distribution inputs. Next,\nwe demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple\nmethod that encodes the label, even though EVAL-X is precisely motivated to\naddress such exploits. Our results raise doubts about the ability of current\nmetrics to guide explainability research, underscoring the need for a broader\nreassessment of what precisely these metrics are intended to capture.\n","authors":["Jennifer Hsia","Danish Pruthi","Aarti Singh","Zachary C. Lipton"],"pdf_url":"https://arxiv.org/pdf/2308.14272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14266v1","updated":"2023-08-28T02:48:49Z","published":"2023-08-28T02:48:49Z","title":"SalesBot 2.0: A Human-Like Intent-Guided Chit-Chat Dataset","summary":" In recent research on dialogue systems and corpora, there has been a\nsignificant focus on two distinct categories: task-oriented (TOD) and\nopen-domain (chit-chat) dialogues. TOD systems aim to satisfy specific user\ngoals, such as finding a movie to watch, whereas open-domain systems primarily\nfocus on generating engaging conversations. A recent study by Chiu et al.\n(2022) introduced SalesBot, which provides simulators and a dataset with\none-turn transition from chit-chat to task-oriented dialogues. However, the\npreviously generated data solely relied on BlenderBot, which raised concerns\nabout its long-turn naturalness and consistency during a conversation. To\naddress this issue, this paper aims to build SalesBot 2.0, a revised version of\nthe published data, by leveraging the commonsense knowledge of large language\nmodels (LLMs) through proper prompting. The objective is to gradually bridge\nthe gap between chit-chat and TOD towards better naturalness and consistency.\nThe newly released large-scale dataset with detailed annotations exhibits\nsmoother transitions between topics and is more human-like in terms of\nnaturalness and consistency. It can serve as a valuable resource for both\nacademic research and commercial applications. Furthermore, our proposed\nframework can be applied to generate numerous dialogues with various target\nintents.\n","authors":["Wen-Yu Chang","Yun-Nung Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14242v1","updated":"2023-08-28T01:05:18Z","published":"2023-08-28T01:05:18Z","title":"The Cultural Psychology of Large Language Models: Is ChatGPT a Holistic\n or Analytic Thinker?","summary":" The prevalent use of Large Language Models (LLMs) has necessitated studying\ntheir mental models, yielding noteworthy theoretical and practical\nimplications. Current research has demonstrated that state-of-the-art LLMs,\nsuch as ChatGPT, exhibit certain theory of mind capabilities and possess\nrelatively stable Big Five and/or MBTI personality traits. In addition,\ncognitive process features form an essential component of these mental models.\nResearch in cultural psychology indicated significant differences in the\ncognitive processes of Eastern and Western people when processing information\nand making judgments. While Westerners predominantly exhibit analytical\nthinking that isolates things from their environment to analyze their nature\nindependently, Easterners often showcase holistic thinking, emphasizing\nrelationships and adopting a global viewpoint. In our research, we probed the\ncultural cognitive traits of ChatGPT. We employed two scales that directly\nmeasure the cognitive process: the Analysis-Holism Scale (AHS) and the Triadic\nCategorization Task (TCT). Additionally, we used two scales that investigate\nthe value differences shaped by cultural thinking: the Dialectical Self Scale\n(DSS) and the Self-construal Scale (SCS). In cognitive process tests (AHS/TCT),\nChatGPT consistently tends towards Eastern holistic thinking, but regarding\nvalue judgments (DSS/SCS), ChatGPT does not significantly lean towards the East\nor the West. We suggest that the result could be attributed to both the\ntraining paradigm and the training data in LLM development. We discuss the\npotential value of this finding for AI research and directions for future\nresearch.\n","authors":["Chuanyang Jin","Songyang Zhang","Tianmin Shu","Zhihan Cui"],"pdf_url":"https://arxiv.org/pdf/2308.14242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01002v2","updated":"2023-08-28T00:57:19Z","published":"2023-04-03T14:06:47Z","title":"Does Human Collaboration Enhance the Accuracy of Identifying\n LLM-Generated Deepfake Texts?","summary":" Advances in Large Language Models (e.g., GPT-4, LLaMA) have improved the\ngeneration of coherent sentences resembling human writing on a large scale,\nresulting in the creation of so-called deepfake texts. However, this progress\nposes security and privacy concerns, necessitating effective solutions for\ndistinguishing deepfake texts from human-written ones. Although prior works\nstudied humans' ability to detect deepfake texts, none has examined whether\n\"collaboration\" among humans improves the detection of deepfake texts. In this\nstudy, to address this gap of understanding on deepfake texts, we conducted\nexperiments with two groups: (1) nonexpert individuals from the AMT platform\nand (2) writing experts from the Upwork platform. The results demonstrate that\ncollaboration among humans can potentially improve the detection of deepfake\ntexts for both groups, increasing detection accuracies by 6.36% for non-experts\nand 12.76% for experts, respectively, compared to individuals' detection\naccuracies. We further analyze the explanations that humans used for detecting\na piece of text as deepfake text, and find that the strongest indicator of\ndeepfake texts is their lack of coherence and consistency. Our study provides\nuseful insights for future tools and framework designs to facilitate the\ncollaborative human detection of deepfake texts. The experiment datasets and\nAMT implementations are available at:\nhttps://github.com/huashen218/llm-deepfake-human-study.git\n","authors":["Adaku Uchendu","Jooyoung Lee","Hua Shen","Thai Le","Ting-Hao 'Kenneth' Huang","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2304.01002v2.pdf","comment":"Accepted at The 11th AAAI Conference on Human Computation and\n Crowdsourcing (HCOMP 2023)"},{"id":"http://arxiv.org/abs/2305.17118v2","updated":"2023-08-28T22:48:46Z","published":"2023-05-26T17:39:58Z","title":"Scissorhands: Exploiting the Persistence of Importance Hypothesis for\n LLM KV Cache Compression at Test Time","summary":" Large language models(LLMs) have sparked a new wave of exciting AI\napplications. Hosting these models at scale requires significant memory\nresources. One crucial memory bottleneck for the deployment stems from the\ncontext window. It is commonly recognized that model weights are memory hungry;\nhowever, the size of key-value embedding stored during the generation process\n(KV cache) can easily surpass the model size. The enormous size of the KV cache\nputs constraints on the inference batch size, which is crucial for high\nthroughput inference workload. Inspired by an interesting observation of the\nattention scores, we hypothesize the persistence of importance: only pivotal\ntokens, which had a substantial influence at one step, will significantly\ninfluence future generations. Based on our empirical verification and\ntheoretical analysis around this hypothesis, we propose Scissorhands, a system\nthat maintains the memory usage of the KV cache at a fixed budget without\nfinetuning the model. In essence, Scissorhands manages the KV cache by storing\nthe pivotal tokens with a higher probability. We validate that Scissorhands\nreduces the inference memory usage of the KV cache by up to 5X without\ncompromising model quality. We further demonstrate that Scissorhands can be\ncombined with 4-bit quantization, traditionally used to compress model weights,\nto achieve up to 20X compression.\n","authors":["Zichang Liu","Aditya Desai","Fangshuo Liao","Weitao Wang","Victor Xie","Zhaozhuo Xu","Anastasios Kyrillidis","Anshumali Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2305.17118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14921v1","updated":"2023-08-28T22:32:05Z","published":"2023-08-28T22:32:05Z","title":"Gender bias and stereotypes in Large Language Models","summary":" Large Language Models (LLMs) have made substantial progress in the past\nseveral months, shattering state-of-the-art benchmarks in many domains. This\npaper investigates LLMs' behavior with respect to gender stereotypes, a known\nissue for prior models. We use a simple paradigm to test the presence of gender\nbias, building on but differing from WinoBias, a commonly used gender bias\ndataset, which is likely to be included in the training data of current LLMs.\nWe test four recently published LLMs and demonstrate that they express biased\nassumptions about men and women's occupations. Our contributions in this paper\nare as follows: (a) LLMs are 3-6 times more likely to choose an occupation that\nstereotypically aligns with a person's gender; (b) these choices align with\npeople's perceptions better than with the ground truth as reflected in official\njob statistics; (c) LLMs in fact amplify the bias beyond what is reflected in\nperceptions or the ground truth; (d) LLMs ignore crucial ambiguities in\nsentence structure 95% of the time in our study items, but when explicitly\nprompted, they recognize the ambiguity; (e) LLMs provide explanations for their\nchoices that are factually inaccurate and likely obscure the true reason behind\ntheir predictions. That is, they provide rationalizations of their biased\nbehavior. This highlights a key property of these models: LLMs are trained on\nimbalanced datasets; as such, even with the recent successes of reinforcement\nlearning with human feedback, they tend to reflect those imbalances back at us.\nAs with other types of societal biases, we suggest that LLMs must be carefully\ntested to ensure that they treat minoritized individuals and communities\nequitably.\n","authors":["Hadas Kotek","Rikker Dockum","David Q. Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14921v1.pdf","comment":"ACM Collective Intelligence"},{"id":"http://arxiv.org/abs/2308.14905v1","updated":"2023-08-28T21:16:08Z","published":"2023-08-28T21:16:08Z","title":"Neural approaches to spoken content embedding","summary":" Comparing spoken segments is a central operation to speech processing.\nTraditional approaches in this area have favored frame-level dynamic\nprogramming algorithms, such as dynamic time warping, because they require no\nsupervision, but they are limited in performance and efficiency. As an\nalternative, acoustic word embeddings -- fixed-dimensional vector\nrepresentations of variable-length spoken word segments -- have begun to be\nconsidered for such tasks as well. However, the current space of such\ndiscriminative embedding models, training approaches, and their application to\nreal-world downstream tasks is limited. We start by considering ``single-view\"\ntraining losses where the goal is to learn an acoustic word embedding model\nthat separates same-word and different-word spoken segment pairs. Then, we\nconsider ``multi-view\" contrastive losses. In this setting, acoustic word\nembeddings are learned jointly with embeddings of character sequences to\ngenerate acoustically grounded embeddings of written words, or acoustically\ngrounded word embeddings.\n In this thesis, we contribute new discriminative acoustic word embedding\n(AWE) and acoustically grounded word embedding (AGWE) approaches based on\nrecurrent neural networks (RNNs). We improve model training in terms of both\nefficiency and performance. We take these developments beyond English to\nseveral low-resource languages and show that multilingual training improves\nperformance when labeled data is limited. We apply our embedding models, both\nmonolingual and multilingual, to the downstream tasks of query-by-example\nspeech search and automatic speech recognition. Finally, we show how our\nembedding approaches compare with and complement more recent self-supervised\nspeech models.\n","authors":["Shane Settle"],"pdf_url":"https://arxiv.org/pdf/2308.14905v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2306.06826v2","updated":"2023-08-28T21:14:35Z","published":"2023-06-12T02:26:00Z","title":"When Do Annotator Demographics Matter? Measuring the Influence of\n Annotator Demographics with the POPQUORN Dataset","summary":" Annotators are not fungible. Their demographics, life experiences, and\nbackgrounds all contribute to how they label data. However, NLP has only\nrecently considered how annotator identity might influence their decisions.\nHere, we present POPQUORN (the POtato-Prolific dataset for QUestion-Answering,\nOffensiveness, text Rewriting, and politeness rating with demographic Nuance).\nPOPQUORN contains 45,000 annotations from 1,484 annotators, drawn from a\nrepresentative sample regarding sex, age, and race as the US population.\nThrough a series of analyses, we show that annotators' background plays a\nsignificant role in their judgments. Further, our work shows that backgrounds\nnot previously considered in NLP (e.g., education), are meaningful and should\nbe considered. Our study suggests that understanding the background of\nannotators and collecting labels from a demographically balanced pool of crowd\nworkers is important to reduce the bias of datasets. The dataset, annotator\nbackground, and annotation interface are available at\nhttps://github.com/Jiaxin-Pei/potato-prolific-dataset .\n","authors":["Jiaxin Pei","David Jurgens"],"pdf_url":"https://arxiv.org/pdf/2306.06826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14903v1","updated":"2023-08-28T21:11:18Z","published":"2023-08-28T21:11:18Z","title":"MEMORY-VQ: Compression for Tractable Internet-Scale Memory","summary":" Retrieval augmentation is a powerful but expensive method to make language\nmodels more knowledgeable about the world. Memory-based methods like LUMEN\npre-compute token representations for retrieved passages to drastically speed\nup inference. However, memory also leads to much greater storage requirements\nfrom storing pre-computed representations.\n We propose MEMORY-VQ, a new method to reduce storage requirements of\nmemory-augmented models without sacrificing performance. Our method uses a\nvector quantization variational autoencoder (VQ-VAE) to compress token\nrepresentations. We apply MEMORY-VQ to the LUMEN model to obtain LUMEN-VQ, a\nmemory model that achieves a 16x compression rate with comparable performance\non the KILT benchmark. LUMEN-VQ enables practical retrieval augmentation even\nfor extremely large retrieval corpora.\n","authors":["Yury Zemlyanskiy","Michiel de Jong","Luke Vilnis","Santiago Ontañón","William W. Cohen","Sumit Sanghai","Joshua Ainslie"],"pdf_url":"https://arxiv.org/pdf/2308.14903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14894v1","updated":"2023-08-28T20:31:45Z","published":"2023-08-28T20:31:45Z","title":"Multiscale Contextual Learning for Speech Emotion Recognition in\n Emergency Call Center Conversations","summary":" Emotion recognition in conversations is essential for ensuring advanced\nhuman-machine interactions. However, creating robust and accurate emotion\nrecognition systems in real life is challenging, mainly due to the scarcity of\nemotion datasets collected in the wild and the inability to take into account\nthe dialogue context. The CEMO dataset, composed of conversations between\nagents and patients during emergency calls to a French call center, fills this\ngap. The nature of these interactions highlights the role of the emotional flow\nof the conversation in predicting patient emotions, as context can often make a\ndifference in understanding actual feelings. This paper presents a multi-scale\nconversational context learning approach for speech emotion recognition, which\ntakes advantage of this hypothesis. We investigated this approach on both\nspeech transcriptions and acoustic segments. Experimentally, our method uses\nthe previous or next information of the targeted segment. In the text domain,\nwe tested the context window using a wide range of tokens (from 10 to 100) and\nat the speech turns level, considering inputs from both the same and opposing\nspeakers. According to our tests, the context derived from previous tokens has\na more significant influence on accurate prediction than the following tokens.\nFurthermore, taking the last speech turn of the same speaker in the\nconversation seems useful. In the acoustic domain, we conducted an in-depth\nanalysis of the impact of the surrounding emotions on the prediction. While\nmulti-scale conversational context learning using Transformers can enhance\nperformance in the textual modality for emergency call recordings,\nincorporating acoustic context is more challenging.\n","authors":["Théo Deschamps-Berger","Lori Lamel","Laurence Devillers"],"pdf_url":"https://arxiv.org/pdf/2308.14894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19370v3","updated":"2023-08-28T20:13:33Z","published":"2023-05-30T19:25:51Z","title":"Blockwise Parallel Transformer for Large Context Models","summary":" Transformers have emerged as the cornerstone of state-of-the-art natural\nlanguage processing models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands posed by the\nself-attention mechanism and the large feedforward network in Transformers\nlimit their ability to handle long sequences, thereby creating challenges for\ntasks involving multiple long sequences or long-term dependencies. We present a\ndistinct approach, Blockwise Parallel Transformer (BPT), that leverages\nblockwise computation of self-attention and feedforward network fusion to\nminimize memory costs. By processing longer input sequences while maintaining\nmemory efficiency, BPT enables training sequences 32 times longer than vanilla\nTransformers and up to 4 times longer than previous memory-efficient methods.\nExtensive experiments on language modeling and reinforcement learning tasks\ndemonstrate the effectiveness of BPT in reducing memory requirements and\nimproving performance.\n","authors":["Hao Liu","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2305.19370v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14873v1","updated":"2023-08-28T19:52:18Z","published":"2023-08-28T19:52:18Z","title":"CommunityFish: A Poisson-based Document Scaling With Hierarchical\n Clustering","summary":" Document scaling has been a key component in text-as-data applications for\nsocial scientists and a major field of interest for political researchers, who\naim at uncovering differences between speakers or parties with the help of\ndifferent probabilistic and non-probabilistic approaches. Yet, most of these\ntechniques are either built upon the agnostically bag-of-word hypothesis or use\nprior information borrowed from external sources that might embed the results\nwith a significant bias. If the corpus has long been considered as a collection\nof documents, it can also be seen as a dense network of connected words whose\nstructure could be clustered to differentiate independent groups of words,\nbased on their co-occurrences in documents, known as communities. This paper\nintroduces CommunityFish as an augmented version of Wordfish based on a\nhierarchical clustering, namely the Louvain algorithm, on the word space to\nyield communities as semantic and independent n-grams emerging from the corpus\nand use them as an input to Wordfish method, instead of considering the word\nspace. This strategy emphasizes the interpretability of the results, since\ncommunities have a non-overlapping structure, hence a crucial informative power\nin discriminating parties or speakers, in addition to allowing a faster\nexecution of the Poisson scaling model. Aside from yielding communities,\nassumed to be subtopic proxies, the application of this technique outperforms\nthe classic Wordfish model by highlighting historical developments in the U.S.\nState of the Union addresses and was found to replicate the prevailing\npolitical stance in Germany when using the corpus of parties' legislative\nmanifestos.\n","authors":["Sami Diaf"],"pdf_url":"https://arxiv.org/pdf/2308.14873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14850v1","updated":"2023-08-28T19:11:52Z","published":"2023-08-28T19:11:52Z","title":"Attention Visualizer Package: Revealing Word Importance for Deeper\n Insight into Encoder-Only Transformer Models","summary":" This report introduces the Attention Visualizer package, which is crafted to\nvisually illustrate the significance of individual words in encoder-only\ntransformer-based models. In contrast to other methods that center on tokens\nand self-attention scores, our approach will examine the words and their impact\non the final embedding representation. Libraries like this play a crucial role\nin enhancing the interpretability and explainability of neural networks. They\noffer the opportunity to illuminate their internal mechanisms, providing a\nbetter understanding of how they operate and can be enhanced. You can access\nthe code and review examples on the following GitHub repository:\nhttps://github.com/AlaFalaki/AttentionVisualizer.\n","authors":["Ala Alam Falaki","Robin Gras"],"pdf_url":"https://arxiv.org/pdf/2308.14850v1.pdf","comment":"12 pages, 15 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.14753v1","updated":"2023-08-28T17:59:47Z","published":"2023-08-28T17:59:47Z","title":"Efficient Discovery and Effective Evaluation of Visual Perceptual\n Similarity: A Benchmark and Beyond","summary":" Visual similarities discovery (VSD) is an important task with broad\ne-commerce applications. Given an image of a certain object, the goal of VSD is\nto retrieve images of different objects with high perceptual visual similarity.\nAlthough being a highly addressed problem, the evaluation of proposed methods\nfor VSD is often based on a proxy of an identification-retrieval task,\nevaluating the ability of a model to retrieve different images of the same\nobject. We posit that evaluating VSD methods based on identification tasks is\nlimited, and faithful evaluation must rely on expert annotations. In this\npaper, we introduce the first large-scale fashion visual similarity benchmark\ndataset, consisting of more than 110K expert-annotated image pairs. Besides\nthis major contribution, we share insight from the challenges we faced while\ncurating this dataset. Based on these insights, we propose a novel and\nefficient labeling procedure that can be applied to any dataset. Our analysis\nexamines its limitations and inductive biases, and based on these findings, we\npropose metrics to mitigate those limitations. Though our primary focus lies on\nvisual similarity, the methodologies we present have broader applications for\ndiscovering and evaluating perceptual similarity across various domains.\n","authors":["Oren Barkan","Tal Reiss","Jonathan Weill","Ori Katz","Roy Hirsch","Itzik Malkiel","Noam Koenigstein"],"pdf_url":"https://arxiv.org/pdf/2308.14753v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14749v1","updated":"2023-08-28T17:56:22Z","published":"2023-08-28T17:56:22Z","title":"MagicEdit: High-Fidelity and Temporally Coherent Video Editing","summary":" In this report, we present MagicEdit, a surprisingly simple yet effective\nsolution to the text-guided video editing task. We found that high-fidelity and\ntemporally coherent video-to-video translation can be achieved by explicitly\ndisentangling the learning of content, structure and motion signals during\ntraining. This is in contradict to most existing methods which attempt to\njointly model both the appearance and temporal representation within a single\nframework, which we argue, would lead to degradation in per-frame quality.\nDespite its simplicity, we show that MagicEdit supports various downstream\nvideo editing tasks, including video stylization, local editing, video-MagicMix\nand video outpainting.\n","authors":["Jun Hao Liew","Hanshu Yan","Jianfeng Zhang","Zhongcong Xu","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2308.14749v1.pdf","comment":"Project page: https://magic-edit.github.io/"},{"id":"http://arxiv.org/abs/2308.14748v1","updated":"2023-08-28T17:56:18Z","published":"2023-08-28T17:56:18Z","title":"MagicAvatar: Multimodal Avatar Generation and Animation","summary":" This report presents MagicAvatar, a framework for multimodal video generation\nand animation of human avatars. Unlike most existing methods that generate\navatar-centric videos directly from multimodal inputs (e.g., text prompts),\nMagicAvatar explicitly disentangles avatar video generation into two stages:\n(1) multimodal-to-motion and (2) motion-to-video generation. The first stage\ntranslates the multimodal inputs into motion/ control signals (e.g., human\npose, depth, DensePose); while the second stage generates avatar-centric video\nguided by these motion signals. Additionally, MagicAvatar supports avatar\nanimation by simply providing a few images of the target person. This\ncapability enables the animation of the provided human identity according to\nthe specific motion derived from the first stage. We demonstrate the\nflexibility of MagicAvatar through various applications, including text-guided\nand video-guided avatar generation, as well as multimodal avatar animation.\n","authors":["Jianfeng Zhang","Hanshu Yan","Zhongcong Xu","Jiashi Feng","Jun Hao Liew"],"pdf_url":"https://arxiv.org/pdf/2308.14748v1.pdf","comment":"Project page: https://magic-avatar.github.io/"},{"id":"http://arxiv.org/abs/2308.14746v1","updated":"2023-08-28T17:55:33Z","published":"2023-08-28T17:55:33Z","title":"CoVR: Learning Composed Video Retrieval from Web Video Captions","summary":" Composed Image Retrieval (CoIR) has recently gained popularity as a task that\nconsiders both text and image queries together, to search for relevant images\nin a database. Most CoIR approaches require manually annotated datasets,\ncomprising image-text-image triplets, where the text describes a modification\nfrom the query image to the target image. However, manual curation of CoIR\ntriplets is expensive and prevents scalability. In this work, we instead\npropose a scalable automatic dataset creation methodology that generates\ntriplets given video-caption pairs, while also expanding the scope of the task\nto include composed video retrieval (CoVR). To this end, we mine paired videos\nwith a similar caption from a large database, and leverage a large language\nmodel to generate the corresponding modification text. Applying this\nmethodology to the extensive WebVid2M collection, we automatically construct\nour WebVid-CoVR dataset, resulting in 1.6 million triplets. Moreover, we\nintroduce a new benchmark for CoVR with a manually annotated evaluation set,\nalong with baseline results. Our experiments further demonstrate that training\na CoVR model on our dataset effectively transfers to CoIR, leading to improved\nstate-of-the-art performance in the zero-shot setup on both the CIRR and\nFashionIQ benchmarks. Our code, datasets, and models are publicly available at\nhttps://imagine.enpc.fr/~ventural/covr.\n","authors":["Lucas Ventura","Antoine Yang","Cordelia Schmid","Gül Varol"],"pdf_url":"https://arxiv.org/pdf/2308.14746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04688v3","updated":"2023-08-28T17:51:52Z","published":"2023-04-10T16:08:59Z","title":"Interaction-Aware Prompting for Zero-Shot Spatio-Temporal Action\n Detection","summary":" The goal of spatial-temporal action detection is to determine the time and\nplace where each person's action occurs in a video and classify the\ncorresponding action category. Most of the existing methods adopt\nfully-supervised learning, which requires a large amount of training data,\nmaking it very difficult to achieve zero-shot learning. In this paper, we\npropose to utilize a pre-trained visual-language model to extract the\nrepresentative image and text features, and model the relationship between\nthese features through different interaction modules to obtain the interaction\nfeature. In addition, we use this feature to prompt each label to obtain more\nappropriate text features. Finally, we calculate the similarity between the\ninteraction feature and the text feature for each label to determine the action\ncategory. Our experiments on J-HMDB and UCF101-24 datasets demonstrate that the\nproposed interaction module and prompting make the visual-language features\nbetter aligned, thus achieving excellent accuracy for zero-shot spatio-temporal\naction detection. The code will be available at\nhttps://github.com/webber2933/iCLIP.\n","authors":["Wei-Jhe Huang","Jheng-Hsien Yeh","Min-Hung Chen","Gueter Josmy Faure","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2304.04688v3.pdf","comment":"Accepted by ICCVW 2023 (What is Next in Multimodal Foundation\n Models?)"},{"id":"http://arxiv.org/abs/2308.11487v2","updated":"2023-08-28T17:42:45Z","published":"2023-08-22T15:06:14Z","title":"Free Lunch for Gait Recognition: A Novel Relation Descriptor","summary":" Gait recognition is to seek correct matches for query individuals by their\nunique walking patterns. However, current methods focus solely on extracting\nindividual-specific features, overlooking inter-personal relationships. In this\npaper, we propose a novel $\\textbf{Relation Descriptor}$ that captures not only\nindividual features but also relations between test gaits and pre-selected\nanchored gaits. Specifically, we reinterpret classifier weights as anchored\ngaits and compute similarity scores between test features and these anchors,\nwhich re-expresses individual gait features into a similarity relation\ndistribution. In essence, the relation descriptor offers a holistic perspective\nthat leverages the collective knowledge stored within the classifier's weights,\nemphasizing meaningful patterns and enhancing robustness. Despite its\npotential, relation descriptor poses dimensionality challenges since its\ndimension depends on the training set's identity count. To address this, we\npropose the Farthest Anchored-gait Selection to identify the most\ndiscriminative anchored gaits and an Orthogonal Regularization to increase\ndiversity within anchored gaits. Compared to individual-specific features\nextracted from the backbone, our relation descriptor can boost the performances\nnearly without any extra costs. We evaluate the effectiveness of our method on\nthe popular GREW, Gait3D, CASIA-B, and OU-MVLP, showing that our method\nconsistently outperforms the baselines and achieves state-of-the-art\nperformances.\n","authors":["Jilong Wang","Saihui Hou","Yan Huang","Chunshui Cao","Xu Liu","Yongzhen Huang","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11487v2.pdf","comment":"Add new figures and fix some typos"},{"id":"http://arxiv.org/abs/2308.14740v1","updated":"2023-08-28T17:41:14Z","published":"2023-08-28T17:41:14Z","title":"Total Selfie: Generating Full-Body Selfies","summary":" We present a method to generate full-body selfies -- photos that you take of\nyourself, but capturing your whole body as if someone else took the photo of\nyou from a few feet away. Our approach takes as input a pre-captured video of\nyour body, a target pose photo, and a selfie + background pair for each\nlocation. We introduce a novel diffusion-based approach to combine all of this\ninformation into high quality, well-composed photos of you with the desired\npose and background.\n","authors":["Bowei Chen","Brian Curless","Ira Kemelmacher-Shlizerman","Steve Seitz"],"pdf_url":"https://arxiv.org/pdf/2308.14740v1.pdf","comment":"Project page:\n https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/"},{"id":"http://arxiv.org/abs/2308.14737v1","updated":"2023-08-28T17:38:31Z","published":"2023-08-28T17:38:31Z","title":"Flexible Techniques for Differentiable Rendering with 3D Gaussians","summary":" Fast, reliable shape reconstruction is an essential ingredient in many\ncomputer vision applications. Neural Radiance Fields demonstrated that\nphotorealistic novel view synthesis is within reach, but was gated by\nperformance requirements for fast reconstruction of real scenes and objects.\nSeveral recent approaches have built on alternative shape representations, in\nparticular, 3D Gaussians. We develop extensions to these renderers, such as\nintegrating differentiable optical flow, exporting watertight meshes and\nrendering per-ray normals. Additionally, we show how two of the recent methods\nare interoperable with each other. These reconstructions are quick, robust, and\neasily performed on GPU or CPU. For code and visual examples, see\nhttps://leonidk.github.io/fmb-plus\n","authors":["Leonid Keselman","Martial Hebert"],"pdf_url":"https://arxiv.org/pdf/2308.14737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14726v1","updated":"2023-08-28T17:30:14Z","published":"2023-08-28T17:30:14Z","title":"PanoSwin: a Pano-style Swin Transformer for Panorama Understanding","summary":" In panorama understanding, the widely used equirectangular projection (ERP)\nentails boundary discontinuity and spatial distortion. It severely deteriorates\nthe conventional CNNs and vision Transformers on panoramas. In this paper, we\npropose a simple yet effective architecture named PanoSwin to learn panorama\nrepresentations with ERP. To deal with the challenges brought by\nequirectangular projection, we explore a pano-style shift windowing scheme and\nnovel pitch attention to address the boundary discontinuity and the spatial\ndistortion, respectively. Besides, based on spherical distance and Cartesian\ncoordinates, we adapt absolute positional embeddings and relative positional\nbiases for panoramas to enhance panoramic geometry information. Realizing that\nplanar image understanding might share some common knowledge with panorama\nunderstanding, we devise a novel two-stage learning framework to facilitate\nknowledge transfer from the planar images to panoramas. We conduct experiments\nagainst the state-of-the-art on various panoramic tasks, i.e., panoramic object\ndetection, panoramic classification, and panoramic layout estimation. The\nexperimental results demonstrate the effectiveness of PanoSwin in panorama\nunderstanding.\n","authors":["Zhixin Ling","Zhen Xing","Xiangdong Zhou","Manliang Cao","Guichun Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.14726v1.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2308.14713v1","updated":"2023-08-28T17:13:49Z","published":"2023-08-28T17:13:49Z","title":"R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras","summary":" Dense 3D reconstruction and ego-motion estimation are key challenges in\nautonomous driving and robotics. Compared to the complex, multi-modal systems\ndeployed today, multi-camera systems provide a simpler, low-cost alternative.\nHowever, camera-based 3D reconstruction of complex dynamic scenes has proven\nextremely difficult, as existing solutions often produce incomplete or\nincoherent results. We propose R3D3, a multi-camera system for dense 3D\nreconstruction and ego-motion estimation. Our approach iterates between\ngeometric estimation that exploits spatial-temporal information from multiple\ncameras, and monocular depth refinement. We integrate multi-camera feature\ncorrelation and dense bundle adjustment operators that yield robust geometric\ndepth and pose estimates. To improve reconstruction where geometric depth is\nunreliable, e.g. for moving objects or low-textured regions, we introduce\nlearnable scene priors via a depth refinement network. We show that this design\nenables a dense, consistent 3D reconstruction of challenging, dynamic outdoor\nenvironments. Consequently, we achieve state-of-the-art dense depth prediction\non the DDAD and NuScenes benchmarks.\n","authors":["Aron Schmied","Tobias Fischer","Martin Danelljan","Marc Pollefeys","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2308.14713v1.pdf","comment":"Accepted to ICCV 2023. Project page is available at\n https://www.vis.xyz/pub/r3d3/"},{"id":"http://arxiv.org/abs/2308.14710v1","updated":"2023-08-28T17:10:12Z","published":"2023-08-28T17:10:12Z","title":"VideoCutLER: Surprisingly Simple Unsupervised Video Instance\n Segmentation","summary":" Existing approaches to unsupervised video instance segmentation typically\nrely on motion estimates and experience difficulties tracking small or\ndivergent motions. We present VideoCutLER, a simple method for unsupervised\nmulti-instance video segmentation without using motion-based learning signals\nlike optical flow or training on natural videos. Our key insight is that using\nhigh-quality pseudo masks and a simple video synthesis method for model\ntraining is surprisingly sufficient to enable the resulting video model to\neffectively segment and track multiple instances across video frames. We show\nthe first competitive unsupervised learning results on the challenging\nYouTubeVIS-2019 benchmark, achieving 50.7% APvideo^50 , surpassing the previous\nstate-of-the-art by a large margin. VideoCutLER can also serve as a strong\npretrained model for supervised video instance segmentation tasks, exceeding\nDINO by 15.9% on YouTubeVIS-2019 in terms of APvideo.\n","authors":["Xudong Wang","Ishan Misra","Ziyun Zeng","Rohit Girdhar","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2308.14710v1.pdf","comment":"Preprint. Code: https://github.com/facebookresearch/CutLER"},{"id":"http://arxiv.org/abs/2206.14996v2","updated":"2023-08-28T16:38:19Z","published":"2022-06-30T03:09:59Z","title":"Cross-domain Federated Object Detection","summary":" Detection models trained by one party (including server) may face severe\nperformance degradation when distributed to other users (clients). Federated\nlearning can enable multi-party collaborative learning without leaking client\ndata. In this paper, we focus on a special cross-domain scenario in which the\nserver has large-scale labeled data and multiple clients only have a small\namount of labeled data; meanwhile, there exist differences in data\ndistributions among the clients. In this case, traditional federated learning\nmethods can't help a client learn both the global knowledge of all participants\nand its own unique knowledge. To make up for this limitation, we propose a\ncross-domain federated object detection framework, named FedOD. The proposed\nframework first performs the federated training to obtain a public global\naggregated model through multi-teacher distillation, and sends the aggregated\nmodel back to each client for fine-tuning its personalized local model. After a\nfew rounds of communication, on each client we can perform weighted ensemble\ninference on the public global model and the personalized local model. We\nestablish a federated object detection dataset which has significant background\ndifferences and instance differences based on multiple public autonomous\ndriving datasets, and then conduct extensive experiments on the dataset. The\nexperimental results validate the effectiveness of the proposed method.\n","authors":["Shangchao Su","Bin Li","Chengzhi Zhang","Mingzhao Yang","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2206.14996v2.pdf","comment":"ICME 2023"},{"id":"http://arxiv.org/abs/2308.14686v1","updated":"2023-08-28T16:21:51Z","published":"2023-08-28T16:21:51Z","title":"360-Degree Panorama Generation from Few Unregistered NFoV Images","summary":" 360$^\\circ$ panoramas are extensively utilized as environmental light sources\nin computer graphics. However, capturing a 360$^\\circ$ $\\times$ 180$^\\circ$\npanorama poses challenges due to the necessity of specialized and costly\nequipment, and additional human resources. Prior studies develop various\nlearning-based generative methods to synthesize panoramas from a single Narrow\nField-of-View (NFoV) image, but they are limited in alterable input patterns,\ngeneration quality, and controllability. To address these issues, we propose a\nnovel pipeline called PanoDiff, which efficiently generates complete\n360$^\\circ$ panoramas using one or more unregistered NFoV images captured from\narbitrary angles. Our approach has two primary components to overcome the\nlimitations. Firstly, a two-stage angle prediction module to handle various\nnumbers of NFoV inputs. Secondly, a novel latent diffusion-based panorama\ngeneration model uses incomplete panorama and text prompts as control signals\nand utilizes several geometric augmentation schemes to ensure geometric\nproperties in generated panoramas. Experiments show that PanoDiff achieves\nstate-of-the-art panoramic generation quality and high controllability, making\nit suitable for applications such as content editing.\n","authors":["Jionghao Wang","Ziyu Chen","Jun Ling","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2308.14686v1.pdf","comment":"Accepted to ACM Multimedia 2023 (MM' 23). Code is available:\n https://github.com/shanemankiw/Panodiff"},{"id":"http://arxiv.org/abs/2308.14679v1","updated":"2023-08-28T16:15:23Z","published":"2023-08-28T16:15:23Z","title":"Video-Based Hand Pose Estimation for Remote Assessment of Bradykinesia\n in Parkinson's Disease","summary":" There is a growing interest in using pose estimation algorithms for\nvideo-based assessment of Bradykinesia in Parkinson's Disease (PD) to\nfacilitate remote disease assessment and monitoring. However, the accuracy of\npose estimation algorithms in videos from video streaming services during\nTelehealth appointments has not been studied. In this study, we used seven\noff-the-shelf hand pose estimation models to estimate the movement of the thumb\nand index fingers in videos of the finger-tapping (FT) test recorded from\nHealthy Controls (HC) and participants with PD and under two different\nconditions: streaming (videos recorded during a live Zoom meeting) and\non-device (videos recorded locally with high-quality cameras). The accuracy and\nreliability of the models were estimated by comparing the models' output with\nmanual results. Three of the seven models demonstrated good accuracy for\non-device recordings, and the accuracy decreased significantly for streaming\nrecordings. We observed a negative correlation between movement speed and the\nmodel's accuracy for the streaming recordings. Additionally, we evaluated the\nreliability of ten movement features related to bradykinesia extracted from\nvideo recordings of PD patients performing the FT test. While most of the\nfeatures demonstrated excellent reliability for on-device recordings, most of\nthe features demonstrated poor to moderate reliability for streaming\nrecordings. Our findings highlight the limitations of pose estimation\nalgorithms when applied to video recordings obtained during Telehealth visits,\nand demonstrate that on-device recordings can be used for automatic\nvideo-assessment of bradykinesia in PD.\n","authors":["Gabriela T. Acevedo Trebbau","Andrea Bandini","Diego L. Guarin"],"pdf_url":"https://arxiv.org/pdf/2308.14679v1.pdf","comment":"12 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.14667v1","updated":"2023-08-28T15:54:14Z","published":"2023-08-28T15:54:14Z","title":"Neural Network-Based Histologic Remission Prediction In Ulcerative\n Colitis","summary":" BACKGROUND & AIMS: Histological remission (HR) is advocated and considered as\na new therapeutic target in ulcerative colitis (UC). Diagnosis of histologic\nremission currently relies on biopsy; during this process, patients are at risk\nfor bleeding, infection, and post-biopsy fibrosis. In addition, histologic\nresponse scoring is complex and time-consuming, and there is heterogeneity\namong pathologists. Endocytoscopy (EC) is a novel ultra-high magnification\nendoscopic technique that can provide excellent in vivo assessment of glands.\nBased on the EC technique, we propose a neural network model that can assess\nhistological disease activity in UC using EC images to address the above\nissues. The experiment results demonstrate that the proposed method can assist\npatients in precise treatment and prognostic assessment.\n METHODS: We construct a neural network model for UC evaluation. A total of\n5105 images of 154 intestinal segments from 87 patients undergoing EC treatment\nat a center in China between March 2022 and March 2023 are scored according to\nthe Geboes score. Subsequently, 103 intestinal segments are used as the\ntraining set, 16 intestinal segments are used as the validation set for neural\nnetwork training, and the remaining 35 intestinal segments are used as the test\nset to measure the model performance together with the validation set.\n RESULTS: By treating HR as a negative category and histologic activity as a\npositive category, the proposed neural network model can achieve an accuracy of\n0.9, a specificity of 0.95, a sensitivity of 0.75, and an area under the curve\n(AUC) of 0.81.\n CONCLUSION: We develop a specific neural network model that can distinguish\nhistologic remission/activity in EC images of UC, which helps to accelerate\nclinical histological diagnosis.\n keywords: ulcerative colitis; Endocytoscopy; Geboes score; neural network.\n","authors":["Yemin li","Zhongcheng Liu","Xiaoying Lou","Mirigual Kurban","Miao Li","Jie Yang","Kaiwei Che","Jiankun Wang","Max Q. -H Meng","Yan Huang","Qin Guo","Pinjin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.14667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14650v1","updated":"2023-08-28T15:22:15Z","published":"2023-08-28T15:22:15Z","title":"Comparison of automated crater catalogs for Mars from Benedix et al.\n (2020) and Lee and Hogan (2021)","summary":" Crater mapping using neural networks and other automated methods has\nincreased recently with automated Crater Detection Algorithms (CDAs) applied to\nplanetary bodies throughout the solar system. A recent publication by Benedix\net al. (2020) showed high performance at small scales compared to similar\nautomated CDAs but with a net positive diameter bias in many crater candidates.\nI compare the publicly available catalogs from Benedix et al. (2020) and Lee &\nHogan (2021) and show that the reported performance is sensitive to the metrics\nused to test the catalogs. I show how the more permissive comparison methods\nindicate a higher CDA performance by allowing worse candidate craters to match\nground-truth craters. I show that the Benedix et al. (2020) catalog has a\nsubstantial performance loss with increasing latitude and identify an image\nprojection issue that might cause this loss. Finally, I suggest future\napplications of neural networks in generating large scientific datasets be\nvalidated using secondary networks with independent data sources or training\nmethods.\n","authors":["Christopher Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14650v1.pdf","comment":"14 pages, 6 figures. Accepted August 13th 2023"},{"id":"http://arxiv.org/abs/2212.09950v3","updated":"2023-08-28T15:09:46Z","published":"2022-12-20T01:59:27Z","title":"Domain Generalization with Correlated Style Uncertainty","summary":" Domain generalization (DG) approaches intend to extract domain invariant\nfeatures that can lead to a more robust deep learning model. In this regard,\nstyle augmentation is a strong DG method taking advantage of instance-specific\nfeature statistics containing informative style characteristics to synthetic\nnovel domains. While it is one of the state-of-the-art methods, prior works on\nstyle augmentation have either disregarded the interdependence amongst distinct\nfeature channels or have solely constrained style augmentation to linear\ninterpolation. To address these research gaps, in this work, we introduce a\nnovel augmentation approach, named Correlated Style Uncertainty (CSU),\nsurpassing the limitations of linear interpolation in style statistic space and\nsimultaneously preserving vital correlation information. Our method's efficacy\nis established through extensive experimentation on diverse cross-domain\ncomputer vision and medical imaging classification tasks: PACS, Office-Home,\nand Camelyon17 datasets, and the Duke-Market1501 instance retrieval task. The\nresults showcase a remarkable improvement margin over existing state-of-the-art\ntechniques. The source code is available https://github.com/freshman97/CSU.\n","authors":["Zheyuan Zhang","Bin Wang","Debesh Jha","Ugur Demir","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2212.09950v3.pdf","comment":"Accepted by WACV2024, camera ready version"},{"id":"http://arxiv.org/abs/2308.14626v1","updated":"2023-08-28T14:48:49Z","published":"2023-08-28T14:48:49Z","title":"VesselShot: Few-shot learning for cerebral blood vessel segmentation","summary":" Angiography is widely used to detect, diagnose, and treat cerebrovascular\ndiseases. While numerous techniques have been proposed to segment the vascular\nnetwork from different imaging modalities, deep learning (DL) has emerged as a\npromising approach. However, existing DL methods often depend on proprietary\ndatasets and extensive manual annotation. Moreover, the availability of\npre-trained networks specifically for medical domains and 3D volumes is\nlimited. To overcome these challenges, we propose a few-shot learning approach\ncalled VesselShot for cerebrovascular segmentation. VesselShot leverages\nknowledge from a few annotated support images and mitigates the scarcity of\nlabeled data and the need for extensive annotation in cerebral blood vessel\nsegmentation. We evaluated the performance of VesselShot using the publicly\navailable TubeTK dataset for the segmentation task, achieving a mean Dice\ncoefficient (DC) of 0.62(0.03).\n","authors":["Mumu Aktar","Hassan Rivaz","Marta Kersten-Oertel","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.14626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14619v1","updated":"2023-08-28T14:43:36Z","published":"2023-08-28T14:43:36Z","title":"Compositional Semantic Mix for Domain Adaptation in Point Cloud\n Segmentation","summary":" Deep-learning models for 3D point cloud semantic segmentation exhibit limited\ngeneralization capabilities when trained and tested on data captured with\ndifferent sensors or in varying environments due to domain shift. Domain\nadaptation methods can be employed to mitigate this domain shift, for instance,\nby simulating sensor noise, developing domain-agnostic generators, or training\npoint cloud completion networks. Often, these methods are tailored for range\nview maps or necessitate multi-modal input. In contrast, domain adaptation in\nthe image domain can be executed through sample mixing, which emphasizes input\ndata manipulation rather than employing distinct adaptation modules. In this\nstudy, we introduce compositional semantic mixing for point cloud domain\nadaptation, representing the first unsupervised domain adaptation technique for\npoint cloud segmentation based on semantic and geometric sample mixing. We\npresent a two-branch symmetric network architecture capable of concurrently\nprocessing point clouds from a source domain (e.g. synthetic) and point clouds\nfrom a target domain (e.g. real-world). Each branch operates within one domain\nby integrating selected data fragments from the other domain and utilizing\nsemantic information derived from source labels and target (pseudo) labels.\nAdditionally, our method can leverage a limited number of human point-level\nannotations (semi-supervised) to further enhance performance. We assess our\napproach in both synthetic-to-real and real-to-real scenarios using LiDAR\ndatasets and demonstrate that it significantly outperforms state-of-the-art\nmethods in both unsupervised and semi-supervised settings.\n","authors":["Cristiano Saltori","Fabio Galasso","Giuseppe Fiameni","Nicu Sebe","Fabio Poiesi","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2308.14619v1.pdf","comment":"TPAMI. arXiv admin note: text overlap with arXiv:2207.09778"},{"id":"http://arxiv.org/abs/2308.14616v1","updated":"2023-08-28T14:35:58Z","published":"2023-08-28T14:35:58Z","title":"VoroMesh: Learning Watertight Surface Meshes with Voronoi Diagrams","summary":" In stark contrast to the case of images, finding a concise, learnable\ndiscrete representation of 3D surfaces remains a challenge. In particular,\nwhile polygon meshes are arguably the most common surface representation used\nin geometry processing, their irregular and combinatorial structure often make\nthem unsuitable for learning-based applications. In this work, we present\nVoroMesh, a novel and differentiable Voronoi-based representation of watertight\n3D shape surfaces. From a set of 3D points (called generators) and their\nassociated occupancy, we define our boundary representation through the Voronoi\ndiagram of the generators as the subset of Voronoi faces whose two associated\n(equidistant) generators are of opposite occupancy: the resulting polygon mesh\nforms a watertight approximation of the target shape's boundary. To learn the\nposition of the generators, we propose a novel loss function, dubbed VoroLoss,\nthat minimizes the distance from ground truth surface samples to the closest\nfaces of the Voronoi diagram which does not require an explicit construction of\nthe entire Voronoi diagram. A direct optimization of the Voroloss to obtain\ngenerators on the Thingi32 dataset demonstrates the geometric efficiency of our\nrepresentation compared to axiomatic meshing algorithms and recent\nlearning-based mesh representations. We further use VoroMesh in a\nlearning-based mesh prediction task from input SDF grids on the ABC dataset,\nand show comparable performance to state-of-the-art methods while guaranteeing\nclosed output surfaces free of self-intersections.\n","authors":["Nissim Maruani","Roman Klokov","Maks Ovsjanikov","Pierre Alliez","Mathieu Desbrun"],"pdf_url":"https://arxiv.org/pdf/2308.14616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14613v1","updated":"2023-08-28T14:28:50Z","published":"2023-08-28T14:28:50Z","title":"MS-Net: A Multi-modal Self-supervised Network for Fine-Grained\n Classification of Aircraft in SAR Images","summary":" Synthetic aperture radar (SAR) imaging technology is commonly used to provide\n24-hour all-weather earth observation. However, it still has some drawbacks in\nSAR target classification, especially in fine-grained classification of\naircraft: aircrafts in SAR images have large intra-class diversity and\ninter-class similarity; the number of effective samples is insufficient and\nit's hard to annotate. To address these issues, this article proposes a novel\nmulti-modal self-supervised network (MS-Net) for fine-grained classification of\naircraft. Firstly, in order to entirely exploit the potential of multi-modal\ninformation, a two-sided path feature extraction network (TSFE-N) is\nconstructed to enhance the image feature of the target and obtain the domain\nknowledge feature of text mode. Secondly, a contrastive self-supervised\nlearning (CSSL) framework is employed to effectively learn useful\nlabel-independent feature from unbalanced data, a similarity per-ception loss\n(SPloss) is proposed to avoid network overfitting. Finally, TSFE-N is used as\nthe encoder of CSSL to obtain the classification results. Through a large\nnumber of experiments, our MS-Net can effectively reduce the difficulty of\nclassifying similar types of aircrafts. In the case of no label, the proposed\nalgorithm achieves an accuracy of 88.46% for 17 types of air-craft\nclassification task, which has pioneering significance in the field of\nfine-grained classification of aircraft in SAR images.\n","authors":["Bingying Yue","Jianhao Li","Hao Shi","Yupei Wang","Honghu Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.14613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14610v1","updated":"2023-08-28T14:26:15Z","published":"2023-08-28T14:26:15Z","title":"A Transformer-Conditioned Neural Fields Pipeline with Polar Coordinate\n Representation for Astronomical Radio Interferometric Data Reconstruction","summary":" In radio astronomy, visibility data, which are measurements of wave signals\nfrom radio telescopes, are transformed into images for observation of distant\ncelestial objects. However, these resultant images usually contain both real\nsources and artifacts, due to signal sparsity and other factors. One way to\nobtain cleaner images is to reconstruct samples into dense forms before\nimaging. Unfortunately, existing visibility reconstruction methods may miss\nsome components of the frequency data, so blurred object edges and persistent\nartifacts remain in the images. Furthermore, the computation overhead is high\non irregular visibility samples due to the data skew. To address these\nproblems, we propose PolarRec, a reconstruction method for interferometric\nvisibility data, which consists of a transformer-conditioned neural fields\npipeline with a polar coordinate representation. This representation matches\nthe way in which telescopes observe a celestial area as the Earth rotates. We\nfurther propose Radial Frequency Loss function, using radial coordinates in the\npolar coordinate system to correlate with the frequency information, to help\nreconstruct complete visibility. We also group visibility sample points by\nangular coordinates in the polar coordinate system, and use groups as the\ngranularity for subsequent encoding with a Transformer encoder. Consequently,\nour method can capture the inherent characteristics of visibility data\neffectively and efficiently. Our experiments demonstrate that PolarRec markedly\nimproves imaging results by faithfully reconstructing all frequency components\nin the visibility domain while significantly reducing the computation cost.\n","authors":["Ruoqi Wang","Qiong Luo","Feng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14605v1","updated":"2023-08-28T14:19:13Z","published":"2023-08-28T14:19:13Z","title":"A Generalization of Continuous Relaxation in Structured Pruning","summary":" Deep learning harnesses massive parallel floating-point processing to train\nand evaluate large neural networks. Trends indicate that deeper and larger\nneural networks with an increasing number of parameters achieve higher accuracy\nthan smaller neural networks. This performance improvement, which often\nrequires heavy compute for both training and evaluation, eventually needs to\ntranslate well to resource-constrained hardware for practical value. Structured\npruning asserts that while large networks enable us to find solutions to\ncomplex computer vision problems, a smaller, computationally efficient\nsub-network can be derived from the large neural network that retains model\naccuracy but significantly improves computational efficiency.\n We generalize structured pruning with algorithms for network augmentation,\npruning, sub-network collapse and removal. In addition, we demonstrate\nefficient and stable convergence up to 93% sparsity and 95% FLOPs reduction\nwithout loss of inference accuracy using with continuous relaxation matching or\nexceeding the state of the art for all structured pruning methods. The\nresulting CNN executes efficiently on GPU hardware without computationally\nexpensive sparse matrix operations. We achieve this with routine automatable\noperations on classification and segmentation problems using CIFAR-10,\nImageNet, and CityScapes datasets with the ResNet and U-NET network\narchitectures.\n","authors":["Brad Larson","Bishal Upadhyaya","Luke McDermott","Siddha Ganju"],"pdf_url":"https://arxiv.org/pdf/2308.14605v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2305.06310v3","updated":"2023-08-28T14:18:25Z","published":"2023-04-27T03:41:15Z","title":"SoGAR: Self-supervised Spatiotemporal Attention-based Social Group\n Activity Recognition","summary":" This paper introduces a novel approach to Social Group Activity Recognition\n(SoGAR) using Self-supervised Transformers network that can effectively utilize\nunlabeled video data. To extract spatio-temporal information, we created local\nand global views with varying frame rates. Our self-supervised objective\nensures that features extracted from contrasting views of the same video were\nconsistent across spatio-temporal domains. Our proposed approach is efficient\nin using transformer-based encoders to alleviate the weakly supervised setting\nof group activity recognition. By leveraging the benefits of transformer\nmodels, our approach can model long-term relationships along spatio-temporal\ndimensions. Our proposed SoGAR method achieved state-of-the-art results on\nthree group activity recognition benchmarks, namely JRDB-PAR, NBA, and\nVolleyball datasets, surpassing the current numbers in terms of F1-score, MCA,\nand MPCA metrics.\n","authors":["Naga VS Raviteja Chappa","Pha Nguyen","Alexander H Nelson","Han-Seok Seo","Xin Li","Page Daniel Dobbs","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2305.06310v3.pdf","comment":"Under review for PR journal; 32 pages, 7 figures. arXiv admin note:\n text overlap with arXiv:2303.12149"},{"id":"http://arxiv.org/abs/2308.14604v1","updated":"2023-08-28T14:17:16Z","published":"2023-08-28T14:17:16Z","title":"SAM-PARSER: Fine-tuning SAM Efficiently by Parameter Space\n Reconstruction","summary":" Segment Anything Model (SAM) has received remarkable attention as it offers a\npowerful and versatile solution for object segmentation in images. However,\nfine-tuning SAM for downstream segmentation tasks under different scenarios\nremains a challenge, as the varied characteristics of different scenarios\nnaturally requires diverse model parameter spaces. Most existing fine-tuning\nmethods attempt to bridge the gaps among different scenarios by introducing a\nset of new parameters to modify SAM's original parameter space. Unlike these\nworks, in this paper, we propose fine-tuning SAM efficiently by parameter space\nreconstruction (SAM-PARSER), which introduce nearly zero trainable parameters\nduring fine-tuning. In SAM-PARSER, we assume that SAM's original parameter\nspace is relatively complete, so that its bases are able to reconstruct the\nparameter space of a new scenario. We obtain the bases by matrix decomposition,\nand fine-tuning the coefficients to reconstruct the parameter space tailored to\nthe new scenario by an optimal linear combination of the bases. Experimental\nresults show that SAM-PARSER exhibits superior segmentation performance across\nvarious scenarios, while reducing the number of trainable parameters by\n$\\approx 290$ times compared with current parameter-efficient fine-tuning\nmethods.\n","authors":["Zelin Peng","Zhengqin Xu","Zhilin Zeng","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12149v4","updated":"2023-08-28T14:13:16Z","published":"2023-03-06T16:58:27Z","title":"SPARTAN: Self-supervised Spatiotemporal Transformers Approach to Group\n Activity Recognition","summary":" In this paper, we propose a new, simple, and effective Self-supervised\nSpatio-temporal Transformers (SPARTAN) approach to Group Activity Recognition\n(GAR) using unlabeled video data. Given a video, we create local and global\nSpatio-temporal views with varying spatial patch sizes and frame rates. The\nproposed self-supervised objective aims to match the features of these\ncontrasting views representing the same video to be consistent with the\nvariations in spatiotemporal domains. To the best of our knowledge, the\nproposed mechanism is one of the first works to alleviate the weakly supervised\nsetting of GAR using the encoders in video transformers. Furthermore, using the\nadvantage of transformer models, our proposed approach supports long-term\nrelationship modeling along spatio-temporal dimensions. The proposed SPARTAN\napproach performs well on two group activity recognition benchmarks, including\nNBA and Volleyball datasets, by surpassing the state-of-the-art results by a\nsignificant margin in terms of MCA and MPCA metrics.\n","authors":["Naga VS Raviteja Chappa","Pha Nguyen","Alexander H Nelson","Han-Seok Seo","Xin Li","Page Daniel Dobbs","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2303.12149v4.pdf","comment":"Accepted to CVPRW 2023; 11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.14598v1","updated":"2023-08-28T14:09:03Z","published":"2023-08-28T14:09:03Z","title":"S-TREK: Sequential Translation and Rotation Equivariant Keypoints for\n local feature extraction","summary":" In this work we introduce S-TREK, a novel local feature extractor that\ncombines a deep keypoint detector, which is both translation and rotation\nequivariant by design, with a lightweight deep descriptor extractor. We train\nthe S-TREK keypoint detector within a framework inspired by reinforcement\nlearning, where we leverage a sequential procedure to maximize a reward\ndirectly related to keypoint repeatability. Our descriptor network is trained\nfollowing a \"detect, then describe\" approach, where the descriptor loss is\nevaluated only at those locations where keypoints have been selected by the\nalready trained detector. Extensive experiments on multiple benchmarks confirm\nthe effectiveness of our proposed method, with S-TREK often outperforming other\nstate-of-the-art methods in terms of repeatability and quality of the recovered\nposes, especially when dealing with in-plane rotations.\n","authors":["Emanuele Santellani","Christian Sormann","Mattia Rossi","Andreas Kuhn","Friedrich Fraundorfer"],"pdf_url":"https://arxiv.org/pdf/2308.14598v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14597v1","updated":"2023-08-28T14:09:02Z","published":"2023-08-28T14:09:02Z","title":"Adversarial Attacks on Foundational Vision Models","summary":" Rapid progress is being made in developing large, pretrained, task-agnostic\nfoundational vision models such as CLIP, ALIGN, DINOv2, etc. In fact, we are\napproaching the point where these models do not have to be finetuned\ndownstream, and can simply be used in zero-shot or with a lightweight probing\nhead. Critically, given the complexity of working at this scale, there is a\nbottleneck where relatively few organizations in the world are executing the\ntraining then sharing the models on centralized platforms such as HuggingFace\nand torch.hub. The goal of this work is to identify several key adversarial\nvulnerabilities of these models in an effort to make future designs more\nrobust. Intuitively, our attacks manipulate deep feature representations to\nfool an out-of-distribution (OOD) detector which will be required when using\nthese open-world-aware models to solve closed-set downstream tasks. Our methods\nreliably make in-distribution (ID) images (w.r.t. a downstream task) be\npredicted as OOD and vice versa while existing in extremely\nlow-knowledge-assumption threat models. We show our attacks to be potent in\nwhitebox and blackbox settings, as well as when transferred across foundational\nmodel types (e.g., attack DINOv2 with CLIP)! This work is only just the\nbeginning of a long journey towards adversarially robust foundational vision\nmodels.\n","authors":["Nathan Inkawhich","Gwendolyn McDonald","Ryan Luley"],"pdf_url":"https://arxiv.org/pdf/2308.14597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14596v1","updated":"2023-08-28T14:08:42Z","published":"2023-08-28T14:08:42Z","title":"LatentDR: Improving Model Generalization Through Sample-Aware Latent\n Degradation and Restoration","summary":" Despite significant advances in deep learning, models often struggle to\ngeneralize well to new, unseen domains, especially when training data is\nlimited. To address this challenge, we propose a novel approach for\ndistribution-aware latent augmentation that leverages the relationships across\nsamples to guide the augmentation procedure. Our approach first degrades the\nsamples stochastically in the latent space, mapping them to augmented labels,\nand then restores the samples from their corrupted versions during training.\nThis process confuses the classifier in the degradation step and restores the\noverall class distribution of the original samples, promoting diverse\nintra-class/cross-domain variability. We extensively evaluate our approach on a\ndiverse set of datasets and tasks, including domain generalization benchmarks\nand medical imaging datasets with strong domain shift, where we show our\napproach achieves significant improvements over existing methods for latent\nspace augmentation. We further show that our method can be flexibly adapted to\nlong-tail recognition tasks, demonstrating its versatility in building more\ngeneralizable models. Code is available at\nhttps://github.com/nerdslab/LatentDR.\n","authors":["Ran Liu","Sahil Khose","Jingyun Xiao","Lakshmi Sathidevi","Keerthan Ramnath","Zsolt Kira","Eva L. Dyer"],"pdf_url":"https://arxiv.org/pdf/2308.14596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14595v1","updated":"2023-08-28T14:06:36Z","published":"2023-08-28T14:06:36Z","title":"Neural Network Training Strategy to Enhance Anomaly Detection\n Performance: A Perspective on Reconstruction Loss Amplification","summary":" Unsupervised anomaly detection (UAD) is a widely adopted approach in industry\ndue to rare anomaly occurrences and data imbalance. A desirable characteristic\nof an UAD model is contained generalization ability which excels in the\nreconstruction of seen normal patterns but struggles with unseen anomalies.\nRecent studies have pursued to contain the generalization capability of their\nUAD models in reconstruction from different perspectives, such as design of\nneural network (NN) structure and training strategy. In contrast, we note that\ncontaining of generalization ability in reconstruction can also be obtained\nsimply from steep-shaped loss landscape. Motivated by this, we propose a loss\nlandscape sharpening method by amplifying the reconstruction loss, dubbed Loss\nAMPlification (LAMP). LAMP deforms the loss landscape into a steep shape so the\nreconstruction error on unseen anomalies becomes greater. Accordingly, the\nanomaly detection performance is improved without any change of the NN\narchitecture. Our findings suggest that LAMP can be easily applied to any\nreconstruction error metrics in UAD settings where the reconstruction model is\ntrained with anomaly-free samples only.\n","authors":["YeongHyeon Park","Sungho Kang","Myung Jin Kim","Hyeonho Jeong","Hyunkyu Park","Hyeong Seok Kim","Juneho Yi"],"pdf_url":"https://arxiv.org/pdf/2308.14595v1.pdf","comment":"5 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2211.16098v4","updated":"2023-08-28T14:03:09Z","published":"2022-11-29T11:17:34Z","title":"Three-stage binarization of color document images based on discrete\n wavelet transform and generative adversarial networks","summary":" The efficient segmentation of foreground text information from the background\nin degraded color document images is a critical challenge in the preservation\nof ancient manuscripts. The imperfect preservation of ancient manuscripts over\ntime has led to various types of degradation, such as staining, yellowing, and\nink seepage, significantly affecting image binarization results. This work\nproposes a three-stage method using Generative Adversarial Networks (GAN) for\nenhancing and binarizing degraded color document images through Discrete\nWavelet Transform (DWT). Stage-1 involves applying DWT and retaining the\nLow-Low (LL) subband images for image enhancement. In Stage-2, the original\ninput image is divided into four single-channel images (Red, Green, Blue, and\nGray), and each is trained with independent adversarial networks to extract\ncolor foreground information. In Stage-3, the output image from Stage-2 and the\noriginal input image are used to train independent adversarial networks for\ndocument binarization, enabling the integration of global and local features.\nThe experimental results demonstrate that our proposed method outperforms other\nclassic and state-of-the-art (SOTA) methods on the Document Image Binarization\nContest (DIBCO) datasets. We have released our implementation code at\nhttps://github.com/abcpp12383/ThreeStageBinarization.\n","authors":["Yu-Shian Lin","Rui-Yang Ju","Chih-Chia Chen","Chun-Tse Chien","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2211.16098v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14583v1","updated":"2023-08-28T13:49:08Z","published":"2023-08-28T13:49:08Z","title":"Learning to Read Analog Gauges from Synthetic Data","summary":" Manually reading and logging gauge data is time inefficient, and the effort\nincreases according to the number of gauges available. We present a computer\nvision pipeline that automates the reading of analog gauges. We propose a\ntwo-stage CNN pipeline that identifies the key structural components of an\nanalog gauge and outputs an angular reading. To facilitate the training of our\napproach, a synthetic dataset is generated thus obtaining a set of realistic\nanalog gauges with their corresponding annotation. To validate our proposal, an\nadditional real-world dataset was collected with 4.813 manually curated images.\nWhen compared against state-of-the-art methodologies, our method shows a\nsignificant improvement of 4.55 in the average error, which is a 52% relative\nimprovement. The resources for this project will be made available at:\nhttps://github.com/fuankarion/automatic-gauge-reading.\n","authors":["Juan Leon-Alcazar","Yazeed Alnumay","Cheng Zheng","Hassane Trigui","Sahejad Patel","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2308.14583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14575v1","updated":"2023-08-28T13:40:47Z","published":"2023-08-28T13:40:47Z","title":"Referring Image Segmentation Using Text Supervision","summary":" Existing Referring Image Segmentation (RIS) methods typically require\nexpensive pixel-level or box-level annotations for supervision. In this paper,\nwe observe that the referring texts used in RIS already provide sufficient\ninformation to localize the target object. Hence, we propose a novel\nweakly-supervised RIS framework to formulate the target localization problem as\na classification process to differentiate between positive and negative text\nexpressions. While the referring text expressions for an image are used as\npositive expressions, the referring text expressions from other images can be\nused as negative expressions for this image. Our framework has three main\nnovelties. First, we propose a bilateral prompt method to facilitate the\nclassification process, by harmonizing the domain discrepancy between visual\nand linguistic features. Second, we propose a calibration method to reduce\nnoisy background information and improve the correctness of the response maps\nfor target object localization. Third, we propose a positive response map\nselection strategy to generate high-quality pseudo-labels from the enhanced\nresponse maps, for training a segmentation network for RIS inference. For\nevaluation, we propose a new metric to measure localization accuracy.\nExperiments on four benchmarks show that our framework achieves promising\nperformances to existing fully-supervised RIS methods while outperforming\nstate-of-the-art weakly-supervised methods adapted from related areas. Code is\navailable at https://github.com/fawnliu/TRIS.\n","authors":["Fang Liu","Yuhao Liu","Yuqiu Kong","Ke Xu","Lihe Zhang","Baocai Yin","Gerhard Hancke","Rynson Lau"],"pdf_url":"https://arxiv.org/pdf/2308.14575v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14570v1","updated":"2023-08-28T13:35:07Z","published":"2023-08-28T13:35:07Z","title":"SAAN: Similarity-aware attention flow network for change detection with\n VHR remote sensing images","summary":" Change detection (CD) is a fundamental and important task for monitoring the\nland surface dynamics in the earth observation field. Existing deep\nlearning-based CD methods typically extract bi-temporal image features using a\nweight-sharing Siamese encoder network and identify change regions using a\ndecoder network. These CD methods, however, still perform far from\nsatisfactorily as we observe that 1) deep encoder layers focus on irrelevant\nbackground regions and 2) the models' confidence in the change regions is\ninconsistent at different decoder stages. The first problem is because deep\nencoder layers cannot effectively learn from imbalanced change categories using\nthe sole output supervision, while the second problem is attributed to the lack\nof explicit semantic consistency preservation. To address these issues, we\ndesign a novel similarity-aware attention flow network (SAAN). SAAN\nincorporates a similarity-guided attention flow module with deeply supervised\nsimilarity optimization to achieve effective change detection. Specifically, we\ncounter the first issue by explicitly guiding deep encoder layers to discover\nsemantic relations from bi-temporal input images using deeply supervised\nsimilarity optimization. The extracted features are optimized to be\nsemantically similar in the unchanged regions and dissimilar in the changing\nregions. The second drawback can be alleviated by the proposed\nsimilarity-guided attention flow module, which incorporates similarity-guided\nattention modules and attention flow mechanisms to guide the model to focus on\ndiscriminative channels and regions. We evaluated the effectiveness and\ngeneralization ability of the proposed method by conducting experiments on a\nwide range of CD tasks. The experimental results demonstrate that our method\nachieves excellent performance on several CD tasks, with discriminative\nfeatures and semantic consistency preserved.\n","authors":["Haonan Guo","Xin Su","Chen Wu","Bo Du","Liangpei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14570v1.pdf","comment":"15 pages,13 figures"},{"id":"http://arxiv.org/abs/2304.11609v3","updated":"2023-08-28T13:26:52Z","published":"2023-04-23T10:46:16Z","title":"PiClick: Picking the desired mask in click-based interactive\n segmentation","summary":" Click-based interactive segmentation aims to generate target masks via human\nclicking, which facilitates efficient pixel-level annotation and image editing.\nIn such a task, target ambiguity remains a problem hindering the accuracy and\nefficiency of segmentation. That is, in scenes with rich context, one click may\ncorrespond to multiple potential targets, while most previous interactive\nsegmentors only generate a single mask and fail to deal with target ambiguity.\nIn this paper, we propose a novel interactive segmentation network named\nPiClick, to yield all potentially reasonable masks and suggest the most\nplausible one for the user. Specifically, PiClick utilizes a Transformer-based\narchitecture to generate all potential target masks by mutually interactive\nmask queries. Moreover, a Target Reasoning module is designed in PiClick to\nautomatically suggest the user-desired mask from all candidates, relieving\ntarget ambiguity and extra-human efforts. Extensive experiments on 9\ninteractive segmentation datasets demonstrate PiClick performs favorably\nagainst previous state-of-the-arts considering the segmentation results.\nMoreover, we show that PiClick effectively reduces human efforts in annotating\nand picking the desired masks. To ease the usage and inspire future research,\nwe release the source code of PiClick together with a plug-and-play annotation\ntool at https://github.com/cilinyan/PiClick.\n","authors":["Cilin Yan","Haochen Wang","Jie Liu","Xiaolong Jiang","Yao Hu","Xu Tang","Guoliang Kang","Efstratios Gavves"],"pdf_url":"https://arxiv.org/pdf/2304.11609v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2308.14551v1","updated":"2023-08-28T13:11:05Z","published":"2023-08-28T13:11:05Z","title":"Face Presentation Attack Detection by Excavating Causal Clues and\n Adapting Embedding Statistics","summary":" Recent face presentation attack detection (PAD) leverages domain adaptation\n(DA) and domain generalization (DG) techniques to address performance\ndegradation on unknown domains. However, DA-based PAD methods require access to\nunlabeled target data, while most DG-based PAD solutions rely on a priori,\ni.e., known domain labels. Moreover, most DA-/DG-based methods are\ncomputationally intensive, demanding complex model architectures and/or\nmulti-stage training processes. This paper proposes to model face PAD as a\ncompound DG task from a causal perspective, linking it to model optimization.\nWe excavate the causal factors hidden in the high-level representation via\ncounterfactual intervention. Moreover, we introduce a class-guided MixStyle to\nenrich feature-level data distribution within classes instead of focusing on\ndomain information. Both class-guided MixStyle and counterfactual intervention\ncomponents introduce no extra trainable parameters and negligible computational\nresources. Extensive cross-dataset and analytic experiments demonstrate the\neffectiveness and efficiency of our method compared to state-of-the-art PADs.\nThe implementation and the trained weights are publicly available.\n","authors":["Meiling Fang","Naser Damer"],"pdf_url":"https://arxiv.org/pdf/2308.14551v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2205.05249v2","updated":"2023-08-28T13:00:38Z","published":"2022-05-11T03:36:04Z","title":"Secure & Private Federated Neuroimaging","summary":" The amount of biomedical data continues to grow rapidly. However, collecting\ndata from multiple sites for joint analysis remains challenging due to\nsecurity, privacy, and regulatory concerns. To overcome this challenge, we use\nFederated Learning, which enables distributed training of neural network models\nover multiple data sources without sharing data. Each site trains the neural\nnetwork over its private data for some time, then shares the neural network\nparameters (i.e., weights, gradients) with a Federation Controller, which in\nturn aggregates the local models, sends the resulting community model back to\neach site, and the process repeats. Our Federated Learning architecture,\nMetisFL, provides strong security and privacy. First, sample data never leaves\na site. Second, neural network parameters are encrypted before transmission and\nthe global neural model is computed under fully-homomorphic encryption.\nFinally, we use information-theoretic methods to limit information leakage from\nthe neural model to prevent a curious site from performing model inversion or\nmembership attacks. We present a thorough evaluation of the performance of\nsecure, private federated learning in neuroimaging tasks, including for\npredicting Alzheimer's disease and estimating BrainAGE from magnetic resonance\nimaging (MRI) studies, in challenging, heterogeneous federated environments\nwhere sites have different amounts of data and statistical distributions.\n","authors":["Dimitris Stripelis","Umang Gupta","Hamza Saleem","Nikhil Dhinagar","Tanmay Ghai","Rafael Chrysovalantis Anastasiou","Armaghan Asghar","Greg Ver Steeg","Srivatsan Ravi","Muhammad Naveed","Paul M. Thompson","Jose Luis Ambite"],"pdf_url":"https://arxiv.org/pdf/2205.05249v2.pdf","comment":"18 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2212.06969v2","updated":"2023-08-28T12:51:20Z","published":"2022-12-14T01:28:12Z","title":"EgoLoc: Revisiting 3D Object Localization from Egocentric Videos with\n Visual Queries","summary":" With the recent advances in video and 3D understanding, novel 4D\nspatio-temporal methods fusing both concepts have emerged. Towards this\ndirection, the Ego4D Episodic Memory Benchmark proposed a task for Visual\nQueries with 3D Localization (VQ3D). Given an egocentric video clip and an\nimage crop depicting a query object, the goal is to localize the 3D position of\nthe center of that query object with respect to the camera pose of a query\nframe. Current methods tackle the problem of VQ3D by unprojecting the 2D\nlocalization results of the sibling task Visual Queries with 2D Localization\n(VQ2D) into 3D predictions. Yet, we point out that the low number of camera\nposes caused by camera re-localization from previous VQ3D methods severally\nhinders their overall success rate. In this work, we formalize a pipeline (we\ndub EgoLoc) that better entangles 3D multiview geometry with 2D object\nretrieval from egocentric videos. Our approach involves estimating more robust\ncamera poses and aggregating multi-view 3D displacements by leveraging the 2D\ndetection confidence, which enhances the success rate of object queries and\nleads to a significant improvement in the VQ3D baseline performance.\nSpecifically, our approach achieves an overall success rate of up to 87.12%,\nwhich sets a new state-of-the-art result in the VQ3D task. We provide a\ncomprehensive empirical analysis of the VQ3D task and existing solutions, and\nhighlight the remaining challenges in VQ3D. The code is available at\nhttps://github.com/Wayne-Mai/EgoLoc.\n","authors":["Jinjie Mai","Abdullah Hamdi","Silvio Giancola","Chen Zhao","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2212.06969v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14525v1","updated":"2023-08-28T12:23:36Z","published":"2023-08-28T12:23:36Z","title":"Semi-Supervised Learning for Visual Bird's Eye View Semantic\n Segmentation","summary":" Visual bird's eye view (BEV) semantic segmentation helps autonomous vehicles\nunderstand the surrounding environment only from images, including static\nelements (e.g., roads) and dynamic elements (e.g., vehicles, pedestrians).\nHowever, the high cost of annotation procedures of full-supervised methods\nlimits the capability of the visual BEV semantic segmentation, which usually\nneeds HD maps, 3D object bounding boxes, and camera extrinsic matrixes. In this\npaper, we present a novel semi-supervised framework for visual BEV semantic\nsegmentation to boost performance by exploiting unlabeled images during the\ntraining. A consistency loss that makes full use of unlabeled data is then\nproposed to constrain the model on not only semantic prediction but also the\nBEV feature. Furthermore, we propose a novel and effective data augmentation\nmethod named conjoint rotation which reasonably augments the dataset while\nmaintaining the geometric relationship between the front-view images and the\nBEV semantic segmentation. Extensive experiments on the nuScenes and Argoverse\ndatasets show that our semi-supervised framework can effectively improve\nprediction accuracy. To the best of our knowledge, this is the first work that\nexplores improving visual BEV semantic segmentation performance using unlabeled\ndata. The code will be publicly available.\n","authors":["Junyu Zhu","Lina Liu","Yu Tang","Feng Wen","Wanlong Li","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.14525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13266v2","updated":"2023-08-28T12:02:25Z","published":"2023-08-25T09:37:51Z","title":"Integrating Boxes and Masks: A Multi-Object Framework for Unified Visual\n Tracking and Segmentation","summary":" Tracking any given object(s) spatially and temporally is a common purpose in\nVisual Object Tracking (VOT) and Video Object Segmentation (VOS). Joint\ntracking and segmentation have been attempted in some studies but they often\nlack full compatibility of both box and mask in initialization and prediction,\nand mainly focus on single-object scenarios. To address these limitations, this\npaper proposes a Multi-object Mask-box Integrated framework for unified\nTracking and Segmentation, dubbed MITS. Firstly, the unified identification\nmodule is proposed to support both box and mask reference for initialization,\nwhere detailed object information is inferred from boxes or directly retained\nfrom masks. Additionally, a novel pinpoint box predictor is proposed for\naccurate multi-object box prediction, facilitating target-oriented\nrepresentation learning. All target objects are processed simultaneously from\nencoding to propagation and decoding, as a unified pipeline for VOT and VOS.\nExperimental results show MITS achieves state-of-the-art performance on both\nVOT and VOS benchmarks. Notably, MITS surpasses the best prior VOT competitor\nby around 6% on the GOT-10k test set, and significantly improves the\nperformance of box initialization on VOS benchmarks. The code is available at\nhttps://github.com/yoxu515/MITS.\n","authors":["Yuanyou Xu","Zongxin Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2308.13266v2.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2211.00945v2","updated":"2023-08-28T11:36:06Z","published":"2022-11-02T08:09:03Z","title":"CarDD: A New Dataset for Vision-based Car Damage Detection","summary":" Automatic car damage detection has attracted significant attention in the car\ninsurance business. However, due to the lack of high-quality and publicly\navailable datasets, we can hardly learn a feasible model for car damage\ndetection. To this end, we contribute with Car Damage Detection (CarDD), the\nfirst public large-scale dataset designed for vision-based car damage detection\nand segmentation. Our CarDD contains 4,000 highresolution car damage images\nwith over 9,000 well-annotated instances of six damage categories. We detail\nthe image collection, selection, and annotation processes, and present a\nstatistical dataset analysis. Furthermore, we conduct extensive experiments on\nCarDD with state-of-the-art deep methods for different tasks and provide\ncomprehensive analyses to highlight the specialty of car damage detection.\nCarDD dataset and the source code are available at\nhttps://cardd-ustc.github.io.\n","authors":["Xinkuang Wang","Wenjing Li","Zhongcheng Wu"],"pdf_url":"https://arxiv.org/pdf/2211.00945v2.pdf","comment":"13 pages, 10 figures, full-length paper for Transactions on\n Intelligent Transportation Systems (2023)"},{"id":"http://arxiv.org/abs/2308.14500v1","updated":"2023-08-28T11:20:48Z","published":"2023-08-28T11:20:48Z","title":"LAC -- Latent Action Composition for Skeleton-based Action Segmentation","summary":" Skeleton-based action segmentation requires recognizing composable actions in\nuntrimmed videos. Current approaches decouple this problem by first extracting\nlocal visual features from skeleton sequences and then processing them by a\ntemporal model to classify frame-wise actions. However, their performances\nremain limited as the visual features cannot sufficiently express composable\nactions. In this context, we propose Latent Action Composition (LAC), a novel\nself-supervised framework aiming at learning from synthesized composable\nmotions for skeleton-based action segmentation. LAC is composed of a novel\ngeneration module towards synthesizing new sequences. Specifically, we design a\nlinear latent space in the generator to represent primitive motion. New\ncomposed motions can be synthesized by simply performing arithmetic operations\non latent representations of multiple input skeleton sequences. LAC leverages\nsuch synthesized sequences, which have large diversity and complexity, for\nlearning visual representations of skeletons in both sequence and frame spaces\nvia contrastive learning. The resulting visual encoder has a high expressive\npower and can be effectively transferred onto action segmentation tasks by\nend-to-end fine-tuning without the need for additional temporal models. We\nconduct a study focusing on transfer-learning and we show that representations\nlearned from pre-trained LAC outperform the state-of-the-art by a large margin\non TSU, Charades, PKU-MMD datasets.\n","authors":["Di Yang","Yaohui Wang","Antitza Dantcheva","Quan Kong","Lorenzo Garattoni","Gianpiero Francesca","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2308.14500v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2302.05968v2","updated":"2023-08-28T11:14:31Z","published":"2023-02-12T18:16:51Z","title":"Self-supervised pseudo-colorizing of masked cells","summary":" Self-supervised learning, which is strikingly referred to as the dark matter\nof intelligence, is gaining more attention in biomedical applications of deep\nlearning. In this work, we introduce a novel self-supervision objective for the\nanalysis of cells in biomedical microscopy images. We propose training deep\nlearning models to pseudo-colorize masked cells. We use a physics-informed\npseudo-spectral colormap that is well suited for colorizing cell topology. Our\nexperiments reveal that approximating semantic segmentation by\npseudo-colorization is beneficial for subsequent fine-tuning on cell detection.\nInspired by the recent success of masked image modeling, we additionally mask\nout cell parts and train to reconstruct these parts to further enrich the\nlearned representations. We compare our pre-training method with\nself-supervised frameworks including contrastive learning (SimCLR), masked\nautoencoders (MAEs), and edge-based self-supervision. We build upon our\nprevious work and train hybrid models for cell detection, which contain both\nconvolutional and vision transformer modules. Our pre-training method can\noutperform SimCLR, MAE-like masked image modeling, and edge-based\nself-supervision when pre-training on a diverse set of six fluorescence\nmicroscopy datasets. Code is available at:\nhttps://github.com/roydenwa/pseudo-colorize-masked-cells\n","authors":["Royden Wagner","Carlos Fernandez Lopez","Christoph Stiller"],"pdf_url":"https://arxiv.org/pdf/2302.05968v2.pdf","comment":"14 pages, 3 figures; Published in PLOS ONE"},{"id":"http://arxiv.org/abs/2308.14492v1","updated":"2023-08-28T11:10:14Z","published":"2023-08-28T11:10:14Z","title":"PointHPS: Cascaded 3D Human Pose and Shape Estimation from Point Clouds","summary":" Human pose and shape estimation (HPS) has attracted increasing attention in\nrecent years. While most existing studies focus on HPS from 2D images or videos\nwith inherent depth ambiguity, there are surging need to investigate HPS from\n3D point clouds as depth sensors have been frequently employed in commercial\ndevices. However, real-world sensory 3D points are usually noisy and\nincomplete, and also human bodies could have different poses of high diversity.\nTo tackle these challenges, we propose a principled framework, PointHPS, for\naccurate 3D HPS from point clouds captured in real-world settings, which\niteratively refines point features through a cascaded architecture.\nSpecifically, each stage of PointHPS performs a series of downsampling and\nupsampling operations to extract and collate both local and global cues, which\nare further enhanced by two novel modules: 1) Cross-stage Feature Fusion (CFF)\nfor multi-scale feature propagation that allows information to flow effectively\nthrough the stages, and 2) Intermediate Feature Enhancement (IFE) for\nbody-aware feature aggregation that improves feature quality after each stage.\nTo facilitate a comprehensive study under various scenarios, we conduct our\nexperiments on two large-scale benchmarks, comprising i) a dataset that\nfeatures diverse subjects and actions captured by real commercial sensors in a\nlaboratory environment, and ii) controlled synthetic data generated with\nrealistic considerations such as clothed humans in crowded outdoor scenes.\nExtensive experiments demonstrate that PointHPS, with its powerful point\nfeature extraction and processing scheme, outperforms State-of-the-Art methods\nby significant margins across the board. Homepage:\nhttps://caizhongang.github.io/projects/PointHPS/.\n","authors":["Zhongang Cai","Liang Pan","Chen Wei","Wanqi Yin","Fangzhou Hong","Mingyuan Zhang","Chen Change Loy","Lei Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.14492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05277v2","updated":"2023-08-28T10:51:09Z","published":"2023-04-11T15:23:29Z","title":"Graph-based Topology Reasoning for Driving Scenes","summary":" Understanding the road genome is essential to realize autonomous driving.\nThis highly intelligent problem contains two aspects - the connection\nrelationship of lanes, and the assignment relationship between lanes and\ntraffic elements, where a comprehensive topology reasoning method is vacant. On\none hand, previous map learning techniques struggle in deriving lane\nconnectivity with segmentation or laneline paradigms; or prior lane\ntopology-oriented approaches focus on centerline detection and neglect the\ninteraction modeling. On the other hand, the traffic element to lane assignment\nproblem is limited in the image domain, leaving how to construct the\ncorrespondence from two views an unexplored challenge. To address these issues,\nwe present TopoNet, the first end-to-end framework capable of abstracting\ntraffic knowledge beyond conventional perception tasks. To capture the driving\nscene topology, we introduce three key designs: (1) an embedding module to\nincorporate semantic knowledge from 2D elements into a unified feature space;\n(2) a curated scene graph neural network to model relationships and enable\nfeature interaction inside the network; (3) instead of transmitting messages\narbitrarily, a scene knowledge graph is devised to differentiate prior\nknowledge from various types of the road genome. We evaluate TopoNet on the\nchallenging scene understanding benchmark, OpenLane-V2, where our approach\noutperforms all previous works by a great margin on all perceptual and\ntopological metrics. The code is released at\nhttps://github.com/OpenDriveLab/TopoNet\n","authors":["Tianyu Li","Li Chen","Huijie Wang","Yang Li","Jiazhi Yang","Xiangwei Geng","Shengyin Jiang","Yuting Wang","Hang Xu","Chunjing Xu","Junchi Yan","Ping Luo","Hongyang Li"],"pdf_url":"https://arxiv.org/pdf/2304.05277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10058v2","updated":"2023-08-28T10:46:22Z","published":"2023-03-17T15:38:39Z","title":"No Fear of Classifier Biases: Neural Collapse Inspired Federated\n Learning with Synthetic and Fixed Classifier","summary":" Data heterogeneity is an inherent challenge that hinders the performance of\nfederated learning (FL). Recent studies have identified the biased classifiers\nof local models as the key bottleneck. Previous attempts have used classifier\ncalibration after FL training, but this approach falls short in improving the\npoor feature representations caused by training-time classifier biases.\nResolving the classifier bias dilemma in FL requires a full understanding of\nthe mechanisms behind the classifier. Recent advances in neural collapse have\nshown that the classifiers and feature prototypes under perfect training\nscenarios collapse into an optimal structure called simplex equiangular tight\nframe (ETF). Building on this neural collapse insight, we propose a solution to\nthe FL's classifier bias problem by utilizing a synthetic and fixed ETF\nclassifier during training. The optimal classifier structure enables all\nclients to learn unified and optimal feature representations even under\nextremely heterogeneous data. We devise several effective modules to better\nadapt the ETF structure in FL, achieving both high generalization and\npersonalization. Extensive experiments demonstrate that our method achieves\nstate-of-the-art performances on CIFAR-10, CIFAR-100, and Tiny-ImageNet.\n","authors":["Zexi Li","Xinyi Shang","Rui He","Tao Lin","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2303.10058v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14481v1","updated":"2023-08-28T10:43:53Z","published":"2023-08-28T10:43:53Z","title":"Group Regression for Query Based Object Detection and Tracking","summary":" Group regression is commonly used in 3D object detection to predict box\nparameters of similar classes in a joint head, aiming to benefit from\nsimilarities while separating highly dissimilar classes. For query-based\nperception methods, this has, so far, not been feasible. We close this gap and\npresent a method to incorporate multi-class group regression, especially\ndesigned for the 3D domain in the context of autonomous driving, into existing\nattention and query-based perception approaches. We enhance a transformer based\njoint object detection and tracking model with this approach, and thoroughly\nevaluate its behavior and performance. For group regression, the classes of the\nnuScenes dataset are divided into six groups of similar shape and prevalence,\neach being regressed by a dedicated head. We show that the proposed method is\napplicable to many existing transformer based perception approaches and can\nbring potential benefits. The behavior of query group regression is thoroughly\nanalyzed in comparison to a unified regression head, e.g. in terms of\nclass-switching behavior and distribution of the output parameters. The\nproposed method offers many possibilities for further research, such as in the\ndirection of deep multi-hypotheses tracking.\n","authors":["Felicia Ruppel","Florian Faion","Claudius Gläser","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2308.14481v1.pdf","comment":"Accepted for publication at the 2023 26th IEEE International\n Conference on Intelligent Transportation Systems (ITSC 2023), Sep 24-28,\n 2023, in Bilbao, Spain"},{"id":"http://arxiv.org/abs/2307.07205v3","updated":"2023-08-28T10:41:07Z","published":"2023-07-14T07:42:45Z","title":"Multimodal Motion Conditioned Diffusion Model for Skeleton-based Video\n Anomaly Detection","summary":" Anomalies are rare and anomaly detection is often therefore framed as\nOne-Class Classification (OCC), i.e. trained solely on normalcy. Leading OCC\ntechniques constrain the latent representations of normal motions to limited\nvolumes and detect as abnormal anything outside, which accounts satisfactorily\nfor the openset'ness of anomalies. But normalcy shares the same openset'ness\nproperty since humans can perform the same action in several ways, which the\nleading techniques neglect. We propose a novel generative model for video\nanomaly detection (VAD), which assumes that both normality and abnormality are\nmultimodal. We consider skeletal representations and leverage state-of-the-art\ndiffusion probabilistic models to generate multimodal future human poses. We\ncontribute a novel conditioning on the past motion of people and exploit the\nimproved mode coverage capabilities of diffusion processes to generate\ndifferent-but-plausible future motions. Upon the statistical aggregation of\nfuture modes, an anomaly is detected when the generated set of motions is not\npertinent to the actual future. We validate our model on 4 established\nbenchmarks: UBnormal, HR-UBnormal, HR-STC, and HR-Avenue, with extensive\nexperiments surpassing state-of-the-art results.\n","authors":["Alessandro Flaborea","Luca Collorone","Guido D'Amely","Stefano D'Arrigo","Bardh Prenkaj","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2307.07205v3.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.14480v1","updated":"2023-08-28T10:40:16Z","published":"2023-08-28T10:40:16Z","title":"Priority-Centric Human Motion Generation in Discrete Latent Space","summary":" Text-to-motion generation is a formidable task, aiming to produce human\nmotions that align with the input text while also adhering to human\ncapabilities and physical laws. While there have been advancements in diffusion\nmodels, their application in discrete spaces remains underexplored. Current\nmethods often overlook the varying significance of different motions, treating\nthem uniformly. It is essential to recognize that not all motions hold the same\nrelevance to a particular textual description. Some motions, being more salient\nand informative, should be given precedence during generation. In response, we\nintroduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which\nutilizes a Transformer-based VQ-VAE to derive a concise, discrete motion\nrepresentation, incorporating a global self-attention mechanism and a\nregularization term to counteract code collapse. We also present a motion\ndiscrete diffusion model that employs an innovative noise schedule, determined\nby the significance of each motion token within the entire motion sequence.\nThis approach retains the most salient motions during the reverse diffusion\nprocess, leading to more semantically rich and varied motions. Additionally, we\nformulate two strategies to gauge the importance of motion tokens, drawing from\nboth textual and visual indicators. Comprehensive experiments on the HumanML3D\nand KIT-ML datasets confirm that our model surpasses existing techniques in\nfidelity and diversity, particularly for intricate textual descriptions.\n","authors":["Hanyang Kong","Kehong Gong","Dongze Lian","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14480v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14477v1","updated":"2023-08-28T10:30:08Z","published":"2023-08-28T10:30:08Z","title":"Medical needle tip tracking based on Optical Imaging and AI","summary":" Deep needle insertion to a target often poses a huge challenge, requiring a\ncombination of specialized skills, assistive technology, and extensive\ntraining. One of the frequently encountered medical scenarios demanding such\nexpertise includes the needle insertion into a femoral vessel in the groin.\nAfter the access to the femoral vessel, various medical procedures, such as\ncardiac catheterization and extracorporeal membrane oxygenation (ECMO) can be\nperformed. However, even with the aid of Ultrasound imaging, achieving\nsuccessful insertion can necessitate multiple attempts due to the complexities\nof anatomy and tissue deformation. To address this challenge, this paper\npresents an innovative technology for needle tip real-time tracking, aiming for\nenhanced needle insertion guidance. Specifically, our approach revolves around\nthe creation of scattering imaging using an optical fiber-equipped needle, and\nuses Convolutional Neural Network (CNN) based algorithms to enable real-time\nestimation of the needle tip's position and orientation during insertion\nprocedures. The efficacy of the proposed technology was rigorously evaluated\nthrough three experiments. The first two experiments involved rubber and bacon\nphantoms to simulate groin anatomy. The positional errors averaging 2.3+1.5mm\nand 2.0+1.2mm, and the orientation errors averaging 0.2+0.11rad and\n0.16+0.1rad. Furthermore, the system's capabilities were validated through\nexperiments conducted on fresh porcine phantom mimicking more complex\nanatomical structures, yielding positional accuracy results of 3.2+3.1mm and\norientational accuracy of 0.19+0.1rad. Given the average femoral arterial\nradius of 4 to 5mm, the proposed system is demonstrated with a great potential\nfor precise needle guidance in femoral artery insertion procedures. In\naddition, the findings highlight the broader potential applications of the\nsystem in the medical field.\n","authors":["Zhuoqi Cheng","Simon Lyck Bjært Sørensen","Mikkel Werge Olsen","René Lynge Eriksen","Thiusius Rajeeth Savarimuthu"],"pdf_url":"https://arxiv.org/pdf/2308.14477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08451v3","updated":"2023-08-28T10:22:23Z","published":"2023-04-17T17:21:21Z","title":"Efficient Video Action Detection with Token Dropout and Context\n Refinement","summary":" Streaming video clips with large-scale video tokens impede vision\ntransformers (ViTs) for efficient recognition, especially in video action\ndetection where sufficient spatiotemporal representations are required for\nprecise actor identification. In this work, we propose an end-to-end framework\nfor efficient video action detection (EVAD) based on vanilla ViTs. Our EVAD\nconsists of two specialized designs for video action detection. First, we\npropose a spatiotemporal token dropout from a keyframe-centric perspective. In\na video clip, we maintain all tokens from its keyframe, preserve tokens\nrelevant to actor motions from other frames, and drop out the remaining tokens\nin this clip. Second, we refine scene context by leveraging remaining tokens\nfor better recognizing actor identities. The region of interest (RoI) in our\naction detector is expanded into temporal domain. The captured spatiotemporal\nactor identity representations are refined via scene context in a decoder with\nthe attention mechanism. These two designs make our EVAD efficient while\nmaintaining accuracy, which is validated on three benchmark datasets (i.e.,\nAVA, UCF101-24, JHMDB). Compared to the vanilla ViT backbone, our EVAD reduces\nthe overall GFLOPs by 43% and improves real-time inference speed by 40% with no\nperformance degradation. Moreover, even at similar computational costs, our\nEVAD can improve the performance by 1.1 mAP with higher resolution inputs. Code\nis available at https://github.com/MCG-NJU/EVAD.\n","authors":["Lei Chen","Zhan Tong","Yibing Song","Gangshan Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2304.08451v3.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2308.14469v1","updated":"2023-08-28T10:15:57Z","published":"2023-08-28T10:15:57Z","title":"Pixel-Aware Stable Diffusion for Realistic Image Super-resolution and\n Personalized Stylization","summary":" Realistic image super-resolution (Real-ISR) aims to reproduce perceptually\nrealistic image details from a low-quality input. The commonly used adversarial\ntraining based Real-ISR methods often introduce unnatural visual artifacts and\nfail to generate realistic textures for natural scene images. The recently\ndeveloped generative stable diffusion models provide a potential solution to\nReal-ISR with pre-learned strong image priors. However, the existing methods\nalong this line either fail to keep faithful pixel-wise image structures or\nresort to extra skipped connections to reproduce details, which requires\nadditional training in image space and limits their extension to other related\ntasks in latent space such as image stylization. In this work, we propose a\npixel-aware stable diffusion (PASD) network to achieve robust Real-ISR as well\nas personalized stylization. In specific, a pixel-aware cross attention module\nis introduced to enable diffusion models perceiving image local structures in\npixel-wise level, while a degradation removal module is used to extract\ndegradation insensitive features to guide the diffusion process together with\nimage high level information. By simply replacing the base diffusion model with\na personalized one, our method can generate diverse stylized images without the\nneed to collect pairwise training data. PASD can be easily integrated into\nexisting diffusion models such as Stable Diffusion. Experiments on Real-ISR and\npersonalized stylization demonstrate the effectiveness of our proposed\napproach. The source code and models can be found at\n\\url{https://github.com/yangxy/PASD}.\n","authors":["Tao Yang","Peiran Ren","Xuansong Xie","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00247v3","updated":"2023-08-28T10:12:03Z","published":"2023-08-01T03:00:36Z","title":"Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive\n Review","summary":" The advent of deep learning has brought a revolutionary transformation to\nimage denoising techniques. However, the persistent challenge of acquiring\nnoise-clean pairs for supervised methods in real-world scenarios remains\nformidable, necessitating the exploration of more practical self-supervised\nimage denoising. This paper focuses on self-supervised image denoising methods\nthat offer effective solutions to address this challenge. Our comprehensive\nreview thoroughly analyzes the latest advancements in self-supervised image\ndenoising approaches, categorizing them into three distinct classes: General\nmethods, Blind Spot Network (BSN)-based methods, and Transformer-based methods.\nFor each class, we provide a concise theoretical analysis along with their\npractical applications. To assess the effectiveness of these methods, we\npresent both quantitative and qualitative experimental results on various\ndatasets, utilizing classical algorithms as benchmarks. Additionally, we\ncritically discuss the current limitations of these methods and propose\npromising directions for future research. By offering a detailed overview of\nrecent developments in self-supervised image denoising, this review serves as\nan invaluable resource for researchers and practitioners in the field,\nfacilitating a deeper understanding of this emerging domain and inspiring\nfurther advancements.\n","authors":["Dan Zhang","Fangfang Zhou","Xiao Yang","Yuan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.00247v3.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2308.14466v1","updated":"2023-08-28T10:04:06Z","published":"2023-08-28T10:04:06Z","title":"Improving the performance of object detection by preserving label\n distribution","summary":" Object detection is a task that performs position identification and label\nclassification of objects in images or videos. The information obtained through\nthis process plays an essential role in various tasks in the field of computer\nvision. In object detection, the data utilized for training and validation\ntypically originate from public datasets that are well-balanced in terms of the\nnumber of objects ascribed to each class in an image. However, in real-world\nscenarios, handling datasets with much greater class imbalance, i.e., very\ndifferent numbers of objects for each class , is much more common, and this\nimbalance may reduce the performance of object detection when predicting unseen\ntest images. In our study, thus, we propose a method that evenly distributes\nthe classes in an image for training and validation, solving the class\nimbalance problem in object detection. Our proposed method aims to maintain a\nuniform class distribution through multi-label stratification. We tested our\nproposed method not only on public datasets that typically exhibit balanced\nclass distribution but also on custom datasets that may have imbalanced class\ndistribution. We found that our proposed method was more effective on datasets\ncontaining severe imbalance and less data. Our findings indicate that the\nproposed method can be effectively used on datasets with substantially\nimbalanced class distribution.\n","authors":["Heewon Lee","Sangtae Ahn"],"pdf_url":"https://arxiv.org/pdf/2308.14466v1.pdf","comment":"Code is available at\n https://github.com/leeheewon-01/YOLOstratifiedKFold/tree/main"},{"id":"http://arxiv.org/abs/2212.04740v3","updated":"2023-08-28T09:59:28Z","published":"2022-12-09T09:36:59Z","title":"Predicting Shape Development: a Riemannian Method","summary":" Predicting the future development of an anatomical shape from a single\nbaseline observation is a challenging task. But it can be essential for\nclinical decision-making. Research has shown that it should be tackled in\ncurved shape spaces, as (e.g., disease-related) shape changes frequently expose\nnonlinear characteristics. We thus propose a novel prediction method that\nencodes the whole shape in a Riemannian shape space. It then learns a simple\nprediction technique founded on hierarchical statistical modeling of\nlongitudinal training data. When applied to predict the future development of\nthe shape of the right hippocampus under Alzheimer's disease and to human body\nmotion, it outperforms deep learning-supported variants as well as\nstate-of-the-art.\n","authors":["Doğa Türkseven","Islem Rekik","Christoph von Tycowicz","Martin Hanik"],"pdf_url":"https://arxiv.org/pdf/2212.04740v3.pdf","comment":"new experiment with human motion data; fixed vertex-assignment bug in\n the prediction of the varifold-based method"},{"id":"http://arxiv.org/abs/2308.14461v1","updated":"2023-08-28T09:58:34Z","published":"2023-08-28T09:58:34Z","title":"Spatio-Temporal Analysis of Patient-Derived Organoid Videos Using Deep\n Learning for the Prediction of Drug Efficacy","summary":" Over the last ten years, Patient-Derived Organoids (PDOs) emerged as the most\nreliable technology to generate ex-vivo tumor avatars. PDOs retain the main\ncharacteristics of their original tumor, making them a system of choice for\npre-clinical and clinical studies. In particular, PDOs are attracting interest\nin the field of Functional Precision Medicine (FPM), which is based upon an\nex-vivo drug test in which living tumor cells (such as PDOs) from a specific\npatient are exposed to a panel of anti-cancer drugs. Currently, the Adenosine\nTriphosphate (ATP) based cell viability assay is the gold standard test to\nassess the sensitivity of PDOs to drugs. The readout is measured at the end of\nthe assay from a global PDO population and therefore does not capture single\nPDO responses and does not provide time resolution of drug effect. To this end,\nin this study, we explore for the first time the use of powerful large\nfoundation models for the automatic processing of PDO data. In particular, we\npropose a novel imaging-based high-throughput screening method to assess\nreal-time drug efficacy from a time-lapse microscopy video of PDOs. The\nrecently proposed SAM algorithm for segmentation and DINOv2 model are adapted\nin a comprehensive pipeline for processing PDO microscopy frames. Moreover, an\nattention mechanism is proposed for fusing temporal and spatial features in a\nmultiple instance learning setting to predict ATP. We report better results\nthan other non-time-resolved methods, indicating that the temporality of data\nis an important factor for the prediction of ATP. Extensive ablations shed\nlight on optimizing the experimental setting and automating the prediction both\nin real-time and for forecasting.\n","authors":["Leo Fillioux","Emilie Gontran","Jérôme Cartry","Jacques RR Mathieu","Sabrina Bedja","Alice Boilève","Paul-Henry Cournède","Fanny Jaulin","Stergios Christodoulidis","Maria Vakalopoulou"],"pdf_url":"https://arxiv.org/pdf/2308.14461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14448v1","updated":"2023-08-28T09:35:13Z","published":"2023-08-28T09:35:13Z","title":"ExpCLIP: Bridging Text and Facial Expressions via Semantic Alignment","summary":" The objective of stylized speech-driven facial animation is to create\nanimations that encapsulate specific emotional expressions. Existing methods\noften depend on pre-established emotional labels or facial expression\ntemplates, which may limit the necessary flexibility for accurately conveying\nuser intent. In this research, we introduce a technique that enables the\ncontrol of arbitrary styles by leveraging natural language as emotion prompts.\nThis technique presents benefits in terms of both flexibility and\nuser-friendliness. To realize this objective, we initially construct a\nText-Expression Alignment Dataset (TEAD), wherein each facial expression is\npaired with several prompt-like descriptions.We propose an innovative automatic\nannotation method, supported by Large Language Models (LLMs), to expedite the\ndataset construction, thereby eliminating the substantial expense of manual\nannotation. Following this, we utilize TEAD to train a CLIP-based model, termed\nExpCLIP, which encodes text and facial expressions into semantically aligned\nstyle embeddings. The embeddings are subsequently integrated into the facial\nanimation generator to yield expressive and controllable facial animations.\nGiven the limited diversity of facial emotions in existing speech-driven facial\nanimation training data, we further introduce an effective Expression Prompt\nAugmentation (EPA) mechanism to enable the animation generator to support\nunprecedented richness in style control. Comprehensive experiments illustrate\nthat our method accomplishes expressive facial animation generation and offers\nenhanced flexibility in effectively conveying the desired style.\n","authors":["Yicheng Zhong","Huawei Wei","Peiji Yang","Zhisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14437v1","updated":"2023-08-28T09:23:18Z","published":"2023-08-28T09:23:18Z","title":"Data-iterative Optimization Score Model for Stable Ultra-Sparse-View CT\n Reconstruction","summary":" Score-based generative models (SGMs) have gained prominence in sparse-view CT\nreconstruction for their precise sampling of complex distributions. In\nSGM-based reconstruction, data consistency in the score-based diffusion model\nensures close adherence of generated samples to observed data distribution,\ncrucial for improving image quality. Shortcomings in data consistency\ncharacterization manifest in three aspects. Firstly, data from the optimization\nprocess can lead to artifacts in reconstructed images. Secondly, it often\nneglects that the generation model and original data constraints are\nindependently completed, fragmenting unity. Thirdly, it predominantly focuses\non constraining intermediate results in the inverse sampling process, rather\nthan ideal real images. Thus, we propose an iterative optimization data scoring\nmodel. This paper introduces the data-iterative optimization score-based model\n(DOSM), integrating innovative data consistency into the Stochastic\nDifferential Equation, a valuable constraint for ultra-sparse-view CT\nreconstruction. The novelty of this data consistency element lies in its sole\nreliance on original measurement data to confine generation outcomes,\neffectively balancing measurement data and generative model constraints.\nAdditionally, we pioneer an inference strategy that traces back from current\niteration results to ideal truth, enhancing reconstruction stability. We\nleverage conventional iteration techniques to optimize DOSM updates.\nQuantitative and qualitative results from 23 views of numerical and clinical\ncardiac datasets demonstrate DOSM's superiority over other methods. Remarkably,\neven with 10 views, our method achieves excellent performance.\n","authors":["Weiwen Wu","Yanyang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14437v1.pdf","comment":"11 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.14419v1","updated":"2023-08-28T08:59:57Z","published":"2023-08-28T08:59:57Z","title":"Graph-based Asynchronous Event Processing for Rapid Object Recognition","summary":" Different from traditional video cameras, event cameras capture asynchronous\nevents stream in which each event encodes pixel location, trigger time, and the\npolarity of the brightness changes. In this paper, we introduce a novel\ngraph-based framework for event cameras, namely SlideGCN. Unlike some recent\ngraph-based methods that use groups of events as input, our approach can\nefficiently process data event-by-event, unlock the low latency nature of\nevents data while still maintaining the graph's structure internally. For fast\ngraph construction, we develop a radius search algorithm, which better exploits\nthe partial regular structure of event cloud against k-d tree based generic\nmethods. Experiments show that our method reduces the computational complexity\nup to 100 times with respect to current graph-based methods while keeping\nstate-of-the-art performance on object recognition. Moreover, we verify the\nsuperiority of event-wise processing with our method. When the state becomes\nstable, we can give a prediction with high confidence, thus making an early\nrecognition. Project page: \\url{https://zju3dv.github.io/slide_gcn/}.\n","authors":["Yijin Li","Han Zhou","Bangbang Yang","Ye Zhang","Zhaopeng Cui","Hujun Bao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14419v1.pdf","comment":"Accepted to ICCV 2021. Project Page:\n https://zju3dv.github.io/slide_gcn/"},{"id":"http://arxiv.org/abs/2303.11917v2","updated":"2023-08-28T08:54:47Z","published":"2023-03-21T15:08:35Z","title":"Efficient Decision-based Black-box Patch Attacks on Video Recognition","summary":" Although Deep Neural Networks (DNNs) have demonstrated excellent performance,\nthey are vulnerable to adversarial patches that introduce perceptible and\nlocalized perturbations to the input. Generating adversarial patches on images\nhas received much attention, while adversarial patches on videos have not been\nwell investigated. Further, decision-based attacks, where attackers only access\nthe predicted hard labels by querying threat models, have not been well\nexplored on video models either, even if they are practical in real-world video\nrecognition scenes. The absence of such studies leads to a huge gap in the\nrobustness assessment for video models. To bridge this gap, this work first\nexplores decision-based patch attacks on video models. We analyze that the huge\nparameter space brought by videos and the minimal information returned by\ndecision-based models both greatly increase the attack difficulty and query\nburden. To achieve a query-efficient attack, we propose a spatial-temporal\ndifferential evolution (STDE) framework. First, STDE introduces target videos\nas patch textures and only adds patches on keyframes that are adaptively\nselected by temporal difference. Second, STDE takes minimizing the patch area\nas the optimization objective and adopts spatialtemporal mutation and crossover\nto search for the global optimum without falling into the local optimum.\nExperiments show STDE has demonstrated state-of-the-art performance in terms of\nthreat, efficiency and imperceptibility. Hence, STDE has the potential to be a\npowerful tool for evaluating the robustness of video recognition models.\n","authors":["Kaixun Jiang","Zhaoyu Chen","Hao Huang","Jiafeng Wang","Dingkang Yang","Bo Li","Yan Wang","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.11917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14418v1","updated":"2023-08-28T08:54:27Z","published":"2023-08-28T08:54:27Z","title":"Multi-Scale and Multi-Layer Contrastive Learning for Domain\n Generalization","summary":" During the past decade, deep neural networks have led to fast-paced progress\nand significant achievements in computer vision problems, for both academia and\nindustry. Yet despite their success, state-of-the-art image classification\napproaches fail to generalize well in previously unseen visual contexts, as\nrequired by many real-world applications. In this paper, we focus on this\ndomain generalization (DG) problem and argue that the generalization ability of\ndeep convolutional neural networks can be improved by taking advantage of\nmulti-layer and multi-scaled representations of the network. We introduce a\nframework that aims at improving domain generalization of image classifiers by\ncombining both low-level and high-level features at multiple scales, enabling\nthe network to implicitly disentangle representations in its latent space and\nlearn domain-invariant attributes of the depicted objects. Additionally, to\nfurther facilitate robust representation learning, we propose a novel objective\nfunction, inspired by contrastive learning, which aims at constraining the\nextracted representations to remain invariant under distribution shifts. We\ndemonstrate the effectiveness of our method by evaluating on the domain\ngeneralization datasets of PACS, VLCS, Office-Home and NICO. Through extensive\nexperimentation, we show that our model is able to surpass the performance of\nprevious DG methods and consistently produce competitive and state-of-the-art\nresults in all datasets.\n","authors":["Aristotelis Ballas","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2308.14418v1.pdf","comment":"Manuscript under review at: IEEE Transactions on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2308.14414v1","updated":"2023-08-28T08:51:20Z","published":"2023-08-28T08:51:20Z","title":"INF: Implicit Neural Fusion for LiDAR and Camera","summary":" Sensor fusion has become a popular topic in robotics. However, conventional\nfusion methods encounter many difficulties, such as data representation\ndifferences, sensor variations, and extrinsic calibration. For example, the\ncalibration methods used for LiDAR-camera fusion often require manual operation\nand auxiliary calibration targets. Implicit neural representations (INRs) have\nbeen developed for 3D scenes, and the volume density distribution involved in\nan INR unifies the scene information obtained by different types of sensors.\nTherefore, we propose implicit neural fusion (INF) for LiDAR and camera. INF\nfirst trains a neural density field of the target scene using LiDAR frames.\nThen, a separate neural color field is trained using camera images and the\ntrained neural density field. Along with the training process, INF both\nestimates LiDAR poses and optimizes extrinsic parameters. Our experiments\ndemonstrate the high accuracy and stable performance of the proposed method.\n","authors":["Shuyi Zhou","Shuxiang Xie","Ryoichi Ishikawa","Ken Sakurada","Masaki Onishi","Takeshi Oishi"],"pdf_url":"https://arxiv.org/pdf/2308.14414v1.pdf","comment":"Accepted to IROS 2023. (project page:\n https://ShuyiZhou495.github.io/inf-project-page/)"},{"id":"http://arxiv.org/abs/2308.14409v1","updated":"2023-08-28T08:47:06Z","published":"2023-08-28T08:47:06Z","title":"Steerable Conditional Diffusion for Out-of-Distribution Adaptation in\n Imaging Inverse Problems","summary":" Denoising diffusion models have emerged as the go-to framework for solving\ninverse problems in imaging. A critical concern regarding these models is their\nperformance on out-of-distribution (OOD) tasks, which remains an under-explored\nchallenge. Realistic reconstructions inconsistent with the measured data can be\ngenerated, hallucinating image features that are uniquely present in the\ntraining dataset. To simultaneously enforce data-consistency and leverage\ndata-driven priors, we introduce a novel sampling framework called Steerable\nConditional Diffusion. This framework adapts the denoising network specifically\nto the available measured data. Utilising our proposed method, we achieve\nsubstantial enhancements in OOD performance across diverse imaging modalities,\nadvancing the robust deployment of denoising diffusion models in real-world\napplications.\n","authors":["Riccardo Barbano","Alexander Denker","Hyungjin Chung","Tae Hoon Roh","Simon Arrdige","Peter Maass","Bangti Jin","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2308.14409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14400v1","updated":"2023-08-28T08:33:45Z","published":"2023-08-28T08:33:45Z","title":"Semi-Supervised Semantic Depth Estimation using Symbiotic Transformer\n and NearFarMix Augmentation","summary":" In computer vision, depth estimation is crucial for domains like robotics,\nautonomous vehicles, augmented reality, and virtual reality. Integrating\nsemantics with depth enhances scene understanding through reciprocal\ninformation sharing. However, the scarcity of semantic information in datasets\nposes challenges. Existing convolutional approaches with limited local\nreceptive fields hinder the full utilization of the symbiotic potential between\ndepth and semantics. This paper introduces a dataset-invariant semi-supervised\nstrategy to address the scarcity of semantic information. It proposes the Depth\nSemantics Symbiosis module, leveraging the Symbiotic Transformer for achieving\ncomprehensive mutual awareness by information exchange within both local and\nglobal contexts. Additionally, a novel augmentation, NearFarMix is introduced\nto combat overfitting and compensate both depth-semantic tasks by strategically\nmerging regions from two images, generating diverse and structurally consistent\nsamples with enhanced control. Extensive experiments on NYU-Depth-V2 and KITTI\ndatasets demonstrate the superiority of our proposed techniques in indoor and\noutdoor environments.\n","authors":["Md Awsafur Rahman","Shaikh Anowarul Fattah"],"pdf_url":"https://arxiv.org/pdf/2308.14400v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2202.06599v3","updated":"2023-08-28T08:27:30Z","published":"2022-02-14T10:40:51Z","title":"Multi-Atlas Segmentation and Spatial Alignment of the Human Embryo in\n First Trimester 3D Ultrasound","summary":" Segmentation and spatial alignment of ultrasound (US) imaging data acquired\nin the in first trimester are crucial for monitoring human embryonic growth and\ndevelopment throughout this crucial period of life. Current approaches are\neither manual or semi-automatic and are therefore very time-consuming and prone\nto errors. To automate these tasks, we propose a multi-atlas framework for\nautomatic segmentation and spatial alignment of the embryo using deep learning\nwith minimal supervision. Our framework learns to register the embryo to an\natlas, which consists of the US images acquired at a range of gestational age\n(GA), segmented and spatially aligned to a predefined standard orientation.\nFrom this, we can derive the segmentation of the embryo and put the embryo in\nstandard orientation. US images acquired at 8+0 till 12+6 weeks GA were used\nand eight subjects were selected as atlas. We evaluated different fusion\nstrategies to incorporate multiple atlases: 1) training the framework using\natlas images from a single subject, 2) training the framework with data of all\navailable atlases and 3) ensembling of the frameworks trained per subject. To\nevaluate the performance, we calculated the Dice score over the test set. We\nfound that training the framework using all available atlases outperformed\nensembling and gave similar results compared to the best of all frameworks\ntrained on a single subject. Furthermore, we found that selecting images from\nthe four atlases closest in GA out of all available atlases, regardless of the\nindividual quality, gave the best results with a median Dice score of 0.72. We\nconclude that our framework can accurately segment and spatially align the\nembryo in first trimester 3D US images and is robust for the variation in\nquality that existed in the available atlases.\n","authors":["W. A. P. Bastiaansen","M. Rousian","R. P. M. Steegers-Theunissen","W. J. Niessen","A. H. J. Koning","S. Klein"],"pdf_url":"https://arxiv.org/pdf/2202.06599v3.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://www.melba-journal.org/papers/2022:020.html"},{"id":"http://arxiv.org/abs/2308.14397v1","updated":"2023-08-28T08:24:25Z","published":"2023-08-28T08:24:25Z","title":"Ensemble of Anchor-Free Models for Robust Bangla Document Layout\n Segmentation","summary":" In this research paper, we present an innovative system designed for the\npurpose of segmenting the layout of Bangla documents. Our methodology involves\nutilizing a sophisticated collection of YOLOv8 models, meticulously adapted for\nthe DL Sprint 2.0 - BUET CSE Fest 2023 Competition that centers around Bangla\ndocument layout segmentation. Our primary focus lies in elevating various\nelements of the task, including techniques like image augmentation, model\narchitecture, and the use of model ensembles. We intentionally lower the\nquality of a subset of document images to enhance the resilience of model\ntraining, consequently leading to an improvement in our cross-validation score.\nEmploying Bayesian optimization, we determine the optimal confidence and IoU\nthresholds for our model ensemble. Through our approach, we successfully\nshowcase the effectiveness of amalgamating anchor-free models to achieve robust\nlayout segmentation in Bangla documents.\n","authors":["U Mong Sain Chak","Md. Asib Rahman"],"pdf_url":"https://arxiv.org/pdf/2308.14397v1.pdf","comment":"4 pages, 5 figures, 6 Tables"},{"id":"http://arxiv.org/abs/2308.14395v1","updated":"2023-08-28T08:20:30Z","published":"2023-08-28T08:20:30Z","title":"UMMAFormer: A Universal Multimodal-adaptive Transformer Framework for\n Temporal Forgery Localization","summary":" The emergence of artificial intelligence-generated content (AIGC) has raised\nconcerns about the authenticity of multimedia content in various fields.\nHowever, existing research for forgery content detection has focused mainly on\nbinary classification tasks of complete videos, which has limited applicability\nin industrial settings. To address this gap, we propose UMMAFormer, a novel\nuniversal transformer framework for temporal forgery localization (TFL) that\npredicts forgery segments with multimodal adaptation. Our approach introduces a\nTemporal Feature Abnormal Attention (TFAA) module based on temporal feature\nreconstruction to enhance the detection of temporal differences. We also design\na Parallel Cross-Attention Feature Pyramid Network (PCA-FPN) to optimize the\nFeature Pyramid Network (FPN) for subtle feature enhancement. To evaluate the\nproposed method, we contribute a novel Temporal Video Inpainting Localization\n(TVIL) dataset specifically tailored for video inpainting scenes. Our\nexperiments show that our approach achieves state-of-the-art performance on\nbenchmark datasets, including Lav-DF, TVIL, and Psynd, significantly\noutperforming previous methods. The code and data are available at\nhttps://github.com/ymhzyj/UMMAFormer/.\n","authors":["Rui Zhang","Hongxia Wang","Mingshan Du","Hanqing Liu","Yang Zhou","Qiang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.14395v1.pdf","comment":"11 pages, 8 figures, 66 references. This paper has been accepted for\n ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.14392v1","updated":"2023-08-28T08:15:43Z","published":"2023-08-28T08:15:43Z","title":"1st Place Solution for the 5th LSVOS Challenge: Video Instance\n Segmentation","summary":" Video instance segmentation is a challenging task that serves as the\ncornerstone of numerous downstream applications, including video editing and\nautonomous driving. In this report, we present further improvements to the SOTA\nVIS method, DVIS. First, we introduce a denoising training strategy for the\ntrainable tracker, allowing it to achieve more stable and accurate object\ntracking in complex and long videos. Additionally, we explore the role of\nvisual foundation models in video instance segmentation. By utilizing a frozen\nVIT-L model pre-trained by DINO v2, DVIS demonstrates remarkable performance\nimprovements. With these enhancements, our method achieves 57.9 AP and 56.0 AP\nin the development and test phases, respectively, and ultimately ranked 1st in\nthe VIS track of the 5th LSVOS Challenge. The code will be available at\nhttps://github.com/zhang-tao-whu/DVIS.\n","authors":["Tao Zhang","Xingye Tian","Yikang Zhou","Yu Wu","Shunping Ji","Cilin Yan","Xuebo Wang","Xin Tao","Yuan Zhang","Pengfei Wan"],"pdf_url":"https://arxiv.org/pdf/2308.14392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14391v1","updated":"2023-08-28T08:14:20Z","published":"2023-08-28T08:14:20Z","title":"FIRE: Food Image to REcipe generation","summary":" Food computing has emerged as a prominent multidisciplinary field of research\nin recent years. An ambitious goal of food computing is to develop end-to-end\nintelligent systems capable of autonomously producing recipe information for a\nfood image. Current image-to-recipe methods are retrieval-based and their\nsuccess depends heavily on the dataset size and diversity, as well as the\nquality of learned embeddings. Meanwhile, the emergence of powerful\nattention-based vision and language models presents a promising avenue for\naccurate and generalizable recipe generation, which has yet to be extensively\nexplored. This paper proposes FIRE, a novel multimodal methodology tailored to\nrecipe generation in the food computing domain, which generates the food title,\ningredients, and cooking instructions based on input food images. FIRE\nleverages the BLIP model to generate titles, utilizes a Vision Transformer with\na decoder for ingredient extraction, and employs the T5 model to generate\nrecipes incorporating titles and ingredients as inputs. We showcase two\npractical applications that can benefit from integrating FIRE with large\nlanguage model prompting: recipe customization to fit recipes to user\npreferences and recipe-to-code transformation to enable automated cooking\nprocesses. Our experimental findings validate the efficacy of our proposed\napproach, underscoring its potential for future advancements and widespread\nadoption in food computing.\n","authors":["Prateek Chhikara","Dhiraj Chaurasia","Yifan Jiang","Omkar Masur","Filip Ilievski"],"pdf_url":"https://arxiv.org/pdf/2308.14391v1.pdf","comment":"5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2303.11325v2","updated":"2023-08-28T08:00:52Z","published":"2023-03-20T17:59:03Z","title":"GeoMIM: Towards Better 3D Knowledge Transfer via Masked Image Modeling\n for Multi-view 3D Understanding","summary":" Multi-view camera-based 3D detection is a challenging problem in computer\nvision. Recent works leverage a pretrained LiDAR detection model to transfer\nknowledge to a camera-based student network. However, we argue that there is a\nmajor domain gap between the LiDAR BEV features and the camera-based BEV\nfeatures, as they have different characteristics and are derived from different\nsources. In this paper, we propose Geometry Enhanced Masked Image Modeling\n(GeoMIM) to transfer the knowledge of the LiDAR model in a pretrain-finetune\nparadigm for improving the multi-view camera-based 3D detection. GeoMIM is a\nmulti-camera vision transformer with Cross-View Attention (CVA) blocks that\nuses LiDAR BEV features encoded by the pretrained BEV model as learning\ntargets. During pretraining, GeoMIM's decoder has a semantic branch completing\ndense perspective-view features and the other geometry branch reconstructing\ndense perspective-view depth maps. The depth branch is designed to be\ncamera-aware by inputting the camera's parameters for better transfer\ncapability. Extensive results demonstrate that GeoMIM outperforms existing\nmethods on nuScenes benchmark, achieving state-of-the-art performance for\ncamera-based 3D object detection and 3D segmentation. Code and pretrained\nmodels are available at https://github.com/Sense-X/GeoMIM.\n","authors":["Jihao Liu","Tai Wang","Boxiao Liu","Qihang Zhang","Yu Liu","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2303.11325v2.pdf","comment":"Release code: https://github.com/Sense-X/GeoMIM"},{"id":"http://arxiv.org/abs/2212.12734v3","updated":"2023-08-28T07:58:48Z","published":"2022-12-24T13:35:31Z","title":"DDH-QA: A Dynamic Digital Humans Quality Assessment Database","summary":" In recent years, large amounts of effort have been put into pushing forward\nthe real-world application of dynamic digital human (DDH). However, most\ncurrent quality assessment research focuses on evaluating static 3D models and\nusually ignores motion distortions. Therefore, in this paper, we construct a\nlarge-scale dynamic digital human quality assessment (DDH-QA) database with\ndiverse motion content as well as multiple distortions to comprehensively study\nthe perceptual quality of DDHs. Both model-based distortion (noise,\ncompression) and motion-based distortion (binding error, motion unnaturalness)\nare taken into consideration. Ten types of common motion are employed to drive\nthe DDHs and a total of 800 DDHs are generated in the end. Afterward, we render\nthe video sequences of the distorted DDHs as the evaluation media and carry out\na well-controlled subjective experiment. Then a benchmark experiment is\nconducted with the state-of-the-art video quality assessment (VQA) methods and\nthe experimental results show that existing VQA methods are limited in\nassessing the perceptual loss of DDHs.\n","authors":["Zicheng Zhang","Yingjie Zhou","Wei Sun","Wei Lu","Xiongkuo Min","Yu Wang","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2212.12734v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14383v1","updated":"2023-08-28T07:56:13Z","published":"2023-08-28T07:56:13Z","title":"Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a\n Light-Weight ToF Sensor","summary":" Light-weight time-of-flight (ToF) depth sensors are compact and\ncost-efficient, and thus widely used on mobile devices for tasks such as\nautofocus and obstacle detection. However, due to the sparse and noisy depth\nmeasurements, these sensors have rarely been considered for dense geometry\nreconstruction. In this work, we present the first dense SLAM system with a\nmonocular camera and a light-weight ToF sensor. Specifically, we propose a\nmulti-modal implicit scene representation that supports rendering both the\nsignals from the RGB camera and light-weight ToF sensor which drives the\noptimization by comparing with the raw sensor inputs. Moreover, in order to\nguarantee successful pose tracking and reconstruction, we exploit a predicted\ndepth as an intermediate supervision and develop a coarse-to-fine optimization\nstrategy for efficient learning of the implicit representation. At last, the\ntemporal information is explicitly exploited to deal with the noisy signals\nfrom light-weight ToF sensors to improve the accuracy and robustness of the\nsystem. Experiments demonstrate that our system well exploits the signals of\nlight-weight ToF sensors and achieves competitive results both on camera\ntracking and dense scene reconstruction. Project page:\n\\url{https://zju3dv.github.io/tof_slam/}.\n","authors":["Xinyang Liu","Yijin Li","Yanbin Teng","Hujun Bao","Guofeng Zhang","Yinda Zhang","Zhaopeng Cui"],"pdf_url":"https://arxiv.org/pdf/2308.14383v1.pdf","comment":"Accepted to ICCV 2023 (Oral). Project Page:\n https://zju3dv.github.io/tof_slam/"},{"id":"http://arxiv.org/abs/2308.14378v1","updated":"2023-08-28T07:50:04Z","published":"2023-08-28T07:50:04Z","title":"GKGNet: Group K-Nearest Neighbor based Graph Convolutional Network for\n Multi-Label Image Recognition","summary":" Multi-Label Image Recognition (MLIR) is a challenging task that aims to\npredict multiple object labels in a single image while modeling the complex\nrelationships between labels and image regions. Although convolutional neural\nnetworks and vision transformers have succeeded in processing images as regular\ngrids of pixels or patches, these representations are sub-optimal for capturing\nirregular and discontinuous regions of interest. In this work, we present the\nfirst fully graph convolutional model, Group K-nearest neighbor based Graph\nconvolutional Network (GKGNet), which models the connections between semantic\nlabel embeddings and image patches in a flexible and unified graph structure.\nTo address the scale variance of different objects and to capture information\nfrom multiple perspectives, we propose the Group KGCN module for dynamic graph\nconstruction and message passing. Our experiments demonstrate that GKGNet\nachieves state-of-the-art performance with significantly lower computational\ncosts on the challenging multi-label datasets, \\ie MS-COCO and VOC2007\ndatasets. We will release the code and models to facilitate future research in\nthis area.\n","authors":["Ruijie Yao","Sheng Jin","Lumin Xu","Wang Zeng","Wentao Liu","Chen Qian","Ping Luo","Ji Wu"],"pdf_url":"https://arxiv.org/pdf/2308.14378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14371v1","updated":"2023-08-28T07:40:48Z","published":"2023-08-28T07:40:48Z","title":"SuperUDF: Self-supervised UDF Estimation for Surface Reconstruction","summary":" Learning-based surface reconstruction based on unsigned distance functions\n(UDF) has many advantages such as handling open surfaces. We propose SuperUDF,\na self-supervised UDF learning which exploits a learned geometry prior for\nefficient training and a novel regularization for robustness to sparse\nsampling. The core idea of SuperUDF draws inspiration from the classical\nsurface approximation operator of locally optimal projection (LOP). The key\ninsight is that if the UDF is estimated correctly, the 3D points should be\nlocally projected onto the underlying surface following the gradient of the\nUDF. Based on that, a number of inductive biases on UDF geometry and a\npre-learned geometry prior are devised to learn UDF estimation efficiently. A\nnovel regularization loss is proposed to make SuperUDF robust to sparse\nsampling. Furthermore, we also contribute a learning-based mesh extraction from\nthe estimated UDFs. Extensive evaluations demonstrate that SuperUDF outperforms\nthe state of the arts on several public datasets in terms of both quality and\nefficiency. Code will be released after accteptance.\n","authors":["Hui Tian","Chenyang Zhu","Yifei Shi","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14369v1","updated":"2023-08-28T07:35:21Z","published":"2023-08-28T07:35:21Z","title":"Improving Lesion Volume Measurements on Digital Mammograms","summary":" Lesion volume is an important predictor for prognosis in breast cancer. We\nmake a step towards a more accurate lesion volume measurement on digital\nmammograms by developing a model that allows to estimate lesion volumes on\nprocessed mammograms, which are the images routinely used by radiologists in\nclinical practice as well as in breast cancer screening and are available in\nmedical centers. Processed mammograms are obtained from raw mammograms, which\nare the X-ray data coming directly from the scanner, by applying certain\nvendor-specific non-linear transformations. At the core of our volume\nestimation method is a physics-based algorithm for measuring lesion volumes on\nraw mammograms. We subsequently extend this algorithm to processed mammograms\nvia a deep learning image-to-image translation model that produces synthetic\nraw mammograms from processed mammograms in a multi-vendor setting. We assess\nthe reliability and validity of our method using a dataset of 1778 mammograms\nwith an annotated mass. Firstly, we investigate the correlations between lesion\nvolumes computed from mediolateral oblique and craniocaudal views, with a\nresulting Pearson correlation of 0.93 [95% confidence interval (CI) 0.92 -\n0.93]. Secondly, we compare the resulting lesion volumes from true and\nsynthetic raw data, with a resulting Pearson correlation of 0.998 [95% CI 0.998\n- 0.998] . Finally, for a subset of 100 mammograms with a malign mass and\nconcurrent MRI examination available, we analyze the agreement between lesion\nvolume on mammography and MRI, resulting in an intraclass correlation\ncoefficient of 0.81 [95% CI 0.73 - 0.87] for consistency and 0.78 [95% CI 0.66\n- 0.86] for absolute agreement. In conclusion, we developed an algorithm to\nmeasure mammographic lesion volume that reached excellent reliability and good\nvalidity, when using MRI as ground truth.\n","authors":["Nikita Moriakov","Jim Peters","Ritse Mann","Nico Karssemeijer","Jos van Dijck","Mireille Broeders","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2308.14369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07011v4","updated":"2023-08-28T07:29:03Z","published":"2023-05-11T17:53:29Z","title":"Region-Aware Pretraining for Open-Vocabulary Object Detection with\n Vision Transformers","summary":" We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a\ncontrastive image-text pretraining recipe to bridge the gap between image-level\npretraining and open-vocabulary object detection. At the pretraining phase, we\npropose to randomly crop and resize regions of positional embeddings instead of\nusing the whole image positional embeddings. This better matches the use of\npositional embeddings at region-level in the detection finetuning phase. In\naddition, we replace the common softmax cross entropy loss in contrastive\nlearning with focal loss to better learn the informative yet difficult\nexamples. Finally, we leverage recent advances in novel object proposals to\nimprove open-vocabulary detection finetuning. We evaluate our full model on the\nLVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer.\nRO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best\nexisting approach by +7.8 points in addition to competitive zero-shot transfer\ndetection. Surprisingly, RO-ViT improves the image-level representation as well\nand achieves the state of the art on 9 out of 12 metrics on COCO and Flickr\nimage-text retrieval benchmarks, outperforming competitive approaches with\nlarger models.\n","authors":["Dahun Kim","Anelia Angelova","Weicheng Kuo"],"pdf_url":"https://arxiv.org/pdf/2305.07011v4.pdf","comment":"CVPR 2023 Highlight - https://github.com/mcahny/rovit ; adds LAION-2B\n result"},{"id":"http://arxiv.org/abs/2302.06039v2","updated":"2023-08-28T07:19:48Z","published":"2023-02-13T00:46:34Z","title":"Predicting Class Distribution Shift for Reliable Domain Adaptive Object\n Detection","summary":" Unsupervised Domain Adaptive Object Detection (UDA-OD) uses unlabelled data\nto improve the reliability of robotic vision systems in open-world\nenvironments. Previous approaches to UDA-OD based on self-training have been\neffective in overcoming changes in the general appearance of images. However,\nshifts in a robot's deployment environment can also impact the likelihood that\ndifferent objects will occur, termed class distribution shift. Motivated by\nthis, we propose a framework for explicitly addressing class distribution shift\nto improve pseudo-label reliability in self-training. Our approach uses the\ndomain invariance and contextual understanding of a pre-trained joint vision\nand language model to predict the class distribution of unlabelled data. By\naligning the class distribution of pseudo-labels with this prediction, we\nprovide weak supervision of pseudo-label accuracy. To further account for low\nquality pseudo-labels early in self-training, we propose an approach to\ndynamically adjust the number of pseudo-labels per image based on model\nconfidence. Our method outperforms state-of-the-art approaches on several\nbenchmarks, including a 4.7 mAP improvement when facing challenging class\ndistribution shift.\n","authors":["Nicolas Harvey Chapman","Feras Dayoub","Will Browne","Christopher Lehnert"],"pdf_url":"https://arxiv.org/pdf/2302.06039v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10425v2","updated":"2023-08-28T07:06:35Z","published":"2023-02-21T03:34:15Z","title":"Instance-incremental Scene Graph Generation from Real-world Point Clouds\n via Normalizing Flows","summary":" This work introduces a new task of instance-incremental scene graph\ngeneration: Given a scene of the point cloud, representing it as a graph and\nautomatically increasing novel instances. A graph denoting the object layout of\nthe scene is finally generated. It is an important task since it helps to guide\nthe insertion of novel 3D objects into a real-world scene in vision-based\napplications like augmented reality. It is also challenging because the\ncomplexity of the real-world point cloud brings difficulties in learning object\nlayout experiences from the observation data (non-empty rooms with labeled\nsemantics). We model this task as a conditional generation problem and propose\na 3D autoregressive framework based on normalizing flows (3D-ANF) to address\nit. First, we represent the point cloud as a graph by extracting the label\nsemantics and contextual relationships. Next, a model based on normalizing\nflows is introduced to map the conditional generation of graphic elements into\nthe Gaussian process. The mapping is invertible. Thus, the real-world\nexperiences represented in the observation data can be modeled in the training\nphase, and novel instances can be autoregressively generated based on the\nGaussian process in the testing phase. To evaluate the performance of our\nmethod sufficiently, we implement this new task on the indoor benchmark dataset\n3DSSG-O27R16 and our newly proposed graphical dataset of outdoor scenes GPL3D.\nExperiments show that our method generates reliable novel graphs from the\nreal-world point cloud and achieves state-of-the-art performance on the\ndatasets.\n","authors":["Chao Qi","Jianqin Yin","Jinghang Xu","Pengxiang Ding"],"pdf_url":"https://arxiv.org/pdf/2302.10425v2.pdf","comment":"Accepted by IEEE TCSVT. The supplementary material is available in\n the media column of the journal version of the article"},{"id":"http://arxiv.org/abs/2308.14334v1","updated":"2023-08-28T06:25:40Z","published":"2023-08-28T06:25:40Z","title":"MetaWeather: Few-Shot Weather-Degraded Image Restoration via Degradation\n Pattern Matching","summary":" Real-world vision tasks frequently suffer from the appearance of adverse\nweather conditions including rain, fog, snow, and raindrops in captured images.\nRecently, several generic methods for restoring weather-degraded images have\nbeen proposed, aiming to remove multiple types of adverse weather effects\npresent in the images. However, these methods have considered weather as\ndiscrete and mutually exclusive variables, leading to failure in generalizing\nto unforeseen weather conditions beyond the scope of the training data, such as\nthe co-occurrence of rain, fog, and raindrops. To this end, weather-degraded\nimage restoration models should have flexible adaptability to the current\nunknown weather condition to ensure reliable and optimal performance. The\nadaptation method should also be able to cope with data scarcity for real-world\nadaptation. This paper proposes MetaWeather, a few-shot weather-degraded image\nrestoration method for arbitrary weather conditions. For this, we devise the\ncore piece of MetaWeather, coined Degradation Pattern Matching Module (DPMM),\nwhich leverages representations from a few-shot support set by matching\nfeatures between input and sample images under new weather conditions. In\naddition, we build meta-knowledge with episodic meta-learning on top of our\nMetaWeather architecture to provide flexible adaptability. In the meta-testing\nphase, we adopt a parameter-efficient fine-tuning method to preserve the\nprebuilt knowledge and avoid the overfitting problem. Experiments on the BID\nTask II.A dataset show our method achieves the best performance on PSNR and\nSSIM compared to state-of-the-art image restoration methods. Code is available\nat (TBA).\n","authors":["Youngrae Kim","Younggeol Cho","Thanh-Tung Nguyen","Dongman Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14334v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.14332v1","updated":"2023-08-28T06:22:10Z","published":"2023-08-28T06:22:10Z","title":"Attention-Guided Lidar Segmentation and Odometry Using Image-to-Point\n Cloud Saliency Transfer","summary":" LiDAR odometry estimation and 3D semantic segmentation are crucial for\nautonomous driving, which has achieved remarkable advances recently. However,\nthese tasks are challenging due to the imbalance of points in different\nsemantic categories for 3D semantic segmentation and the influence of dynamic\nobjects for LiDAR odometry estimation, which increases the importance of using\nrepresentative/salient landmarks as reference points for robust feature\nlearning. To address these challenges, we propose a saliency-guided approach\nthat leverages attention information to improve the performance of LiDAR\nodometry estimation and semantic segmentation models. Unlike in the image\ndomain, only a few studies have addressed point cloud saliency information due\nto the lack of annotated training data. To alleviate this, we first present a\nuniversal framework to transfer saliency distribution knowledge from color\nimages to point clouds, and use this to construct a pseudo-saliency dataset\n(i.e. FordSaliency) for point clouds. Then, we adopt point cloud-based\nbackbones to learn saliency distribution from pseudo-saliency labels, which is\nfollowed by our proposed SalLiDAR module. SalLiDAR is a saliency-guided 3D\nsemantic segmentation model that integrates saliency information to improve\nsegmentation performance. Finally, we introduce SalLONet, a self-supervised\nsaliency-guided LiDAR odometry network that uses the semantic and saliency\npredictions of SalLiDAR to achieve better odometry estimation. Our extensive\nexperiments on benchmark datasets demonstrate that the proposed SalLiDAR and\nSalLONet models achieve state-of-the-art performance against existing methods,\nhighlighting the effectiveness of image-to-LiDAR saliency knowledge transfer.\nSource code will be available at https://github.com/nevrez/SalLONet.\n","authors":["Guanqun Ding","Nevrez Imamoglu","Ali Caglayan","Masahiro Murakawa","Ryosuke Nakamura"],"pdf_url":"https://arxiv.org/pdf/2308.14332v1.pdf","comment":"33 pages, 12 Figures, 6 Tables"},{"id":"http://arxiv.org/abs/2308.14324v1","updated":"2023-08-28T06:09:25Z","published":"2023-08-28T06:09:25Z","title":"CPFES: Physical Fitness Evaluation Based on Canadian Agility and\n Movement Skill Assessment","summary":" In recent years, the assessment of fundamental movement skills integrated\nwith physical education has focused on both teaching practice and the\nfeasibility of assessment. The object of assessment has shifted from multiple\nages to subdivided ages, while the content of assessment has changed from\ncomplex and time-consuming to concise and efficient. Therefore, we apply deep\nlearning to physical fitness evaluation, we propose a system based on the\nCanadian Agility and Movement Skill Assessment (CAMSA) Physical Fitness\nEvaluation System (CPFES), which evaluates children's physical fitness based on\nCAMSA, and gives recommendations based on the scores obtained by CPFES to help\nchildren grow. We have designed a landmark detection module and a pose\nestimation module, and we have also designed a pose evaluation module for the\nCAMSA criteria that can effectively evaluate the actions of the child being\ntested. Our experimental results demonstrate the high accuracy of the proposed\nsystem.\n","authors":["Pengcheng Dong","Xiaojin Mao","Lixia Fan","Wenbo Wan","Jiande Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14322v1","updated":"2023-08-28T06:05:23Z","published":"2023-08-28T06:05:23Z","title":"Machine Unlearning Methodology base on Stochastic Teacher Network","summary":" The rise of the phenomenon of the \"right to be forgotten\" has prompted\nresearch on machine unlearning, which grants data owners the right to actively\nwithdraw data that has been used for model training, and requires the\nelimination of the contribution of that data to the model. A simple method to\nachieve this is to use the remaining data to retrain the model, but this is not\nacceptable for other data owners who continue to participate in training.\nExisting machine unlearning methods have been found to be ineffective in\nquickly removing knowledge from deep learning models. This paper proposes using\na stochastic network as a teacher to expedite the mitigation of the influence\ncaused by forgotten data on the model. We performed experiments on three\ndatasets, and the findings demonstrate that our approach can efficiently\nmitigate the influence of target data on the model within a single epoch. This\nallows for one-time erasure and reconstruction of the model, and the\nreconstruction model achieves the same performance as the retrained model.\n","authors":["Xulong Zhang","Jianzong Wang","Ning Cheng","Yifu Sun","Chuanyao Zhang","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.14322v1.pdf","comment":"Accepted by 19th International Conference on Advanced Data Mining and\n Applications. (ADMA 2023)"},{"id":"http://arxiv.org/abs/2304.02163v2","updated":"2023-08-28T06:03:39Z","published":"2023-04-04T23:41:20Z","title":"GINA-3D: Learning to Generate Implicit Neural Assets in the Wild","summary":" Modeling the 3D world from sensor data for simulation is a scalable way of\ndeveloping testing and validation environments for robotic learning problems\nsuch as autonomous driving. However, manually creating or re-creating\nreal-world-like environments is difficult, expensive, and not scalable. Recent\ngenerative model techniques have shown promising progress to address such\nchallenges by learning 3D assets using only plentiful 2D images -- but still\nsuffer limitations as they leverage either human-curated image datasets or\nrenderings from manually-created synthetic 3D environments. In this paper, we\nintroduce GINA-3D, a generative model that uses real-world driving data from\ncamera and LiDAR sensors to create realistic 3D implicit neural assets of\ndiverse vehicles and pedestrians. Compared to the existing image datasets, the\nreal-world driving setting poses new challenges due to occlusions,\nlighting-variations and long-tail distributions. GINA-3D tackles these\nchallenges by decoupling representation learning and generative modeling into\ntwo stages with a learned tri-plane latent structure, inspired by recent\nadvances in generative modeling of images. To evaluate our approach, we\nconstruct a large-scale object-centric dataset containing over 1.2M images of\nvehicles and pedestrians from the Waymo Open Dataset, and a new set of 80K\nimages of long-tail instances such as construction equipment, garbage trucks,\nand cable cars. We compare our model with existing approaches and demonstrate\nthat it achieves state-of-the-art performance in quality and diversity for both\ngenerated images and geometries.\n","authors":["Bokui Shen","Xinchen Yan","Charles R. Qi","Mahyar Najibi","Boyang Deng","Leonidas Guibas","Yin Zhou","Dragomir Anguelov"],"pdf_url":"https://arxiv.org/pdf/2304.02163v2.pdf","comment":"Accepted by CVPR 2023; Our WOD-ObjectAsset can be accessed through\n waymo.com/open"},{"id":"http://arxiv.org/abs/2211.03989v3","updated":"2023-08-28T05:56:47Z","published":"2022-11-08T04:00:23Z","title":"$BT^2$: Backward-compatible Training with Basis Transformation","summary":" Modern retrieval system often requires recomputing the representation of\nevery piece of data in the gallery when updating to a better representation\nmodel. This process is known as backfilling and can be especially costly in the\nreal world where the gallery often contains billions of samples. Recently,\nresearchers have proposed the idea of Backward Compatible Training (BCT) where\nthe new representation model can be trained with an auxiliary loss to make it\nbackward compatible with the old representation. In this way, the new\nrepresentation can be directly compared with the old representation, in\nprinciple avoiding the need for any backfilling. However, followup work shows\nthat there is an inherent tradeoff where a backward compatible representation\nmodel cannot simultaneously maintain the performance of the new model itself.\nThis paper reports our ``not-so-surprising'' finding that adding extra\ndimensions to the representation can help here. However, we also found that\nnaively increasing the dimension of the representation did not work. To deal\nwith this, we propose Backward-compatible Training with a novel Basis\nTransformation ($BT^2$). A basis transformation (BT) is basically a learnable\nset of parameters that applies an orthonormal transformation. Such a\ntransformation possesses an important property whereby the original information\ncontained in its input is retained in its output. We show in this paper how a\nBT can be utilized to add only the necessary amount of additional dimensions.\nWe empirically verify the advantage of $BT^2$ over other state-of-the-art\nmethods in a wide range of settings. We then further extend $BT^2$ to other\nchallenging yet more practical settings, including significant change in model\narchitecture (CNN to Transformers), modality change, and even a series of\nupdates in the model architecture mimicking the evolution of deep learning\nmodels.\n","authors":["Yifei Zhou","Zilu Li","Abhinav Shrivastava","Hengshuang Zhao","Antonio Torralba","Taipeng Tian","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2211.03989v3.pdf","comment":"iccv2023 camera ready"},{"id":"http://arxiv.org/abs/2308.14316v1","updated":"2023-08-28T05:38:43Z","published":"2023-08-28T05:38:43Z","title":"UniPT: Universal Parallel Tuning for Transfer Learning with Efficient\n Parameter and Memory","summary":" Fine-tuning pre-trained models has emerged as a powerful technique in\nnumerous domains, owing to its ability to leverage enormous pre-existing\nknowledge and achieve remarkable performance on downstream tasks. However,\nupdating the parameters of entire networks is computationally intensive.\nAlthough state-of-the-art parameter-efficient transfer learning (PETL) methods\nsignificantly reduce the trainable parameters and storage demand, almost all of\nthem still need to back-propagate the gradients through large pre-trained\nnetworks. This memory-extensive characteristic extremely limits the\napplicability of PETL methods in real-world scenarios. To this end, we propose\na new memory-efficient PETL strategy, dubbed Universal Parallel Tuning (UniPT).\nSpecifically, we facilitate the transfer process via a lightweight learnable\nparallel network, which consists of two modules: 1) A parallel interaction\nmodule that decouples the inherently sequential connections and processes the\nintermediate activations detachedly of the pre-trained network. 2) A confidence\naggregation module that learns optimal strategies adaptively for integrating\ncross-layer features. We evaluate UniPT with different backbones (e.g.,\nVSE$\\infty$, CLIP4Clip, Clip-ViL, and MDETR) on five challenging\nvision-and-language tasks (i.e., image-text retrieval, video-text retrieval,\nvisual question answering, compositional question answering, and visual\ngrounding). Extensive ablations on ten datasets have validated that our UniPT\ncan not only dramatically reduce memory consumption and outperform the best\nmemory-efficient competitor, but also achieve higher performance than existing\nPETL methods in a low-memory scenario on different architectures. Our code is\npublicly available at: https://github.com/Paranioar/UniPT.\n","authors":["Haiwen Diao","Bo Wan","Ying Zhang","Xu Jia","Huchuan Lu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14316v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.14312v1","updated":"2023-08-28T05:29:59Z","published":"2023-08-28T05:29:59Z","title":"Local-Global Pseudo-label Correction for Source-free Domain Adaptive\n Medical Image Segmentation","summary":" Domain shift is a commonly encountered issue in medical imaging solutions,\nprimarily caused by variations in imaging devices and data sources. To mitigate\nthis problem, unsupervised domain adaptation techniques have been employed.\nHowever, concerns regarding patient privacy and potential degradation of image\nquality have led to an increased focus on source-free domain adaptation. In\nthis study, we address the issue of false labels in self-training based\nsource-free domain adaptive medical image segmentation methods. To correct\nerroneous pseudo-labels, we propose a novel approach called the local-global\npseudo-label correction (LGDA) method for source-free domain adaptive medical\nimage segmentation. Our method consists of two components: An offline local\ncontext-based pseudo-label correction method that utilizes local context\nsimilarity in image space. And an online global pseudo-label correction method\nbased on class prototypes, which corrects erroneously predicted pseudo-labels\nby considering the relative distance between pixel-wise feature vectors and\nprototype vectors. We evaluate the performance of our method on three benchmark\nfundus image datasets for optic disc and cup segmentation. Our method achieves\nsuperior performance compared to the state-of-the-art approaches, even without\nusing of any source data.\n","authors":["Yanyu Ye","Zhengxi Zhang","Chunna Tianb","Wei wei"],"pdf_url":"https://arxiv.org/pdf/2308.14312v1.pdf","comment":"30 pages,7 figures"},{"id":"http://arxiv.org/abs/2202.13799v3","updated":"2023-08-28T04:52:53Z","published":"2022-02-28T13:48:41Z","title":"One-shot Ultra-high-Resolution Generative Adversarial Network That\n Synthesizes 16K Images On A Single GPU","summary":" We propose a one-shot ultra-high-resolution generative adversarial network\n(OUR-GAN) framework that generates non-repetitive 16K (16, 384 x 8, 640) images\nfrom a single training image and is trainable on a single consumer GPU. OUR-GAN\ngenerates an initial image that is visually plausible and varied in shape at\nlow resolution, and then gradually increases the resolution by adding detail\nthrough super-resolution. Since OUR-GAN learns from a real\nultra-high-resolution (UHR) image, it can synthesize large shapes with fine\ndetails and long-range coherence, which is difficult to achieve with\nconventional generative models that rely on the patch distribution learned from\nrelatively small images. OUR-GAN can synthesize high-quality 16K images with\n12.5 GB of GPU memory and 4K images with only 4.29 GB as it synthesizes a UHR\nimage part by part through seamless subregion-wise super-resolution.\nAdditionally, OUR-GAN improves visual coherence while maintaining diversity by\napplying vertical positional convolution. In experiments on the ST4K and RAISE\ndatasets, OUR-GAN exhibited improved fidelity, visual coherency, and diversity\ncompared with the baseline one-shot synthesis models. To the best of our\nknowledge, OUR-GAN is the first one-shot image synthesizer that generates\nnon-repetitive UHR images on a single consumer GPU. The synthesized image\nsamples are presented at https://our-gan.github.io.\n","authors":["Junseok Oh","Donghwee Yoon","Injung Kim"],"pdf_url":"https://arxiv.org/pdf/2202.13799v3.pdf","comment":"36 pages, 26 figures"},{"id":"http://arxiv.org/abs/2303.12091v2","updated":"2023-08-28T04:50:57Z","published":"2023-03-21T09:07:15Z","title":"Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised\n Learning","summary":" Semi-supervised learning (SSL) methods assume that labeled data, unlabeled\ndata and test data are from the same distribution. Open-set semi-supervised\nlearning (Open-set SSL) considers a more practical scenario, where unlabeled\ndata and test data contain new categories (outliers) not observed in labeled\ndata (inliers). Most previous works focused on outlier detection via binary\nclassifiers, which suffer from insufficient scalability and inability to\ndistinguish different types of uncertainty. In this paper, we propose a novel\nframework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these\nlimitations. Concretely, we first introduce evidential deep learning (EDL) as\nan outlier detector to quantify different types of uncertainty, and design\ndifferent uncertainty metrics for self-training and inference. Furthermore, we\npropose a novel adaptive negative optimization strategy, making EDL more\ntailored to the unlabeled dataset containing both inliers and outliers. As\ndemonstrated empirically, our proposed method outperforms existing\nstate-of-the-art methods across four datasets.\n","authors":["Yang Yu","Danruo Deng","Furui Liu","Yueming Jin","Qi Dou","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2303.12091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15699v2","updated":"2023-08-28T04:46:01Z","published":"2023-03-28T03:05:25Z","title":"Enhancing Breast Cancer Risk Prediction by Incorporating Prior Images","summary":" Recently, deep learning models have shown the potential to predict breast\ncancer risk and enable targeted screening strategies, but current models do not\nconsider the change in the breast over time. In this paper, we present a new\nmethod, PRIME+, for breast cancer risk prediction that leverages prior\nmammograms using a transformer decoder, outperforming a state-of-the-art risk\nprediction method that only uses mammograms from a single time point. We\nvalidate our approach on a dataset with 16,113 exams and further demonstrate\nthat it effectively captures patterns of changes from prior mammograms, such as\nchanges in breast density, resulting in improved short-term and long-term\nbreast cancer risk prediction. Experimental results show that our model\nachieves a statistically significant improvement in performance over the\nstate-of-the-art based model, with a C-index increase from 0.68 to 0.73 (p <\n0.05) on held-out test sets.\n","authors":["Hyeonsoo Lee","Junha Kim","Eunkyung Park","Minjeong Kim","Taesoo Kim","Thijs Kooi"],"pdf_url":"https://arxiv.org/pdf/2303.15699v2.pdf","comment":"MICCAI 2023 accepted"},{"id":"http://arxiv.org/abs/2304.12666v2","updated":"2023-08-28T04:43:57Z","published":"2023-04-25T09:12:37Z","title":"Bayesian Optimization Meets Self-Distillation","summary":" Bayesian optimization (BO) has contributed greatly to improving model\nperformance by suggesting promising hyperparameter configurations iteratively\nbased on observations from multiple training trials. However, only partial\nknowledge (i.e., the measured performances of trained models and their\nhyperparameter configurations) from previous trials is transferred. On the\nother hand, Self-Distillation (SD) only transfers partial knowledge learned by\nthe task model itself. To fully leverage the various knowledge gained from all\ntraining trials, we propose the BOSS framework, which combines BO and SD. BOSS\nsuggests promising hyperparameter configurations through BO and carefully\nselects pre-trained models from previous trials for SD, which are otherwise\nabandoned in the conventional BO process. BOSS achieves significantly better\nperformance than both BO and SD in a wide range of tasks including general\nimage classification, learning with noisy labels, semi-supervised learning, and\nmedical image analysis tasks.\n","authors":["HyunJae Lee","Heon Song","Hyeonsoo Lee","Gi-hyeon Lee","Suyeong Park","Donggeun Yoo"],"pdf_url":"https://arxiv.org/pdf/2304.12666v2.pdf","comment":"ICCV 2023 accepted"},{"id":"http://arxiv.org/abs/2308.14298v1","updated":"2023-08-28T04:34:50Z","published":"2023-08-28T04:34:50Z","title":"Direct initial orbit determination","summary":" Initial orbit determination (IOD) is an important early step in the\nprocessing chain that makes sense of and reconciles the multiple optical\nobservations of a resident space object. IOD methods generally operate on\nline-of-sight (LOS) vectors extracted from images of the object, hence the LOS\nvectors can be seen as discrete point samples of the raw optical measurements.\nTypically, the number of LOS vectors used by an IOD method is much smaller than\nthe available measurements (\\ie, the set of pixel intensity values), hence\ncurrent IOD methods arguably under-utilize the rich information present in the\ndata. In this paper, we propose a \\emph{direct} IOD method called D-IOD that\nfits the orbital parameters directly on the observed streak images, without\nrequiring LOS extraction. Since it does not utilize LOS vectors, D-IOD avoids\npotential inaccuracies or errors due to an imperfect LOS extraction step. Two\ninnovations underpin our novel orbit-fitting paradigm: first, we introduce a\nnovel non-linear least-squares objective function that computes the loss\nbetween the candidate-orbit-generated streak images and the observed streak\nimages. Second, the objective function is minimized with a gradient descent\napproach that is embedded in our proposed optimization strategies designed for\nstreak images. We demonstrate the effectiveness of D-IOD on a variety of\nsimulated scenarios and challenging real streak images.\n","authors":["Chee-Kheng Chng","Trent Jansen-Sturgeon","Timothy Payne","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2308.14298v1.pdf","comment":"28 pages, 17 figures, Submitted to Advances in Space Research"},{"id":"http://arxiv.org/abs/2308.06725v2","updated":"2023-08-28T04:27:35Z","published":"2023-08-13T09:05:56Z","title":"CLE Diffusion: Controllable Light Enhancement Diffusion Model","summary":" Low light enhancement has gained increasing importance with the rapid\ndevelopment of visual creation and editing. However, most existing enhancement\nalgorithms are designed to homogeneously increase the brightness of images to a\npre-defined extent, limiting the user experience. To address this issue, we\npropose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a\nnovel diffusion framework to provide users with rich controllability. Built\nwith a conditional diffusion model, we introduce an illumination embedding to\nlet users control their desired brightness level. Additionally, we incorporate\nthe Segment-Anything Model (SAM) to enable user-friendly region\ncontrollability, where users can click on objects to specify the regions they\nwish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves\ncompetitive performance regarding quantitative metrics, qualitative results,\nand versatile controllability. Project page:\nhttps://yuyangyin.github.io/CLEDiffusion/\n","authors":["Yuyang Yin","Dejia Xu","Chuangchuang Tan","Ping Liu","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2308.06725v2.pdf","comment":"Accepted In Proceedings of the 31st ACM International Conference on\n Multimedia (MM' 23)"},{"id":"http://arxiv.org/abs/2307.05016v2","updated":"2023-08-28T04:05:15Z","published":"2023-07-11T05:32:21Z","title":"TRansPose: Large-Scale Multispectral Dataset for Transparent Object","summary":" Transparent objects are encountered frequently in our daily lives, yet\nrecognizing them poses challenges for conventional vision sensors due to their\nunique material properties, not being well perceived from RGB or depth cameras.\nOvercoming this limitation, thermal infrared cameras have emerged as a\nsolution, offering improved visibility and shape information for transparent\nobjects. In this paper, we present TRansPose, the first large-scale\nmultispectral dataset that combines stereo RGB-D, thermal infrared (TIR)\nimages, and object poses to promote transparent object research. The dataset\nincludes 99 transparent objects, encompassing 43 household items, 27 recyclable\ntrashes, 29 chemical laboratory equivalents, and 12 non-transparent objects. It\ncomprises a vast collection of 333,819 images and 4,000,056 annotations,\nproviding instance-level segmentation masks, ground-truth poses, and completed\ndepth information. The data was acquired using a FLIR A65 thermal infrared\n(TIR) camera, two Intel RealSense L515 RGB-D cameras, and a Franka Emika Panda\nrobot manipulator. Spanning 87 sequences, TRansPose covers various challenging\nreal-life scenarios, including objects filled with water, diverse lighting\nconditions, heavy clutter, non-transparent or translucent containers, objects\nin plastic bags, and multi-stacked objects. TRansPose dataset can be accessed\nfrom the following link: https://sites.google.com/view/transpose-dataset\n","authors":["Jeongyun Kim","Myung-Hwan Jeon","Sangwoo Jung","Wooseong Yang","Minwoo Jung","Jaeho Shin","Ayoung Kim"],"pdf_url":"https://arxiv.org/pdf/2307.05016v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.14286v1","updated":"2023-08-28T03:57:37Z","published":"2023-08-28T03:57:37Z","title":"Bridging Cross-task Protocol Inconsistency for Distillation in Dense\n Object Detection","summary":" Knowledge distillation (KD) has shown potential for learning compact models\nin dense object detection. However, the commonly used softmax-based\ndistillation ignores the absolute classification scores for individual\ncategories. Thus, the optimum of the distillation loss does not necessarily\nlead to the optimal student classification scores for dense object detectors.\nThis cross-task protocol inconsistency is critical, especially for dense object\ndetectors, since the foreground categories are extremely imbalanced. To address\nthe issue of protocol differences between distillation and classification, we\npropose a novel distillation method with cross-task consistent protocols,\ntailored for the dense object detection. For classification distillation, we\naddress the cross-task protocol inconsistency problem by formulating the\nclassification logit maps in both teacher and student models as multiple\nbinary-classification maps and applying a binary-classification distillation\nloss to each map. For localization distillation, we design an IoU-based\nLocalization Distillation Loss that is free from specific network structures\nand can be compared with existing localization distillation losses. Our\nproposed method is simple but effective, and experimental results demonstrate\nits superiority over existing methods. Code is available at\nhttps://github.com/TinyTigerPan/BCKD.\n","authors":["Longrong Yang","Xianpan Zhou","Xuewei Li","Liang Qiao","Zheyang Li","Ziwei Yang","Gaoang Wang","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2308.14286v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2209.15304v4","updated":"2023-08-28T03:16:50Z","published":"2022-09-30T08:23:26Z","title":"Hiding Visual Information via Obfuscating Adversarial Perturbations","summary":" Growing leakage and misuse of visual information raise security and privacy\nconcerns, which promotes the development of information protection. Existing\nadversarial perturbations-based methods mainly focus on the de-identification\nagainst deep learning models. However, the inherent visual information of the\ndata has not been well protected. In this work, inspired by the Type-I\nadversarial attack, we propose an adversarial visual information hiding method\nto protect the visual privacy of data. Specifically, the method generates\nobfuscating adversarial perturbations to obscure the visual information of the\ndata. Meanwhile, it maintains the hidden objectives to be correctly predicted\nby models. In addition, our method does not modify the parameters of the\napplied model, which makes it flexible for different scenarios. Experimental\nresults on the recognition and classification tasks demonstrate that the\nproposed method can effectively hide visual information and hardly affect the\nperformances of models. The code is available in the supplementary material.\n","authors":["Zhigang Su","Dawei Zhou","Nannan Wangu","Decheng Li","Zhen Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2209.15304v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.15663v3","updated":"2023-08-28T03:02:16Z","published":"2022-10-27T17:59:50Z","title":"Deep Generative Models on 3D Representations: A Survey","summary":" Generative models aim to learn the distribution of observed data by\ngenerating new instances. With the advent of neural networks, deep generative\nmodels, including variational autoencoders (VAEs), generative adversarial\nnetworks (GANs), and diffusion models (DMs), have progressed remarkably in\nsynthesizing 2D images. Recently, researchers started to shift focus from 2D to\n3D space, considering that 3D data is more closely aligned with our physical\nworld and holds immense practical potential. However, unlike 2D images, which\npossess an inherent and efficient representation (\\textit{i.e.}, a pixel grid),\nrepresenting 3D data poses significantly greater challenges. Ideally, a robust\n3D representation should be capable of accurately modeling complex shapes and\nappearances while being highly efficient in handling high-resolution data with\nhigh processing speeds and low memory requirements. Regrettably, existing 3D\nrepresentations, such as point clouds, meshes, and neural fields, often fail to\nsatisfy all of these requirements simultaneously. In this survey, we thoroughly\nreview the ongoing developments of 3D generative models, including methods that\nemploy 2D and 3D supervision. Our analysis centers on generative models, with a\nparticular focus on the representations utilized in this context. We believe\nour survey will help the community to track the field's evolution and to spark\ninnovative ideas to propel progress towards solving this challenging task.\n","authors":["Zifan Shi","Sida Peng","Yinghao Xu","Andreas Geiger","Yiyi Liao","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2210.15663v3.pdf","comment":"Github: https://github.com/justimyhxu/awesome-3D-generation"},{"id":"http://arxiv.org/abs/2211.01146v3","updated":"2023-08-28T02:59:24Z","published":"2022-11-02T14:22:50Z","title":"DynamicISP: Dynamically Controlled Image Signal Processor for Image\n Recognition","summary":" Image Signal Processors (ISPs) play important roles in image recognition\ntasks as well as in the perceptual quality of captured images. In most cases,\nexperts make a lot of effort to manually tune many parameters of ISPs, but the\nparameters are sub-optimal. In the literature, two types of techniques have\nbeen actively studied: a machine learning-based parameter tuning technique and\na DNN-based ISP technique. The former is lightweight but lacks expressive\npower. The latter has expressive power, but the computational cost is too heavy\non edge devices. To solve these problems, we propose \"DynamicISP,\" which\nconsists of multiple classical ISP functions and dynamically controls the\nparameters of each frame according to the recognition result of the previous\nframe. We show our method successfully controls the parameters of multiple ISP\nfunctions and achieves state-of-the-art accuracy with low computational cost in\nsingle and multi-category object detection tasks.\n","authors":["Masakazu Yoshimura","Junji Otsuka","Atsushi Irie","Takeshi Ohashi"],"pdf_url":"https://arxiv.org/pdf/2211.01146v3.pdf","comment":"Accepted to ICCV2023. Several updates from v2 including additional\n experiments and modification of typos in Auto Gain equation"},{"id":"http://arxiv.org/abs/2212.04636v3","updated":"2023-08-28T02:51:25Z","published":"2022-12-09T02:25:20Z","title":"Ego-Body Pose Estimation via Ego-Head Pose Estimation","summary":" Estimating 3D human motion from an egocentric video sequence plays a critical\nrole in human behavior understanding and has various applications in VR/AR.\nHowever, naively learning a mapping between egocentric videos and human motions\nis challenging, because the user's body is often unobserved by the front-facing\ncamera placed on the head of the user. In addition, collecting large-scale,\nhigh-quality datasets with paired egocentric videos and 3D human motions\nrequires accurate motion capture devices, which often limit the variety of\nscenes in the videos to lab-like environments. To eliminate the need for paired\negocentric video and human motions, we propose a new method, Ego-Body Pose\nEstimation via Ego-Head Pose Estimation (EgoEgo), which decomposes the problem\ninto two stages, connected by the head motion as an intermediate\nrepresentation. EgoEgo first integrates SLAM and a learning approach to\nestimate accurate head motion. Subsequently, leveraging the estimated head pose\nas input, EgoEgo utilizes conditional diffusion to generate multiple plausible\nfull-body motions. This disentanglement of head and body pose eliminates the\nneed for training datasets with paired egocentric videos and 3D human motion,\nenabling us to leverage large-scale egocentric video datasets and motion\ncapture datasets separately. Moreover, for systematic benchmarking, we develop\na synthetic dataset, AMASS-Replica-Ego-Syn (ARES), with paired egocentric\nvideos and human motion. On both ARES and real data, our EgoEgo model performs\nsignificantly better than the current state-of-the-art methods.\n","authors":["Jiaman Li","C. Karen Liu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2212.04636v3.pdf","comment":"CVPR 2023 (Award Candidate)"},{"id":"http://arxiv.org/abs/2308.14267v1","updated":"2023-08-28T02:49:07Z","published":"2023-08-28T02:49:07Z","title":"Unleash Model Potential: Bootstrapped Meta Self-supervised Learning","summary":" The long-term goal of machine learning is to learn general visual\nrepresentations from a small amount of data without supervision, mimicking\nthree advantages of human cognition: i) no need for labels, ii) robustness to\ndata scarcity, and iii) learning from experience. Self-supervised learning and\nmeta-learning are two promising techniques to achieve this goal, but they both\nonly partially capture the advantages and fail to address all the problems.\nSelf-supervised learning struggles to overcome the drawbacks of data scarcity,\nwhile ignoring prior knowledge that can facilitate learning and generalization.\nMeta-learning relies on supervised information and suffers from a bottleneck of\ninsufficient learning. To address these issues, we propose a novel Bootstrapped\nMeta Self-Supervised Learning (BMSSL) framework that aims to simulate the human\nlearning process. We first analyze the close relationship between meta-learning\nand self-supervised learning. Based on this insight, we reconstruct tasks to\nleverage the strengths of both paradigms, achieving advantages i and ii.\nMoreover, we employ a bi-level optimization framework that alternates between\nsolving specific tasks with a learned ability (first level) and improving this\nability (second level), attaining advantage iii. To fully harness its power, we\nintroduce a bootstrapped target based on meta-gradient to make the model its\nown teacher. We validate the effectiveness of our approach with comprehensive\ntheoretical and empirical study.\n","authors":["Jingyao Wang","Zeen Song","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.14267v1.pdf","comment":"submitted to NIPS"},{"id":"http://arxiv.org/abs/2304.12685v2","updated":"2023-08-28T02:23:05Z","published":"2023-04-25T09:39:30Z","title":"Exploring the Mutual Influence between Self-Supervised Single-Frame and\n Multi-Frame Depth Estimation","summary":" Although both self-supervised single-frame and multi-frame depth estimation\nmethods only require unlabeled monocular videos for training, the information\nthey leverage varies because single-frame methods mainly rely on\nappearance-based features while multi-frame methods focus on geometric cues.\nConsidering the complementary information of single-frame and multi-frame\nmethods, some works attempt to leverage single-frame depth to improve\nmulti-frame depth. However, these methods can neither exploit the difference\nbetween single-frame depth and multi-frame depth to improve multi-frame depth\nnor leverage multi-frame depth to optimize single-frame depth models. To fully\nutilize the mutual influence between single-frame and multi-frame methods, we\npropose a novel self-supervised training framework. Specifically, we first\nintroduce a pixel-wise adaptive depth sampling module guided by single-frame\ndepth to train the multi-frame model. Then, we leverage the minimum\nreprojection based distillation loss to transfer the knowledge from the\nmulti-frame depth network to the single-frame network to improve single-frame\ndepth. Finally, we regard the improved single-frame depth as a prior to further\nboost the performance of multi-frame depth estimation. Experimental results on\nthe KITTI and Cityscapes datasets show that our method outperforms existing\napproaches in the self-supervised monocular setting.\n","authors":["Jie Xiang","Yun Wang","Lifeng An","Haiyang Liu","Jian Liu"],"pdf_url":"https://arxiv.org/pdf/2304.12685v2.pdf","comment":"Accepted for publication in the IEEE Robotics and Automation Letters\n (RA-L). 8 pages, 3figures"},{"id":"http://arxiv.org/abs/2308.14256v1","updated":"2023-08-28T02:20:44Z","published":"2023-08-28T02:20:44Z","title":"FaceChain: A Playground for Identity-Preserving Portrait Generation","summary":" Recent advancement in personalized image generation have unveiled the\nintriguing capability of pre-trained text-to-image models on learning identity\ninformation from a collection of portrait images. However, existing solutions\ncan be vulnerable in producing truthful details, and usually suffer from\nseveral defects such as (i) The generated face exhibit its own unique\ncharacteristics, \\ie facial shape and facial feature positioning may not\nresemble key characteristics of the input, and (ii) The synthesized face may\ncontain warped, blurred or corrupted regions. In this paper, we present\nFaceChain, a personalized portrait generation framework that combines a series\nof customized image-generation model and a rich set of face-related perceptual\nunderstanding models (\\eg, face detection, deep face embedding extraction, and\nfacial attribute recognition), to tackle aforementioned challenges and to\ngenerate truthful personalized portraits, with only a handful of portrait\nimages as input. Concretely, we inject several SOTA face models into the\ngeneration procedure, achieving a more efficient label-tagging,\ndata-processing, and model post-processing compared to previous solutions, such\nas DreamBooth ~\\cite{ruiz2023dreambooth} , InstantBooth\n~\\cite{shi2023instantbooth} , or other LoRA-only approaches ~\\cite{hu2021lora}\n. Through the development of FaceChain, we have identified several potential\ndirections to accelerate development of Face/Human-Centric AIGC research and\napplication. We have designed FaceChain as a framework comprised of pluggable\ncomponents that can be easily adjusted to accommodate different styles and\npersonalized needs. We hope it can grow to serve the burgeoning needs from the\ncommunities. FaceChain is open-sourced under Apache-2.0 license at\n\\url{https://github.com/modelscope/facechain}.\n","authors":["Yang Liu","Cheng Yu","Lei Shang","Ziheng Wu","Xingjun Wang","Yuze Zhao","Lin Zhu","Chen Cheng","Weitao Chen","Chao Xu","Haoyu Xie","Yuan Yao","Wenmeng Zhou","Yingda Chen","Xuansong Xie","Baigui Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14256v1.pdf","comment":"This is an ongoing work that will be consistently refined and\n improved upon"},{"id":"http://arxiv.org/abs/2308.14244v1","updated":"2023-08-28T01:19:33Z","published":"2023-08-28T01:19:33Z","title":"HoloFusion: Towards Photo-realistic 3D Generative Modeling","summary":" Diffusion-based image generators can now produce high-quality and diverse\nsamples, but their success has yet to fully translate to 3D generation:\nexisting diffusion methods can either generate low-resolution but 3D consistent\noutputs, or detailed 2D views of 3D objects but with potential structural\ndefects and lacking view consistency or realism. We present HoloFusion, a\nmethod that combines the best of these approaches to produce high-fidelity,\nplausible, and diverse 3D samples while learning from a collection of\nmulti-view 2D images only. The method first generates coarse 3D samples using a\nvariant of the recently proposed HoloDiffusion generator. Then, it\nindependently renders and upsamples a large number of views of the coarse 3D\nmodel, super-resolves them to add detail, and distills those into a single,\nhigh-fidelity implicit 3D representation, which also ensures view consistency\nof the final renders. The super-resolution network is trained as an integral\npart of HoloFusion, end-to-end, and the final distillation uses a new sampling\nscheme to capture the space of super-resolved signals. We compare our method\nagainst existing baselines, including DreamFusion, Get3D, EG3D, and\nHoloDiffusion, and achieve, to the best of our knowledge, the most realistic\nresults on the challenging CO3Dv2 dataset.\n","authors":["Animesh Karnewar","Niloy J. Mitra","Andrea Vedaldi","David Novotny"],"pdf_url":"https://arxiv.org/pdf/2308.14244v1.pdf","comment":"ICCV 2023 conference; project page at:\n https://holodiffusion.github.io/holofusion"},{"id":"http://arxiv.org/abs/2305.19867v2","updated":"2023-08-28T23:47:07Z","published":"2023-05-31T14:04:11Z","title":"Unsupervised Anomaly Detection in Medical Images Using Masked Diffusion\n Model","summary":" It can be challenging to identify brain MRI anomalies using supervised\ndeep-learning techniques due to anatomical heterogeneity and the requirement\nfor pixel-level labeling. Unsupervised anomaly detection approaches provide an\nalternative solution by relying only on sample-level labels of healthy brains\nto generate a desired representation to identify abnormalities at the pixel\nlevel. Although, generative models are crucial for generating such anatomically\nconsistent representations of healthy brains, accurately generating the\nintricate anatomy of the human brain remains a challenge. In this study, we\npresent a method called masked-DDPM (mDPPM), which introduces masking-based\nregularization to reframe the generation task of diffusion models.\nSpecifically, we introduce Masked Image Modeling (MIM) and Masked Frequency\nModeling (MFM) in our self-supervised approach that enables models to learn\nvisual representations from unlabeled data. To the best of our knowledge, this\nis the first attempt to apply MFM in DPPM models for medical applications. We\nevaluate our approach on datasets containing tumors and numerous sclerosis\nlesions and exhibit the superior performance of our unsupervised method as\ncompared to the existing fully/weakly supervised baselines. Code is available\nat https://github.com/hasan1292/mDDPM.\n","authors":["Hasan Iqbal","Umar Khalid","Jing Hua","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2305.19867v2.pdf","comment":"Accepted in MICCAI 2023 Workshops"},{"id":"http://arxiv.org/abs/2308.14938v1","updated":"2023-08-28T23:33:07Z","published":"2023-08-28T23:33:07Z","title":"Entropy-based Guidance of Deep Neural Networks for Accelerated\n Convergence and Improved Performance","summary":" Neural networks have dramatically increased our capacity to learn from large,\nhigh-dimensional datasets across innumerable disciplines. However, their\ndecisions are not easily interpretable, their computational costs are high, and\nbuilding and training them are uncertain processes. To add structure to these\nefforts, we derive new mathematical results to efficiently measure the changes\nin entropy as fully-connected and convolutional neural networks process data,\nand introduce entropy-based loss terms. Experiments in image compression and\nimage classification on benchmark datasets demonstrate these losses guide\nneural networks to learn rich latent data representations in fewer dimensions,\nconverge in fewer training epochs, and achieve better test metrics.\n","authors":["Mackenzie J. Meni","Ryan T. White","Michael Mayo","Kevin Pilkiewicz"],"pdf_url":"https://arxiv.org/pdf/2308.14938v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.14936v1","updated":"2023-08-28T23:23:53Z","published":"2023-08-28T23:23:53Z","title":"Auto-Prompting SAM for Mobile Friendly 3D Medical Image Segmentation","summary":" The Segment Anything Model (SAM) has rapidly been adopted for segmenting a\nwide range of natural images. However, recent studies have indicated that SAM\nexhibits subpar performance on 3D medical image segmentation tasks. In addition\nto the domain gaps between natural and medical images, disparities in the\nspatial arrangement between 2D and 3D images, the substantial computational\nburden imposed by powerful GPU servers, and the time-consuming manual prompt\ngeneration impede the extension of SAM to a broader spectrum of medical image\nsegmentation applications. To address these challenges, in this work, we\nintroduce a novel method, AutoSAM Adapter, designed specifically for 3D\nmulti-organ CT-based segmentation. We employ parameter-efficient adaptation\ntechniques in developing an automatic prompt learning paradigm to facilitate\nthe transformation of the SAM model's capabilities to 3D medical image\nsegmentation, eliminating the need for manually generated prompts. Furthermore,\nwe effectively transfer the acquired knowledge of the AutoSAM Adapter to other\nlightweight models specifically tailored for 3D medical image analysis,\nachieving state-of-the-art (SOTA) performance on medical image segmentation\ntasks. Through extensive experimental evaluation, we demonstrate the AutoSAM\nAdapter as a critical foundation for effectively leveraging the emerging\nability of foundation models in 2D natural image segmentation for 3D medical\nimage segmentation.\n","authors":["Chengyin Li","Prashant Khanduri","Yao Qiang","Rafi Ibn Sultan","Indrin Chetty","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14936v1.pdf","comment":"9 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.14930v1","updated":"2023-08-28T23:08:32Z","published":"2023-08-28T23:08:32Z","title":"Application of Quantum Pre-Processing Filter for Binary Image\n Classification with Small Samples","summary":" Over the past few years, there has been significant interest in Quantum\nMachine Learning (QML) among researchers, as it has the potential to transform\nthe field of machine learning. Several models that exploit the properties of\nquantum mechanics have been developed for practical applications. In this\nstudy, we investigated the application of our previously proposed quantum\npre-processing filter (QPF) to binary image classification. We evaluated the\nQPF on four datasets: MNIST (handwritten digits), EMNIST (handwritten digits\nand alphabets), CIFAR-10 (photographic images) and GTSRB (real-life traffic\nsign images). Similar to our previous multi-class classification results, the\napplication of QPF improved the binary image classification accuracy using\nneural network against MNIST, EMNIST, and CIFAR-10 from 98.9% to 99.2%, 97.8%\nto 98.3%, and 71.2% to 76.1%, respectively, but degraded it against GTSRB from\n93.5% to 92.0%. We then applied QPF in cases using a smaller number of training\nand testing samples, i.e. 80 and 20 samples per class, respectively. In order\nto derive statistically stable results, we conducted the experiment with 100\ntrials choosing randomly different training and testing samples and averaging\nthe results. The result showed that the application of QPF did not improve the\nimage classification accuracy against MNIST and EMNIST but improved it against\nCIFAR-10 and GTSRB from 65.8% to 67.2% and 90.5% to 91.8%, respectively.\nFurther research will be conducted as part of future work to investigate the\npotential of QPF to assess the scalability of the proposed approach to larger\nand complex datasets.\n","authors":["Farina Riaz","Shahab Abdulla","Hajime Suzuki","Srinjoy Ganguly","Ravinesh C. Deo","Susan Hopkins"],"pdf_url":"https://arxiv.org/pdf/2308.14930v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2304.11751v2","updated":"2023-08-28T22:38:31Z","published":"2023-04-23T21:05:59Z","title":"Score-Based Diffusion Models as Principled Priors for Inverse Imaging","summary":" Priors are essential for reconstructing images from noisy and/or incomplete\nmeasurements. The choice of the prior determines both the quality and\nuncertainty of recovered images. We propose turning score-based diffusion\nmodels into principled image priors (\"score-based priors\") for analyzing a\nposterior of images given measurements. Previously, probabilistic priors were\nlimited to handcrafted regularizers and simple distributions. In this work, we\nempirically validate the theoretically-proven probability function of a\nscore-based diffusion model. We show how to sample from resulting posteriors by\nusing this probability function for variational inference. Our results,\nincluding experiments on denoising, deblurring, and interferometric imaging,\nsuggest that score-based priors enable principled inference with a\nsophisticated, data-driven image prior.\n","authors":["Berthy T. Feng","Jamie Smith","Michael Rubinstein","Huiwen Chang","Katherine L. Bouman","William T. Freeman"],"pdf_url":"https://arxiv.org/pdf/2304.11751v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14922v1","updated":"2023-08-28T22:32:15Z","published":"2023-08-28T22:32:15Z","title":"Automated Conversion of Music Videos into Lyric Videos","summary":" Musicians and fans often produce lyric videos, a form of music videos that\nshowcase the song's lyrics, for their favorite songs. However, making such\nvideos can be challenging and time-consuming as the lyrics need to be added in\nsynchrony and visual harmony with the video. Informed by prior work and close\nexamination of existing lyric videos, we propose a set of design guidelines to\nhelp creators make such videos. Our guidelines ensure the readability of the\nlyric text while maintaining a unified focus of attention. We instantiate these\nguidelines in a fully automated pipeline that converts an input music video\ninto a lyric video. We demonstrate the robustness of our pipeline by generating\nlyric videos from a diverse range of input sources. A user study shows that\nlyric videos generated by our pipeline are effective in maintaining text\nreadability and unifying the focus of attention.\n","authors":["Jiaju Ma","Anyi Rao","Li-Yi Wei","Rubaiat Habib Kazi","Hijung Valentina Shin","Maneesh Agrawala"],"pdf_url":"https://arxiv.org/pdf/2308.14922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03407v2","updated":"2023-08-28T22:25:15Z","published":"2023-08-07T08:48:46Z","title":"Spatially Varying Nanophotonic Neural Networks","summary":" The explosive growth of computation and energy cost of artificial\nintelligence has spurred strong interests in new computing modalities as\npotential alternatives to conventional electronic processors. Photonic\nprocessors that execute operations using photons instead of electrons, have\npromised to enable optical neural networks with ultra-low latency and power\nconsumption. However, existing optical neural networks, limited by the\nunderlying network designs, have achieved image recognition accuracy much lower\nthan state-of-the-art electronic neural networks. In this work, we close this\ngap by introducing a large-kernel spatially-varying convolutional neural\nnetwork learned via low-dimensional reparameterization techniques. We\nexperimentally instantiate the network with a flat meta-optical system that\nencompasses an array of nanophotonic structures designed to induce\nangle-dependent responses. Combined with an extremely lightweight electronic\nbackend with approximately 2K parameters we demonstrate a nanophotonic neural\nnetwork reaches 73.80\\% blind test classification accuracy on CIFAR-10 dataset,\nand, as such, the first time, an optical neural network outperforms the first\nmodern digital neural network -- AlexNet (72.64\\%) with 57M parameters,\nbringing optical neural network into modern deep learning era.\n","authors":["Kaixuan Wei","Xiao Li","Johannes Froech","Praneeth Chakravarthula","James Whitehead","Ethan Tseng","Arka Majumdar","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2308.03407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14904v1","updated":"2023-08-28T21:13:04Z","published":"2023-08-28T21:13:04Z","title":"Maturity-Aware Active Learning for Semantic Segmentation with\n Hierarchically-Adaptive Sample Assessment","summary":" Active Learning (AL) for semantic segmentation is challenging due to heavy\nclass imbalance and different ways of defining \"sample\" (pixels, areas, etc.),\nleaving the interpretation of the data distribution ambiguous. We propose\n\"Maturity-Aware Distribution Breakdown-based Active Learning'' (MADBAL), an AL\nmethod that benefits from a hierarchical approach to define a multiview data\ndistribution, which takes into account the different \"sample\" definitions\njointly, hence able to select the most impactful segmentation pixels with\ncomprehensive understanding. MADBAL also features a novel uncertainty\nformulation, where AL supporting modules are included to sense the features'\nmaturity whose weighted influence continuously contributes to the uncertainty\ndetection. In this way, MADBAL makes significant performance leaps even in the\nearly AL stage, hence reducing the training burden significantly. It\noutperforms state-of-the-art methods on Cityscapes and PASCAL VOC datasets as\nverified in our extensive experiments.\n","authors":["Amirsaeed Yazdani","Xuelu Li","Vishal Monga"],"pdf_url":"https://arxiv.org/pdf/2308.14904v1.pdf","comment":"Accepted to the 34th British Machine Vision Conference (BMVC 2023)"},{"id":"http://arxiv.org/abs/2308.14900v1","updated":"2023-08-28T20:59:15Z","published":"2023-08-28T20:59:15Z","title":"BIT: Bi-Level Temporal Modeling for Efficient Supervised Action\n Segmentation","summary":" We address the task of supervised action segmentation which aims to partition\na video into non-overlapping segments, each representing a different action.\nRecent works apply transformers to perform temporal modeling at the\nframe-level, which suffer from high computational cost and cannot well capture\naction dependencies over long temporal horizons. To address these issues, we\npropose an efficient BI-level Temporal modeling (BIT) framework that learns\nexplicit action tokens to represent action segments, in parallel performs\ntemporal modeling on frame and action levels, while maintaining a low\ncomputational cost. Our model contains (i) a frame branch that uses convolution\nto learn frame-level relationships, (ii) an action branch that uses transformer\nto learn action-level dependencies with a small set of action tokens and (iii)\ncross-attentions to allow communication between the two branches. We apply and\nextend a set-prediction objective to allow each action token to represent one\nor multiple action segments, thus can avoid learning a large number of tokens\nover long videos with many segments. Thanks to the design of our action branch,\nwe can also seamlessly leverage textual transcripts of videos (when available)\nto help action segmentation by using them to initialize the action tokens. We\nevaluate our model on four video datasets (two egocentric and two third-person)\nfor action segmentation with and without transcripts, showing that BIT\nsignificantly improves the state-of-the-art accuracy with much lower\ncomputational cost (30 times faster) compared to existing transformer-based\nmethods.\n","authors":["Zijia Lu","Ehsan Elhamifar"],"pdf_url":"https://arxiv.org/pdf/2308.14900v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.14899v1","updated":"2023-08-28T20:52:18Z","published":"2023-08-28T20:52:18Z","title":"RobustCLEVR: A Benchmark and Framework for Evaluating Robustness in\n Object-centric Learning","summary":" Object-centric representation learning offers the potential to overcome\nlimitations of image-level representations by explicitly parsing image scenes\ninto their constituent components. While image-level representations typically\nlack robustness to natural image corruptions, the robustness of object-centric\nmethods remains largely untested. To address this gap, we present the\nRobustCLEVR benchmark dataset and evaluation framework. Our framework takes a\nnovel approach to evaluating robustness by enabling the specification of causal\ndependencies in the image generation process grounded in expert knowledge and\ncapable of producing a wide range of image corruptions unattainable in existing\nrobustness evaluations. Using our framework, we define several causal models of\nthe image corruption process which explicitly encode assumptions about the\ncausal relationships and distributions of each corruption type. We generate\ndataset variants for each causal model on which we evaluate state-of-the-art\nobject-centric methods. Overall, we find that object-centric methods are not\ninherently robust to image corruptions. Our causal evaluation approach exposes\nmodel sensitivities not observed using conventional evaluation processes,\nyielding greater insight into robustness differences across algorithms. Lastly,\nwhile conventional robustness evaluations view corruptions as\nout-of-distribution, we use our causal framework to show that even training on\nin-distribution image corruptions does not guarantee increased model\nrobustness. This work provides a step towards more concrete and substantiated\nunderstanding of model performance and deterioration under complex corruption\nprocesses of the real-world.\n","authors":["Nathan Drenkow","Mathias Unberath"],"pdf_url":"https://arxiv.org/pdf/2308.14899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14893v1","updated":"2023-08-28T20:30:10Z","published":"2023-08-28T20:30:10Z","title":"When hard negative sampling meets supervised contrastive learning","summary":" State-of-the-art image models predominantly follow a two-stage strategy:\npre-training on large datasets and fine-tuning with cross-entropy loss. Many\nstudies have shown that using cross-entropy can result in sub-optimal\ngeneralisation and stability. While the supervised contrastive loss addresses\nsome limitations of cross-entropy loss by focusing on intra-class similarities\nand inter-class differences, it neglects the importance of hard negative\nmining. We propose that models will benefit from performance improvement by\nweighting negative samples based on their dissimilarity to positive\ncounterparts. In this paper, we introduce a new supervised contrastive learning\nobjective, SCHaNe, which incorporates hard negative sampling during the\nfine-tuning phase. Without requiring specialized architectures, additional\ndata, or extra computational resources, experimental results indicate that\nSCHaNe outperforms the strong baseline BEiT-3 in Top-1 accuracy across various\nbenchmarks, with significant gains of up to $3.32\\%$ in few-shot learning\nsettings and $3.41\\%$ in full dataset fine-tuning. Importantly, our proposed\nobjective sets a new state-of-the-art for base models on ImageNet-1k, achieving\nan 86.14\\% accuracy. Furthermore, we demonstrate that the proposed objective\nyields better embeddings and explains the improved effectiveness observed in\nour experiments.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa","Zaiqiao Meng"],"pdf_url":"https://arxiv.org/pdf/2308.14893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16890v2","updated":"2023-08-28T19:38:32Z","published":"2023-06-29T12:22:47Z","title":"Trajectory Poisson multi-Bernoulli mixture filter for traffic monitoring\n using a drone","summary":" This paper proposes a multi-object tracking (MOT) algorithm for traffic\nmonitoring using a drone equipped with optical and thermal cameras. Object\ndetections on the images are obtained using a neural network for each type of\ncamera. The cameras are modelled as direction-of-arrival (DOA) sensors. Each\nDOA detection follows a von-Mises Fisher distribution, whose mean direction is\nobtain by projecting a vehicle position on the ground to the camera. We then\nuse the trajectory Poisson multi-Bernoulli mixture filter (TPMBM), which is a\nBayesian MOT algorithm, to optimally estimate the set of vehicle trajectories.\nWe have also developed a parameter estimation algorithm for the measurement\nmodel. We have tested the accuracy of the resulting TPMBM filter in synthetic\nand experimental data sets.\n","authors":["Ángel F. García-Fernández","Jimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2306.16890v2.pdf","comment":"accepted in IEEE Transactions on Vehicular Technology"},{"id":"http://arxiv.org/abs/2308.14861v1","updated":"2023-08-28T19:31:53Z","published":"2023-08-28T19:31:53Z","title":"Evaluation of Key Spatiotemporal Learners for Print Track Anomaly\n Classification Using Melt Pool Image Streams","summary":" Recent applications of machine learning in metal additive manufacturing (MAM)\nhave demonstrated significant potential in addressing critical barriers to the\nwidespread adoption of MAM technology. Recent research in this field emphasizes\nthe importance of utilizing melt pool signatures for real-time defect\nprediction. While high-quality melt pool image data holds the promise of\nenabling precise predictions, there has been limited exploration into the\nutilization of cutting-edge spatiotemporal models that can harness the inherent\ntransient and sequential characteristics of the additive manufacturing process.\nThis research introduces and puts into practice some of the leading deep\nspatiotemporal learning models that can be adapted for the classification of\nmelt pool image streams originating from various materials, systems, and\napplications. Specifically, it investigates two-stream networks comprising\nspatial and temporal streams, a recurrent spatial network, and a factorized 3D\nconvolutional neural network. The capacity of these models to generalize when\nexposed to perturbations in melt pool image data is examined using data\nperturbation techniques grounded in real-world process scenarios. The\nimplemented architectures demonstrate the ability to capture the spatiotemporal\nfeatures of melt pool image sequences. However, among these models, only the\nKinetics400 pre-trained SlowFast network, categorized as a two-stream network,\nexhibits robust generalization capabilities in the presence of data\nperturbations.\n","authors":["Lynn Cherif","Mutahar Safdar","Guy Lamouche","Priti Wanjara","Padma Paul","Gentry Wood","Max Zimmermann","Florian Hannesen","Yaoyao Fiona Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.14861v1.pdf","comment":"This work has been accepted to IFAC for publication under a Creative\n Commons Licence CC-BY-NC-ND"},{"id":"http://arxiv.org/abs/2308.14852v1","updated":"2023-08-28T19:15:27Z","published":"2023-08-28T19:15:27Z","title":"SynthDistill: Face Recognition with Knowledge Distillation from\n Synthetic Data","summary":" State-of-the-art face recognition networks are often computationally\nexpensive and cannot be used for mobile applications. Training lightweight face\nrecognition models also requires large identity-labeled datasets. Meanwhile,\nthere are privacy and ethical concerns with collecting and using large face\nrecognition datasets. While generating synthetic datasets for training face\nrecognition models is an alternative option, it is challenging to generate\nsynthetic data with sufficient intra-class variations. In addition, there is\nstill a considerable gap between the performance of models trained on real and\nsynthetic data. In this paper, we propose a new framework (named SynthDistill)\nto train lightweight face recognition models by distilling the knowledge of a\npretrained teacher face recognition model using synthetic data. We use a\npretrained face generator network to generate synthetic face images and use the\nsynthesized images to learn a lightweight student network. We use synthetic\nface images without identity labels, mitigating the problems in the intra-class\nvariation generation of synthetic datasets. Instead, we propose a novel dynamic\nsampling strategy from the intermediate latent space of the face generator\nnetwork to include new variations of the challenging images while further\nexploring new face images in the training batch. The results on five different\nface recognition datasets demonstrate the superiority of our lightweight model\ncompared to models trained on previous synthetic datasets, achieving a\nverification accuracy of 99.52% on the LFW dataset with a lightweight network.\nThe results also show that our proposed framework significantly reduces the gap\nbetween training with real and synthetic data. The source code for replicating\nthe experiments is publicly released.\n","authors":["Hatef Otroshi Shahreza","Anjith George","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2308.14852v1.pdf","comment":"Accepted in the IEEE International Joint Conference on Biometrics\n (IJCB 2023)"},{"id":"http://arxiv.org/abs/2308.14847v1","updated":"2023-08-28T19:08:17Z","published":"2023-08-28T19:08:17Z","title":"NSF: Neural Surface Fields for Human Modeling from Monocular Depth","summary":" Obtaining personalized 3D animatable avatars from a monocular camera has\nseveral real world applications in gaming, virtual try-on, animation, and\nVR/XR, etc. However, it is very challenging to model dynamic and fine-grained\nclothing deformations from such sparse data. Existing methods for modeling 3D\nhumans from depth data have limitations in terms of computational efficiency,\nmesh coherency, and flexibility in resolution and topology. For instance,\nreconstructing shapes using implicit functions and extracting explicit meshes\nper frame is computationally expensive and cannot ensure coherent meshes across\nframes. Moreover, predicting per-vertex deformations on a pre-designed human\ntemplate with a discrete surface lacks flexibility in resolution and topology.\nTo overcome these limitations, we propose a novel method `\\keyfeature: Neural\nSurface Fields' for modeling 3D clothed humans from monocular depth. NSF\ndefines a neural field solely on the base surface which models a continuous and\nflexible displacement field. NSF can be adapted to the base surface with\ndifferent resolution and topology without retraining at inference time.\nCompared to existing approaches, our method eliminates the expensive per-frame\nsurface extraction while maintaining mesh coherency, and is capable of\nreconstructing meshes with arbitrary resolution without retraining. To foster\nresearch in this direction, we release our code in project page at:\nhttps://yuxuan-xue.com/nsf.\n","authors":["Yuxuan Xue","Bharat Lal Bhatnagar","Riccardo Marin","Nikolaos Sarafianos","Yuanlu Xu","Gerard Pons-Moll","Tony Tung"],"pdf_url":"https://arxiv.org/pdf/2308.14847v1.pdf","comment":"Accpted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf"},{"id":"http://arxiv.org/abs/2308.14833v1","updated":"2023-08-28T18:43:33Z","published":"2023-08-28T18:43:33Z","title":"The Interstate-24 3D Dataset: a new benchmark for 3D multi-camera\n vehicle tracking","summary":" This work presents a novel video dataset recorded from overlapping highway\ntraffic cameras along an urban interstate, enabling multi-camera 3D object\ntracking in a traffic monitoring context. Data is released from 3 scenes\ncontaining video from at least 16 cameras each, totaling 57 minutes in length.\n877,000 3D bounding boxes and corresponding object tracklets are fully and\naccurately annotated for each camera field of view and are combined into a\nspatially and temporally continuous set of vehicle trajectories for each scene.\nLastly, existing algorithms are combined to benchmark a number of 3D\nmulti-camera tracking pipelines on the dataset, with results indicating that\nthe dataset is challenging due to the difficulty of matching objects traveling\nat high speeds across cameras and heavy object occlusion, potentially for\nhundreds of frames, during congested traffic. This work aims to enable the\ndevelopment of accurate and automatic vehicle trajectory extraction algorithms,\nwhich will play a vital role in understanding impacts of autonomous vehicle\ntechnologies on the safety and efficiency of traffic.\n","authors":["Derek Gloudemans","Yanbing Wang","Gracie Gumm","William Barbour","Daniel B. Work"],"pdf_url":"https://arxiv.org/pdf/2308.14833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14831v1","updated":"2023-08-28T18:31:09Z","published":"2023-08-28T18:31:09Z","title":"Continual Learning with Dynamic Sparse Training: Exploring Algorithms\n for Effective Model Updates","summary":" Continual learning (CL) refers to the ability of an intelligent system to\nsequentially acquire and retain knowledge from a stream of data with as little\ncomputational overhead as possible. To this end; regularization, replay,\narchitecture, and parameter isolation approaches were introduced to the\nliterature. Parameter isolation using a sparse network which enables to\nallocate distinct parts of the neural network to different tasks and also\nallows to share of parameters between tasks if they are similar. Dynamic Sparse\nTraining (DST) is a prominent way to find these sparse networks and isolate\nthem for each task. This paper is the first empirical study investigating the\neffect of different DST components under the CL paradigm to fill a critical\nresearch gap and shed light on the optimal configuration of DST for CL if it\nexists. Therefore, we perform a comprehensive study in which we investigate\nvarious DST components to find the best topology per task on well-known\nCIFAR100 and miniImageNet benchmarks in a task-incremental CL setup since our\nprimary focus is to evaluate the performance of various DST criteria, rather\nthan the process of mask selection. We found that, at a low sparsity level,\nErdos-Renyi Kernel (ERK) initialization utilizes the backbone more efficiently\nand allows to effectively learn increments of tasks. At a high sparsity level,\nhowever, uniform initialization demonstrates more reliable and robust\nperformance. In terms of growth strategy; performance is dependent on the\ndefined initialization strategy, and the extent of sparsity. Finally,\nadaptivity within DST components is a promising way for better continual\nlearners.\n","authors":["Murat Onur Yildirim","Elif Ceren Gok Yildirim","Ghada Sokar","Decebal Constantin Mocanu","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2308.14831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14816v1","updated":"2023-08-28T18:09:13Z","published":"2023-08-28T18:09:13Z","title":"CLNeRF: Continual Learning Meets NeRF","summary":" Novel view synthesis aims to render unseen views given a set of calibrated\nimages. In practical applications, the coverage, appearance or geometry of the\nscene may change over time, with new images continuously being captured.\nEfficiently incorporating such continuous change is an open challenge. Standard\nNeRF benchmarks only involve scene coverage expansion. To study other practical\nscene changes, we propose a new dataset, World Across Time (WAT), consisting of\nscenes that change in appearance and geometry over time. We also propose a\nsimple yet effective method, CLNeRF, which introduces continual learning (CL)\nto Neural Radiance Fields (NeRFs). CLNeRF combines generative replay and the\nInstant Neural Graphics Primitives (NGP) architecture to effectively prevent\ncatastrophic forgetting and efficiently update the model when new data arrives.\nWe also add trainable appearance and geometry embeddings to NGP, allowing a\nsingle compact model to handle complex scene changes. Without the need to store\nhistorical images, CLNeRF trained sequentially over multiple scans of a\nchanging scene performs on-par with the upper bound model trained on all scans\nat once. Compared to other CL baselines CLNeRF performs much better across\nstandard benchmarks and WAT. The source code, and the WAT dataset are available\nat https://github.com/IntelLabs/CLNeRF. Video presentation is available at:\nhttps://youtu.be/nLRt6OoDGq0?si=8yD6k-8MMBJInQPs\n","authors":["Zhipeng Cai","Matthias Mueller"],"pdf_url":"https://arxiv.org/pdf/2308.14816v1.pdf","comment":"Accepted to ICCV 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.14622v1","updated":"2023-08-28T16:58:44Z","published":"2023-08-28T16:58:44Z","title":"TRIVEA: Transparent Ranking Interpretation using Visual Explanation of\n Black-Box Algorithmic Rankers","summary":" Ranking schemes drive many real-world decisions, like, where to study, whom\nto hire, what to buy, etc. Many of these decisions often come with high\nconsequences. For example, a university can be deemed less prestigious if not\nfeatured in a top-k list, and consumers might not even explore products that do\nnot get recommended to buyers. At the heart of most of these decisions are\nopaque ranking schemes, which dictate the ordering of data entities, but their\ninternal logic is inaccessible or proprietary. Drawing inferences about the\nranking differences is like a guessing game to the stakeholders, like, the\nrankees (i.e., the entities who are ranked, like product companies) and the\ndecision-makers (i.e., who use the rankings, like buyers). In this paper, we\naim to enable transparency in ranking interpretation by using algorithmic\nrankers that learn from available data and by enabling human reasoning about\nthe learned ranking differences using explainable AI (XAI) methods. To realize\nthis aim, we leverage the exploration-explanation paradigm of human-data\ninteraction to let human stakeholders explore subsets and groupings of complex\nmulti-attribute ranking data using visual explanations of model fit and\nattribute influence on rankings. We realize this explanation paradigm for\ntransparent ranking interpretation in TRIVEA, a visual analytic system that is\nfueled by: i) visualizations of model fit derived from algorithmic rankers that\nlearn the associations between attributes and rankings from available data and\nii) visual explanations derived from XAI methods that help abstract important\npatterns, like, the relative influence of attributes in different ranking\nranges. Using TRIVEA, end users not trained in data science have the agency to\ntransparently reason about the global and local behavior of the rankings\nwithout the need to open black-box ranking models and develop confidence in the\nresulting attribute-based inferences. We demonstrate the efficacy of TRIVEA\nusing multiple usage scenarios and subjective feedback from researchers with\ndiverse domain expertise. Keywords: Visual Analytics, Learning-to-Rank,\nExplainable ML, Ranking\n","authors":["Jun Yuan","Kaustav Bhattacharjee","Akm Zahirul Islam","Aritra Dasgupta"],"pdf_url":"https://arxiv.org/pdf/2308.14622v1.pdf","comment":"Accepted for publication in SpringerNature's Visual Computer Journal"},{"id":"http://arxiv.org/abs/2308.14601v1","updated":"2023-08-28T14:12:25Z","published":"2023-08-28T14:12:25Z","title":"Fairness Through Domain Awareness: Mitigating Popularity Bias For Music\n Discovery","summary":" As online music platforms grow, music recommender systems play a vital role\nin helping users navigate and discover content within their vast musical\ndatabases. At odds with this larger goal, is the presence of popularity bias,\nwhich causes algorithmic systems to favor mainstream content over, potentially\nmore relevant, but niche items. In this work we explore the intrinsic\nrelationship between music discovery and popularity bias. To mitigate this\nissue we propose a domain-aware, individual fairness-based approach which\naddresses popularity bias in graph neural network (GNNs) based recommender\nsystems. Our approach uses individual fairness to reflect a ground truth\nlistening experience, i.e., if two songs sound similar, this similarity should\nbe reflected in their representations. In doing so, we facilitate meaningful\nmusic discovery that is robust to popularity bias and grounded in the music\ndomain. We apply our BOOST methodology to two discovery based tasks, performing\nrecommendations at both the playlist level and user level. Then, we ground our\nevaluation in the cold start setting, showing that our approach outperforms\nexisting fairness benchmarks in both performance and recommendation of\nlesser-known content. Finally, our analysis explains why our proposed\nmethodology is a novel and promising approach to mitigating popularity bias and\nimproving the discovery of new and niche content in music recommender systems.\n","authors":["Rebecca Salganik","Fernando Diaz","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2308.14601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14499v1","updated":"2023-08-28T11:19:44Z","published":"2023-08-28T11:19:44Z","title":"Efficient and Accurate Tree Detection from 3D Point Clouds through Paid\n Crowdsourcing","summary":" Accurate tree detection is of growing importance in applications such as\nurban planning, forest inventory, and environmental monitoring. In this\narticle, we present an approach to creating tree maps by annotating them in 3D\npoint clouds. Point cloud representations allow the precise identification of\ntree positions, particularly stem locations, and their heights. Our method\nleverages human computational power through paid crowdsourcing, employing a web\ntool designed to enable even non-experts to effectively tackle the task. The\nprimary focus of this paper is to discuss the web tool's development and\nstrategies to ensure high-quality tree annotations despite encountering noise\nin the crowdsourced data. Following our methodology, we achieve quality\nmeasures surpassing 90% for various challenging test sets of diverse\ncomplexities. We emphasize that our tree map creation process, including\ninitial point cloud collection, can be completed within 1-2 days.\n","authors":["Michael Kölle","Volker Walter","Ivan Shiller","Uwe Soergel"],"pdf_url":"https://arxiv.org/pdf/2308.14499v1.pdf","comment":"This paper can be considered an extension of the approach presented\n by Walter et al.\n (https://isprs-annals.copernicus.org/articles/V-4-2020/49/2020/)"},{"id":"http://arxiv.org/abs/2308.14436v1","updated":"2023-08-28T09:22:02Z","published":"2023-08-28T09:22:02Z","title":"Bridging the KB-Text Gap: Leveraging Structured Knowledge-aware\n Pre-training for KBQA","summary":" Knowledge Base Question Answering (KBQA) aims to answer natural language\nquestions with factual information such as entities and relations in KBs.\nHowever, traditional Pre-trained Language Models (PLMs) are directly\npre-trained on large-scale natural language corpus, which poses challenges for\nthem in understanding and representing complex subgraphs in structured KBs. To\nbridge the gap between texts and structured KBs, we propose a Structured\nKnowledge-aware Pre-training method (SKP). In the pre-training stage, we\nintroduce two novel structured knowledge-aware tasks, guiding the model to\neffectively learn the implicit relationship and better representations of\ncomplex subgraphs. In downstream KBQA task, we further design an efficient\nlinearization strategy and an interval attention mechanism, which assist the\nmodel to better encode complex subgraphs and shield the interference of\nirrelevant subgraphs during reasoning respectively. Detailed experiments and\nanalyses on WebQSP verify the effectiveness of SKP, especially the significant\nimprovement in subgraph retrieval (+4.08% H@10).\n","authors":["Guanting Dong","Rumei Li","Sirui Wang","Yupeng Zhang","Yunsen Xian","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14436v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.09340v2","updated":"2023-08-28T08:00:58Z","published":"2023-08-18T06:52:07Z","title":"How Discriminative Are Your Qrels? How To Study the Statistical\n Significance of Document Adjudication Methods","summary":" Creating test collections for offline retrieval evaluation requires human\neffort to judge documents' relevance. This expensive activity motivated much\nwork in developing methods for constructing benchmarks with fewer assessment\ncosts. In this respect, adjudication methods actively decide both which\ndocuments and the order in which experts review them, in order to better\nexploit the assessment budget or to lower it. Researchers evaluate the quality\nof those methods by measuring the correlation between the known gold ranking of\nsystems under the full collection and the observed ranking of systems under the\nlower-cost one. This traditional analysis ignores whether and how the low-cost\njudgements impact on the statistically significant differences among systems\nwith respect to the full collection. We fill this void by proposing a novel\nmethodology to evaluate how the low-cost adjudication methods preserve the\npairwise significant differences between systems as the full collection. In\nother terms, while traditional approaches look for stability in answering the\nquestion \"is system A better than system B?\", our proposed approach looks for\nstability in answering the question \"is system A significantly better than\nsystem B?\", which is the ultimate questions researchers need to answer to\nguarantee the generalisability of their results. Among other results, we found\nthat the best methods in terms of ranking of systems correlation do not always\nmatch those preserving statistical significance.\n","authors":["David Otero","Javier Parapar","Nicola Ferro"],"pdf_url":"https://arxiv.org/pdf/2308.09340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14355v1","updated":"2023-08-28T07:03:08Z","published":"2023-08-28T07:03:08Z","title":"Can Transformer and GNN Help Each Other?","summary":" Although Transformer has achieved great success in natural language process\nand computer vision, it has difficulty generalizing to medium and large-scale\ngraph data for two important reasons: (i) High complexity. (ii) Failing to\ncapture the complex and entangled structure information. In graph\nrepresentation learning, Graph Neural Networks(GNNs) can fuse the graph\nstructure and node attributes but have limited receptive fields. Therefore, we\nquestion whether can we combine Transformers and GNNs to help each other. In\nthis paper, we propose a new model named TransGNN where the Transformer layer\nand GNN layer are used alternately to improve each other. Specifically, to\nexpand the receptive field and disentangle the information aggregation from\nedges, we propose using Transformer to aggregate more relevant nodes'\ninformation to improve the message passing of GNNs. Besides, to capture the\ngraph structure information, we utilize positional encoding and make use of the\nGNN layer to fuse the structure into node attributes, which improves the\nTransformer in graph data. We also propose to sample the most relevant nodes\nfor Transformer and two efficient samples update strategies to lower the\ncomplexity. At last, we theoretically prove that TransGNN is more expressive\nthan GNNs only with extra linear complexity. The experiments on eight datasets\ncorroborate the effectiveness of TransGNN on node and graph classification\ntasks.\n","authors":["Peiyan Zhang","Yuchen Yan","Chaozhuo Li","Senzhang Wang","Xing Xie","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17426v2","updated":"2023-08-28T05:37:36Z","published":"2023-06-30T06:40:11Z","title":"Leveraging Watch-time Feedback for Short-Video Recommendations: A Causal\n Labeling Framework","summary":" With the proliferation of short video applications, the significance of short\nvideo recommendations has vastly increased. Unlike other recommendation\nscenarios, short video recommendation systems heavily rely on feedback from\nwatch time. Existing approaches simply treat watch time as a direct label,\nfailing to effectively harness its extensive semantics and introduce bias,\nthereby limiting the potential for modeling user interests based on watch time.\nTo overcome this challenge, we propose a framework named Debiased\nMultiple-semantics-extracting Labeling(DML). DML constructs labels that\nencompass various semantics by utilizing quantiles derived from the\ndistribution of watch time, prioritizing relative order rather than absolute\nlabel values. This approach facilitates easier model learning while aligning\nwith the ranking objective of recommendations. Furthermore, we introduce a\nmethod inspired by causal adjustment to refine label definitions, thereby\ndirectly mitigating bias at the label level. We substantiate the effectiveness\nof our DML framework through both online and offline experiments. Extensive\nresults demonstrate that our DML could effectively leverage watch time to\ndiscover users' real interests, enhancing their engagement in our application.\n","authors":["Yang Zhang","Yimeng Bai","Jianxin Chang","Xiaoxue Zang","Song Lu","Jing Lu","Fuli Feng","Yanan Niu","Yang Song"],"pdf_url":"https://arxiv.org/pdf/2306.17426v2.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.14296v1","updated":"2023-08-28T04:31:04Z","published":"2023-08-28T04:31:04Z","title":"RecMind: Large Language Model Powered Agent For Recommendation","summary":" Recent advancements in instructing Large Language Models (LLMs) to utilize\nexternal tools and execute multi-step plans have significantly enhanced their\nability to solve intricate tasks, ranging from mathematical problems to\ncreative writing. Yet, there remains a notable gap in studying the capacity of\nLLMs in responding to personalized queries such as a recommendation request. To\nbridge this gap, we have designed an LLM-powered autonomous recommender agent,\nRecMind, which is capable of providing precise personalized recommendations\nthrough careful planning, utilizing tools for obtaining external knowledge, and\nleveraging individual data. We propose a novel algorithm, Self-Inspiring, to\nimprove the planning ability of the LLM agent. At each intermediate planning\nstep, the LLM 'self-inspires' to consider all previously explored states to\nplan for next step. This mechanism greatly improves the model's ability to\ncomprehend and utilize historical planning information for recommendation. We\nevaluate RecMind's performance in various recommendation scenarios, including\nrating prediction, sequential recommendation, direct recommendation,\nexplanation generation, and review summarization. Our experiment shows that\nRecMind outperforms existing zero/few-shot LLM-based recommendation methods in\ndifferent recommendation tasks and achieves competitive performance to a recent\nmodel P5, which requires fully pre-train for the recommendation tasks.\n","authors":["Yancheng Wang","Ziyan Jiang","Zheng Chen","Fan Yang","Yingxue Zhou","Eunah Cho","Xing Fan","Xiaojiang Huang","Yanbin Lu","Yingzhen Yang"],"pdf_url":"https://arxiv.org/pdf/2308.14296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14276v1","updated":"2023-08-28T03:15:37Z","published":"2023-08-28T03:15:37Z","title":"Alleviating Video-Length Effect for Micro-video Recommendation","summary":" Micro-videos platforms such as TikTok are extremely popular nowadays. One\nimportant feature is that users no longer select interested videos from a set,\ninstead they either watch the recommended video or skip to the next one. As a\nresult, the time length of users' watching behavior becomes the most important\nsignal for identifying preferences. However, our empirical data analysis has\nshown a video-length effect that long videos are easier to receive a higher\nvalue of average view time, thus adopting such view-time labels for measuring\nuser preferences can easily induce a biased model that favors the longer\nvideos. In this paper, we propose a Video Length Debiasing Recommendation\n(VLDRec) method to alleviate such an effect for micro-video recommendation.\nVLDRec designs the data labeling approach and the sample generation module that\nbetter capture user preferences in a view-time oriented manner. It further\nleverages the multi-task learning technique to jointly optimize the above\nsamples with original biased ones. Extensive experiments show that VLDRec can\nimprove the users' view time by 1.81% and 11.32% on two real-world datasets,\ngiven a recommendation list of a fixed overall video length, compared with the\nbest baseline method. Moreover, VLDRec is also more effective in matching\nusers' interests in terms of the video content.\n","authors":["Yuhan Quan","Jingtao Ding","Chen Gao","Nian Li","Lingling Yi","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.14276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14263v1","updated":"2023-08-28T02:38:17Z","published":"2023-08-28T02:38:17Z","title":"Cross-Modal Retrieval: A Systematic Review of Methods and Future\n Directions","summary":" With the exponential surge in diverse multi-modal data, traditional uni-modal\nretrieval methods struggle to meet the needs of users demanding access to data\nfrom various modalities. To address this, cross-modal retrieval has emerged,\nenabling interaction across modalities, facilitating semantic matching, and\nleveraging complementarity and consistency between different modal data.\nAlthough prior literature undertook a review of the cross-modal retrieval\nfield, it exhibits numerous deficiencies pertaining to timeliness, taxonomy,\nand comprehensiveness. This paper conducts a comprehensive review of\ncross-modal retrieval's evolution, spanning from shallow statistical analysis\ntechniques to vision-language pre-training models. Commencing with a\ncomprehensive taxonomy grounded in machine learning paradigms, mechanisms, and\nmodels, the paper then delves deeply into the principles and architectures\nunderpinning existing cross-modal retrieval methods. Furthermore, it offers an\noverview of widely used benchmarks, metrics, and performances. Lastly, the\npaper probes the prospects and challenges that confront contemporary\ncross-modal retrieval, while engaging in a discourse on potential directions\nfor further progress in the field. To facilitate the research on cross-modal\nretrieval, we develop an open-source code repository at\nhttps://github.com/BMC-SDNU/Cross-Modal-Retrieval.\n","authors":["Lei Zhu","Tianshi Wang","Fengling Li","Jingjing Li","Zheng Zhang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14916v1","updated":"2023-08-28T22:26:50Z","published":"2023-08-28T22:26:50Z","title":"RecRec: Algorithmic Recourse for Recommender Systems","summary":" Recommender systems play an essential role in the choices people make in\ndomains such as entertainment, shopping, food, news, employment, and education.\nThe machine learning models underlying these recommender systems are often\nenormously large and black-box in nature for users, content providers, and\nsystem developers alike. It is often crucial for all stakeholders to understand\nthe model's rationale behind making certain predictions and recommendations.\nThis is especially true for the content providers whose livelihoods depend on\nthe recommender system. Drawing motivation from the practitioners' need, in\nthis work, we propose a recourse framework for recommender systems, targeted\ntowards the content providers. Algorithmic recourse in the recommendation\nsetting is a set of actions that, if executed, would modify the recommendations\n(or ranking) of an item in the desired manner. A recourse suggests actions of\nthe form: \"if a feature changes X to Y, then the ranking of that item for a set\nof users will change to Z.\" Furthermore, we demonstrate that RecRec is highly\neffective in generating valid, sparse, and actionable recourses through an\nempirical evaluation of recommender systems trained on three real-world\ndatasets. To the best of our knowledge, this work is the first to conceptualize\nand empirically test a generalized framework for generating recourses for\nrecommender systems.\n","authors":["Sahil Verma","Ashudeep Singh","Varich Boonsanong","John P. Dickerson","Chirag Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14916v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.14902v1","updated":"2023-08-28T21:08:06Z","published":"2023-08-28T21:08:06Z","title":"Ad-Rec: Advanced Feature Interactions to Address Covariate-Shifts in\n Recommendation Networks","summary":" Recommendation models are vital in delivering personalized user experiences\nby leveraging the correlation between multiple input features. However, deep\nlearning-based recommendation models often face challenges due to evolving user\nbehaviour and item features, leading to covariate shifts. Effective\ncross-feature learning is crucial to handle data distribution drift and\nadapting to changing user behaviour. Traditional feature interaction techniques\nhave limitations in achieving optimal performance in this context.\n This work introduces Ad-Rec, an advanced network that leverages feature\ninteraction techniques to address covariate shifts. This helps eliminate\nirrelevant interactions in recommendation tasks. Ad-Rec leverages masked\ntransformers to enable the learning of higher-order cross-features while\nmitigating the impact of data distribution drift. Our approach improves model\nquality, accelerates convergence, and reduces training time, as measured by the\nArea Under Curve (AUC) metric. We demonstrate the scalability of Ad-Rec and its\nability to achieve superior model quality through comprehensive ablation\nstudies.\n","authors":["Muhammad Adnan","Yassaman Ebrahimzadeh Maboud","Divya Mahajan","Prashant J. Nair"],"pdf_url":"https://arxiv.org/pdf/2308.14902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14786v1","updated":"2023-08-28T17:07:31Z","published":"2023-08-28T17:07:31Z","title":"Extending Cross-Modal Retrieval with Interactive Learning to Improve\n Image Retrieval Performance in Forensics","summary":" Nowadays, one of the critical challenges in forensics is analyzing the\nenormous amounts of unstructured digital evidence, such as images. Often,\nunstructured digital evidence contains precious information for forensic\ninvestigations. Therefore, a retrieval system that can effectively identify\nforensically relevant images is paramount. In this work, we explored the\neffectiveness of interactive learning in improving image retrieval performance\nin the forensic domain by proposing Excalibur - a zero-shot cross-modal image\nretrieval system extended with interactive learning. Excalibur was evaluated\nusing both simulations and a user study. The simulations reveal that\ninteractive learning is highly effective in improving retrieval performance in\nthe forensic domain. Furthermore, user study participants could effectively\nleverage the power of interactive learning. Finally, they considered Excalibur\neffective and straightforward to use and expressed interest in using it in\ntheir daily practice.\n","authors":["Nils Böhne","Mark Berger","Ronald van Velzen"],"pdf_url":"https://arxiv.org/pdf/2308.14786v1.pdf","comment":"Submitted to the AAAI22 conference"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.14753v1","updated":"2023-08-28T17:59:47Z","published":"2023-08-28T17:59:47Z","title":"Efficient Discovery and Effective Evaluation of Visual Perceptual\n Similarity: A Benchmark and Beyond","summary":" Visual similarities discovery (VSD) is an important task with broad\ne-commerce applications. Given an image of a certain object, the goal of VSD is\nto retrieve images of different objects with high perceptual visual similarity.\nAlthough being a highly addressed problem, the evaluation of proposed methods\nfor VSD is often based on a proxy of an identification-retrieval task,\nevaluating the ability of a model to retrieve different images of the same\nobject. We posit that evaluating VSD methods based on identification tasks is\nlimited, and faithful evaluation must rely on expert annotations. In this\npaper, we introduce the first large-scale fashion visual similarity benchmark\ndataset, consisting of more than 110K expert-annotated image pairs. Besides\nthis major contribution, we share insight from the challenges we faced while\ncurating this dataset. Based on these insights, we propose a novel and\nefficient labeling procedure that can be applied to any dataset. Our analysis\nexamines its limitations and inductive biases, and based on these findings, we\npropose metrics to mitigate those limitations. Though our primary focus lies on\nvisual similarity, the methodologies we present have broader applications for\ndiscovering and evaluating perceptual similarity across various domains.\n","authors":["Oren Barkan","Tal Reiss","Jonathan Weill","Ori Katz","Roy Hirsch","Itzik Malkiel","Noam Koenigstein"],"pdf_url":"https://arxiv.org/pdf/2308.14753v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14742v1","updated":"2023-08-28T17:43:04Z","published":"2023-08-28T17:43:04Z","title":"Minimizing Quasi-Self-Concordant Functions by Gradient Regularization of\n Newton Method","summary":" We study the composite convex optimization problems with a\nQuasi-Self-Concordant smooth component. This problem class naturally\ninterpolates between classic Self-Concordant functions and functions with\nLipschitz continuous Hessian. Previously, the best complexity bounds for this\nproblem class were associated with trust-region schemes and implementations of\na ball-minimization oracle. In this paper, we show that for minimizing\nQuasi-Self-Concordant functions we can use instead the basic Newton Method with\nGradient Regularization. For unconstrained minimization, it only involves a\nsimple matrix inversion operation (solving a linear system) at each step. We\nprove a fast global linear rate for this algorithm, matching the complexity\nbound of the trust-region scheme, while our method remains especially simple to\nimplement. Then, we introduce the Dual Newton Method, and based on it, develop\nthe corresponding Accelerated Newton Scheme for this problem class, which\nfurther improves the complexity factor of the basic method. As a direct\nconsequence of our results, we establish fast global linear rates of simple\nvariants of the Newton Method applied to several practical problems, including\nLogistic Regression, Soft Maximum, and Matrix Scaling, without requiring\nadditional assumptions on strong or uniform convexity for the target objective.\n","authors":["Nikita Doikov"],"pdf_url":"https://arxiv.org/pdf/2308.14742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14740v1","updated":"2023-08-28T17:41:14Z","published":"2023-08-28T17:41:14Z","title":"Total Selfie: Generating Full-Body Selfies","summary":" We present a method to generate full-body selfies -- photos that you take of\nyourself, but capturing your whole body as if someone else took the photo of\nyou from a few feet away. Our approach takes as input a pre-captured video of\nyour body, a target pose photo, and a selfie + background pair for each\nlocation. We introduce a novel diffusion-based approach to combine all of this\ninformation into high quality, well-composed photos of you with the desired\npose and background.\n","authors":["Bowei Chen","Brian Curless","Ira Kemelmacher-Shlizerman","Steve Seitz"],"pdf_url":"https://arxiv.org/pdf/2308.14740v1.pdf","comment":"Project page:\n https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/"},{"id":"http://arxiv.org/abs/2306.12926v2","updated":"2023-08-28T17:33:56Z","published":"2023-06-22T14:38:12Z","title":"Decentralized Multi-Agent Reinforcement Learning with Global State\n Prediction","summary":" Deep reinforcement learning (DRL) has seen remarkable success in the control\nof single robots. However, applying DRL to robot swarms presents significant\nchallenges. A critical challenge is non-stationarity, which occurs when two or\nmore robots update individual or shared policies concurrently, thereby engaging\nin an interdependent training process with no guarantees of convergence.\nCircumventing non-stationarity typically involves training the robots with\nglobal information about other agents' states and/or actions. In contrast, in\nthis paper we explore how to remove the need for global information. We pose\nour problem as a Partially Observable Markov Decision Process, due to the\nabsence of global knowledge on other agents. Using collective transport as a\ntestbed scenario, we study two approaches to multi-agent training. In the\nfirst, the robots exchange no messages, and are trained to rely on implicit\ncommunication through push-and-pull on the object to transport. In the second\napproach, we introduce Global State Prediction (GSP), a network trained to\nforma a belief over the swarm as a whole and predict its future states. We\nprovide a comprehensive study over four well-known deep reinforcement learning\nalgorithms in environments with obstacles, measuring performance as the\nsuccessful transport of the object to the goal within a desired time-frame.\nThrough an ablation study, we show that including GSP boosts performance and\nincreases robustness when compared with methods that use global knowledge.\n","authors":["Joshua Bloom","Pranjal Paliwal","Apratim Mukherjee","Carlo Pinciroli"],"pdf_url":"https://arxiv.org/pdf/2306.12926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07143v3","updated":"2023-08-28T17:23:46Z","published":"2023-01-17T19:15:06Z","title":"Revisiting mass-radius relationships for exoplanet populations: a\n machine learning insight","summary":" The growing number of exoplanet discoveries and advances in machine learning\ntechniques have opened new avenues for exploring and understanding the\ncharacteristics of worlds beyond our Solar System. In this study, we employ\nefficient machine learning approaches to analyze a dataset comprising 762\nconfirmed exoplanets and eight Solar System planets, aiming to characterize\ntheir fundamental quantities. By applying different unsupervised clustering\nalgorithms, we classify the data into two main classes: 'small' and 'giant'\nplanets, with cut-off values at $R_{p}=8.13R_{\\oplus}$ and\n$M_{p}=52.48M_{\\oplus}$. This classification reveals an intriguing distinction:\ngiant planets have lower densities, suggesting higher H-He mass fractions,\nwhile small planets are denser, composed mainly of heavier elements. We apply\nvarious regression models to uncover correlations between physical parameters\nand their predictive power for exoplanet radius. Our analysis highlights that\nplanetary mass, orbital period, and stellar mass play crucial roles in\npredicting exoplanet radius. Among the models evaluated, the Support Vector\nRegression consistently outperforms others, demonstrating its promise for\nobtaining accurate planetary radius estimates. Furthermore, we derive\nparametric equations using the M5P and Markov Chain Monte Carlo methods.\nNotably, our study reveals a noteworthy result: small planets exhibit a\npositive linear mass-radius relation, aligning with previous findings.\nConversely, for giant planets, we observe a strong correlation between\nplanetary radius and the mass of their host stars, which might provide\nintriguing insights into the relationship between giant planet formation and\nstellar characteristics.\n","authors":["Mahdiyar Mousavi-Sadr","Davood M. Jassur","Ghassem Gozaliasl"],"pdf_url":"https://arxiv.org/pdf/2301.07143v3.pdf","comment":"Accepted for publication in MNRAS. 17 pages, 18 figures"},{"id":"http://arxiv.org/abs/2308.14711v1","updated":"2023-08-28T17:11:41Z","published":"2023-08-28T17:11:41Z","title":"Fast Feedforward Networks","summary":" We break the linear link between the layer size and its inference cost by\nintroducing the fast feedforward (FFF) architecture, a logarithmic-time\nalternative to feedforward networks.\n We show that FFFs give comparable performance to feedforward networks at an\nexponential fraction of their inference cost, are quicker to deliver\nperformance compared to mixture-of-expert networks, and can readily take the\nplace of either in transformers.\n Pushing FFFs to the absolute limit, we train a vision transformer to perform\nsingle-neuron inferences at the cost of only 5.8% performance decrease against\nthe full-width variant.\n Our implementation is available as a Python package; just use \"pip install\nfastfeedforward\".\n","authors":["Peter Belcak","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2308.14711v1.pdf","comment":"12 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.14710v1","updated":"2023-08-28T17:10:12Z","published":"2023-08-28T17:10:12Z","title":"VideoCutLER: Surprisingly Simple Unsupervised Video Instance\n Segmentation","summary":" Existing approaches to unsupervised video instance segmentation typically\nrely on motion estimates and experience difficulties tracking small or\ndivergent motions. We present VideoCutLER, a simple method for unsupervised\nmulti-instance video segmentation without using motion-based learning signals\nlike optical flow or training on natural videos. Our key insight is that using\nhigh-quality pseudo masks and a simple video synthesis method for model\ntraining is surprisingly sufficient to enable the resulting video model to\neffectively segment and track multiple instances across video frames. We show\nthe first competitive unsupervised learning results on the challenging\nYouTubeVIS-2019 benchmark, achieving 50.7% APvideo^50 , surpassing the previous\nstate-of-the-art by a large margin. VideoCutLER can also serve as a strong\npretrained model for supervised video instance segmentation tasks, exceeding\nDINO by 15.9% on YouTubeVIS-2019 in terms of APvideo.\n","authors":["Xudong Wang","Ishan Misra","Ziyun Zeng","Rohit Girdhar","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2308.14710v1.pdf","comment":"Preprint. Code: https://github.com/facebookresearch/CutLER"},{"id":"http://arxiv.org/abs/2308.14705v1","updated":"2023-08-28T16:58:44Z","published":"2023-08-28T16:58:44Z","title":"Diversified Ensemble of Independent Sub-Networks for Robust\n Self-Supervised Representation Learning","summary":" Ensembling a neural network is a widely recognized approach to enhance model\nperformance, estimate uncertainty, and improve robustness in deep supervised\nlearning. However, deep ensembles often come with high computational costs and\nmemory demands. In addition, the efficiency of a deep ensemble is related to\ndiversity among the ensemble members which is challenging for large,\nover-parameterized deep neural networks. Moreover, ensemble learning has not\nyet seen such widespread adoption, and it remains a challenging endeavor for\nself-supervised or unsupervised representation learning. Motivated by these\nchallenges, we present a novel self-supervised training regime that leverages\nan ensemble of independent sub-networks, complemented by a new loss function\ndesigned to encourage diversity. Our method efficiently builds a sub-model\nensemble with high diversity, leading to well-calibrated estimates of model\nuncertainty, all achieved with minimal computational overhead compared to\ntraditional deep self-supervised ensembles. To evaluate the effectiveness of\nour approach, we conducted extensive experiments across various tasks,\nincluding in-distribution generalization, out-of-distribution detection,\ndataset corruption, and semi-supervised settings. The results demonstrate that\nour method significantly improves prediction reliability. Our approach not only\nachieves excellent accuracy but also enhances calibration, surpassing baseline\nperformance across a wide range of self-supervised architectures in computer\nvision, natural language processing, and genomics data.\n","authors":["Amirhossein Vahidi","Lisa Wimmer","Hüseyin Anil Gündüz","Bernd Bischl","Eyke Hüllermeier","Mina Rezaei"],"pdf_url":"https://arxiv.org/pdf/2308.14705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00915v3","updated":"2023-08-28T16:36:31Z","published":"2023-06-01T17:18:15Z","title":"The feasibility of artificial consciousness through the lens of\n neuroscience","summary":" Interactions with large language models have led to the suggestion that these\nmodels may soon be conscious. From the perspective of neuroscience, this\nposition is difficult to defend. For one, the inputs to large language models\nlack the embodied, embedded information content characteristic of our sensory\ncontact with the world around us. Secondly, the architecture of large language\nmodels is missing key features of the thalamocortical system that have been\nlinked to conscious awareness in mammals. Finally, the evolutionary and\ndevelopmental trajectories that led to the emergence of living conscious\norganisms arguably have no parallels in artificial systems as envisioned today.\nThe existence of living organisms depends on their actions, and their survival\nis intricately linked to multi-level cellular, inter-cellular, and organismal\nprocesses culminating in agency and consciousness.\n","authors":["Jaan Aru","Matthew Larkum","James M. Shine"],"pdf_url":"https://arxiv.org/pdf/2306.00915v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14693v1","updated":"2023-08-28T16:34:50Z","published":"2023-08-28T16:34:50Z","title":"Hybrid PLS-ML Authentication Scheme for V2I Communication Networks","summary":" Vehicular communication networks are rapidly emerging as vehicles become\nsmarter. However, these networks are increasingly susceptible to various\nattacks. The situation is exacerbated by the rise in automated vehicles\ncomplicates, emphasizing the need for security and authentication measures to\nensure safe and effective traffic management. In this paper, we propose a novel\nhybrid physical layer security (PLS)-machine learning (ML) authentication\nscheme by exploiting the position of the transmitter vehicle as a device\nfingerprint. We use a time-of-arrival (ToA) based localization mechanism where\nthe ToA is estimated at roadside units (RSUs), and the coordinates of the\ntransmitter vehicle are extracted at the base station (BS).Furthermore, to\ntrack the mobility of the moving legitimate vehicle, we use ML model trained on\nseveral system parameters. We try two ML models for this purpose, i.e., support\nvector regression and decision tree. To evaluate our scheme, we conduct binary\nhypothesis testing on the estimated positions with the help of the ground\ntruths provided by the ML model, which classifies the transmitter node as\nlegitimate or malicious. Moreover, we consider the probability of false alarm\nand the probability of missed detection as performance metrics resulting from\nthe binary hypothesis testing, and mean absolute error (MAE), mean square error\n(MSE), and coefficient of determination $\\text{R}^2$ to further evaluate the ML\nmodels. We also compare our scheme with a baseline scheme that exploits the\nangle of arrival at RSUs for authentication. We observe that our proposed\nposition-based mechanism outperforms the baseline scheme significantly in terms\nof missed detections.\n","authors":["Hala Amin","Jawaher Kaldari","Nora Mohamed","Waqas Aman","Saif Al-Kuwari"],"pdf_url":"https://arxiv.org/pdf/2308.14693v1.pdf","comment":"Accepted for Publication following Presentation at IEEE ISNCC-23"},{"id":"http://arxiv.org/abs/2308.14683v1","updated":"2023-08-28T16:18:50Z","published":"2023-08-28T16:18:50Z","title":"Fine-Tuning Llama 2 Large Language Models for Detecting Online Sexual\n Predatory Chats and Abusive Texts","summary":" Detecting online sexual predatory behaviours and abusive language on social\nmedia platforms has become a critical area of research due to the growing\nconcerns about online safety, especially for vulnerable populations such as\nchildren and adolescents. Researchers have been exploring various techniques\nand approaches to develop effective detection systems that can identify and\nmitigate these risks. Recent development of large language models (LLMs) has\nopened a new opportunity to address this problem more effectively. This paper\nproposes an approach to detection of online sexual predatory chats and abusive\nlanguage using the open-source pretrained Llama 2 7B-parameter model, recently\nreleased by Meta GenAI. We fine-tune the LLM using datasets with different\nsizes, imbalance degrees, and languages (i.e., English, Roman Urdu and Urdu).\nBased on the power of LLMs, our approach is generic and automated without a\nmanual search for a synergy between feature extraction and classifier design\nsteps like conventional methods in this domain. Experimental results show a\nstrong performance of the proposed approach, which performs proficiently and\nconsistently across three distinct datasets with five sets of experiments. This\nstudy's outcomes indicate that the proposed method can be implemented in\nreal-world applications (even with non-English languages) for flagging sexual\npredators, offensive or toxic content, hate speech, and discriminatory language\nin online discussions and comments to maintain respectful internet or digital\ncommunities. Furthermore, it can be employed for solving text classification\nproblems with other potential applications such as sentiment analysis, spam and\nphishing detection, sorting legal documents, fake news detection, language\nidentification, user intent recognition, text-based product categorization,\nmedical record analysis, and resume screening.\n","authors":["Thanh Thi Nguyen","Campbell Wilson","Janis Dalins"],"pdf_url":"https://arxiv.org/pdf/2308.14683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06243v2","updated":"2023-08-28T16:18:30Z","published":"2023-07-12T15:34:10Z","title":"Reconstructing Spatiotemporal Data with C-VAEs","summary":" The continuous representation of spatiotemporal data commonly relies on using\nabstract data types, such as \\textit{moving regions}, to represent entities\nwhose shape and position continuously change over time. Creating this\nrepresentation from discrete snapshots of real-world entities requires using\ninterpolation methods to compute in-between data representations and estimate\nthe position and shape of the object of interest at arbitrary temporal points.\nExisting region interpolation methods often fail to generate smooth and\nrealistic representations of a region's evolution. However, recent advancements\nin deep learning techniques have revealed the potential of deep models trained\non discrete observations to capture spatiotemporal dependencies through\nimplicit feature learning.\n In this work, we explore the capabilities of Conditional Variational\nAutoencoder (C-VAE) models to generate smooth and realistic representations of\nthe spatiotemporal evolution of moving regions. We evaluate our proposed\napproach on a sparsely annotated dataset on the burnt area of a forest fire. We\napply compression operations to sample from the dataset and use the C-VAE model\nand other commonly used interpolation algorithms to generate in-between region\nrepresentations. To evaluate the performance of the methods, we compare their\ninterpolation results with manually annotated data and regions generated by a\nU-Net model. We also assess the quality of generated data considering temporal\nconsistency metrics.\n The proposed C-VAE-based approach demonstrates competitive results in\ngeometric similarity metrics. It also exhibits superior temporal consistency,\nsuggesting that C-VAE models may be a viable alternative to modelling the\nspatiotemporal evolution of 2D moving regions.\n","authors":["Tiago F. R. Ribeiro","Fernando Silva","Rogério Luís de C. Costa"],"pdf_url":"https://arxiv.org/pdf/2307.06243v2.pdf","comment":"Update acknowledgments to include published article information"},{"id":"http://arxiv.org/abs/2211.11869v3","updated":"2023-08-28T16:14:57Z","published":"2022-11-21T21:42:50Z","title":"Examining Policy Entropy of Reinforcement Learning Agents for\n Personalization Tasks","summary":" This effort is focused on examining the behavior of reinforcement learning\nsystems in personalization environments and detailing the differences in policy\nentropy associated with the type of learning algorithm utilized. We demonstrate\nthat Policy Optimization agents often possess low-entropy policies during\ntraining, which in practice results in agents prioritizing certain actions and\navoiding others. Conversely, we also show that Q-Learning agents are far less\nsusceptible to such behavior and generally maintain high-entropy policies\nthroughout training, which is often preferable in real-world applications. We\nprovide a wide range of numerical experiments as well as theoretical\njustification to show that these differences in entropy are due to the type of\nlearning being employed.\n","authors":["Anton Dereventsov","Andrew Starnes","Clayton G. Webster"],"pdf_url":"https://arxiv.org/pdf/2211.11869v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10145v3","updated":"2023-08-28T16:13:03Z","published":"2023-08-20T03:12:10Z","title":"Wasserstein Geodesic Generator for Conditional Distributions","summary":" Generating samples given a specific label requires estimating conditional\ndistributions. We derive a tractable upper bound of the Wasserstein distance\nbetween conditional distributions to lay the theoretical groundwork to learn\nconditional distributions. Based on this result, we propose a novel conditional\ngeneration algorithm where conditional distributions are fully characterized by\na metric space defined by a statistical distance. We employ optimal transport\ntheory to propose the Wasserstein geodesic generator, a new conditional\ngenerator that learns the Wasserstein geodesic. The proposed method learns both\nconditional distributions for observed domains and optimal transport maps\nbetween them. The conditional distributions given unobserved intermediate\ndomains are on the Wasserstein geodesic between conditional distributions given\ntwo observed domain labels. Experiments on face images with light conditions as\ndomain labels demonstrate the efficacy of the proposed method.\n","authors":["Young-geun Kim","Kyungbok Lee","Youngwon Choi","Joong-Ho Won","Myunghee Cho Paik"],"pdf_url":"https://arxiv.org/pdf/2308.10145v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18160v2","updated":"2023-08-28T15:59:26Z","published":"2023-05-29T15:41:12Z","title":"Counterpart Fairness -- Addressing Systematic between-group Differences\n in Fairness Evaluation","summary":" When using machine learning (ML) to aid decision-making, it is critical to\nensure that an algorithmic decision is fair, i.e., it does not discriminate\nagainst specific individuals/groups, particularly those from underprivileged\npopulations. Existing group fairness methods require equal group-wise measures,\nwhich however fails to consider systematic between-group differences. The\nconfounding factors, which are non-sensitive variables but manifest systematic\ndifferences, can significantly affect fairness evaluation. To tackle this\nproblem, we believe that a fairness measurement should be based on the\ncomparison between counterparts (i.e., individuals who are similar to each\nother with respect to the task of interest) from different groups, whose group\nidentities cannot be distinguished algorithmically by exploring confounding\nfactors. We have developed a propensity-score-based method for identifying\ncounterparts, which prevents fairness evaluation from comparing \"oranges\" with\n\"apples\". In addition, we propose a counterpart-based statistical fairness\nindex, termed Counterpart-Fairness (CFair), to assess fairness of ML models.\nVarious empirical studies were conducted to validate the effectiveness of\nCFair. We publish our code at \\url{https://github.com/zhengyjo/CFair}.\n","authors":["Yifei Wang","Zhengyang Zhou","Liqin Wang","John Laurentiev","Peter Hou","Li Zhou","Pengyu Hong"],"pdf_url":"https://arxiv.org/pdf/2305.18160v2.pdf","comment":"25 pages, 6 figures, 16 tables"},{"id":"http://arxiv.org/abs/2305.02527v2","updated":"2023-08-28T15:52:36Z","published":"2023-05-04T03:31:30Z","title":"Reinforcement Learning with Delayed, Composite, and Partially Anonymous\n Reward","summary":" We investigate an infinite-horizon average reward Markov Decision Process\n(MDP) with delayed, composite, and partially anonymous reward feedback. The\ndelay and compositeness of rewards mean that rewards generated as a result of\ntaking an action at a given state are fragmented into different components, and\nthey are sequentially realized at delayed time instances. The partial anonymity\nattribute implies that a learner, for each state, only observes the aggregate\nof past reward components generated as a result of different actions taken at\nthat state, but realized at the observation instance. We propose an algorithm\nnamed $\\mathrm{DUCRL2}$ to obtain a near-optimal policy for this setting and\nshow that it achieves a regret bound of $\\tilde{\\mathcal{O}}\\left(DS\\sqrt{AT} +\nd (SA)^3\\right)$ where $S$ and $A$ are the sizes of the state and action\nspaces, respectively, $D$ is the diameter of the MDP, $d$ is a parameter upper\nbounded by the maximum reward delay, and $T$ denotes the time horizon. This\ndemonstrates the optimality of the bound in the order of $T$, and an additive\nimpact of the delay.\n","authors":["Washim Uddin Mondal","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2305.02527v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14659v1","updated":"2023-08-28T15:41:30Z","published":"2023-08-28T15:41:30Z","title":"RESTORE: Graph Embedding Assessment Through Reconstruction","summary":" Following the success of Word2Vec embeddings, graph embeddings (GEs) have\ngained substantial traction. GEs are commonly generated and evaluated\nextrinsically on downstream applications, but intrinsic evaluations of the\noriginal graph properties in terms of topological structure and semantic\ninformation have been lacking. Understanding these will help identify the\ndeficiency of the various families of GE methods when vectorizing graphs in\nterms of preserving the relevant knowledge or learning incorrect knowledge. To\naddress this, we propose RESTORE, a framework for intrinsic GEs assessment\nthrough graph reconstruction. We show that reconstructing the original graph\nfrom the underlying GEs yields insights into the relative amount of information\npreserved in a given vector form. We first introduce the graph reconstruction\ntask. We generate GEs from three GE families based on factorization methods,\nrandom walks, and deep learning (with representative algorithms from each\nfamily) on the CommonSense Knowledge Graph (CSKG). We analyze their\neffectiveness in preserving the (a) topological structure of node-level graph\nreconstruction with an increasing number of hops and (b) semantic information\non various word semantic and analogy tests. Our evaluations show deep\nlearning-based GE algorithm (SDNE) is overall better at preserving (a) with a\nmean average precision (mAP) of 0.54 and 0.35 for 2 and 3-hop reconstruction\nrespectively, while the factorization-based algorithm (HOPE) is better at\nencapsulating (b) with an average Euclidean distance of 0.14, 0.17, and 0.11\nfor 1, 2, and 3-hop reconstruction respectively. The modest performance of\nthese GEs leaves room for further research avenues on better graph\nrepresentation learning.\n","authors":["Hong Yung Yip","Chidaksh Ravuru","Neelabha Banerjee","Shashwat Jha","Amit Sheth","Aman Chadha","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2308.14659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14658v1","updated":"2023-08-28T15:40:50Z","published":"2023-08-28T15:40:50Z","title":"Adversarial Predictions of Data Distributions Across Federated\n Internet-of-Things Devices","summary":" Federated learning (FL) is increasingly becoming the default approach for\ntraining machine learning models across decentralized Internet-of-Things (IoT)\ndevices. A key advantage of FL is that no raw data are communicated across the\nnetwork, providing an immediate layer of privacy. Despite this, recent works\nhave demonstrated that data reconstruction can be done with the locally trained\nmodel updates which are communicated across the network. However, many of these\nworks have limitations with regard to how the gradients are computed in\nbackpropagation. In this work, we demonstrate that the model weights shared in\nFL can expose revealing information about the local data distributions of IoT\ndevices. This leakage could expose sensitive information to malicious actors in\na distributed system. We further discuss results which show that injecting\nnoise into model weights is ineffective at preventing data leakage without\nseriously harming the global model accuracy.\n","authors":["Samir Rajani","Dario Dematties","Nathaniel Hudson","Kyle Chard","Nicola Ferrier","Rajesh Sankaran","Peter Beckman"],"pdf_url":"https://arxiv.org/pdf/2308.14658v1.pdf","comment":"6 pages, 6 figures, accepted for publication through 2023 IEEE World\n Forum on Internet of Things"},{"id":"http://arxiv.org/abs/2308.08086v2","updated":"2023-08-28T15:40:02Z","published":"2023-08-16T01:30:13Z","title":"Safety Filter Design for Neural Network Systems via Convex Optimization","summary":" With the increase in data availability, it has been widely demonstrated that\nneural networks (NN) can capture complex system dynamics precisely in a\ndata-driven manner. However, the architectural complexity and nonlinearity of\nthe NNs make it challenging to synthesize a provably safe controller. In this\nwork, we propose a novel safety filter that relies on convex optimization to\nensure safety for a NN system, subject to additive disturbances that are\ncapable of capturing modeling errors. Our approach leverages tools from NN\nverification to over-approximate NN dynamics with a set of linear bounds,\nfollowed by an application of robust linear MPC to search for controllers that\ncan guarantee robust constraint satisfaction. We demonstrate the efficacy of\nthe proposed framework numerically on a nonlinear pendulum system.\n","authors":["Shaoru Chen","Kong Yao Chee","Nikolai Matni","M. Ani Hsieh","George J. Pappas"],"pdf_url":"https://arxiv.org/pdf/2308.08086v2.pdf","comment":"This paper has been accepted to the 2023 62nd IEEE Conference on\n Decision and Control (CDC)"},{"id":"http://arxiv.org/abs/2308.01674v2","updated":"2023-08-28T15:38:47Z","published":"2023-08-03T10:21:53Z","title":"End-to-End Reinforcement Learning of Koopman Models for Economic\n Nonlinear Model Predictive Control","summary":" (Economic) nonlinear model predictive control ((e)NMPC) requires dynamic\nsystem models that are sufficiently accurate in all relevant state-space\nregions. These models must also be computationally cheap enough to ensure\nreal-time tractability. Data-driven surrogate models for mechanistic models can\nbe used to reduce the computational burden of (e)NMPC; however, such models are\ntypically trained by system identification for maximum average prediction\naccuracy on simulation samples and perform suboptimally as part of actual\n(e)NMPC. We present a method for end-to-end reinforcement learning of dynamic\nsurrogate models for optimal performance in (e)NMPC applications, resulting in\npredictive controllers that strike a favorable balance between control\nperformance and computational demand. We validate our method on two\napplications derived from an established nonlinear continuous stirred-tank\nreactor model. We compare the controller performance to that of MPCs utilizing\nmodels trained by the prevailing maximum prediction accuracy paradigm, and\nmodel-free neural network controllers trained using reinforcement learning. We\nshow that our method matches the performance of the model-free neural network\ncontrollers while consistently outperforming models derived from system\nidentification. Additionally, we show that the MPC policies can react to\nchanges in the control setting without retraining.\n","authors":["Daniel Mayfrank","Alexander Mitsos","Manuel Dahmen"],"pdf_url":"https://arxiv.org/pdf/2308.01674v2.pdf","comment":"manuscript (18 pages, 7 figures, 5 tables), supplementary materials\n (3 pages, 2 tables)"},{"id":"http://arxiv.org/abs/2306.11167v2","updated":"2023-08-28T15:34:27Z","published":"2023-06-19T21:14:57Z","title":"Large Language Models are Fixated by Red Herrings: Exploring Creative\n Problem Solving and Einstellung Effect using the Only Connect Wall Dataset","summary":" The quest for human imitative AI has been an enduring topic in AI research\nsince its inception. The technical evolution and emerging capabilities of the\nlatest cohort of large language models (LLMs) have reinvigorated the subject\nbeyond academia to the cultural zeitgeist. While recent NLP evaluation\nbenchmark tasks test some aspects of human-imitative behaviour (e.g.,\nBIG-bench's 'human-like behavior' tasks), few, if not none, examine creative\nproblem solving abilities. Creative problem solving in humans is a well-studied\ntopic in cognitive neuroscience with standardized tests that predominantly use\nthe ability to associate (heterogeneous) connections among clue words as a\nmetric for creativity. Exposure to misleading stimuli - distractors dubbed red\nherrings - impede human performance in such tasks via the fixation effect and\nEinstellung paradigm. In cognitive neuroscience studies, such fixations are\nexperimentally induced by pre-exposing participants to orthographically similar\nincorrect words to subsequent word-fragments or clues. The popular British quiz\nshow Only Connect's Connecting Wall segment essentially mimics Mednick's Remote\nAssociates Test (RAT) formulation with built-in, deliberate red herrings, which\nmakes it an ideal proxy dataset to explore and study fixation effect and\nEinstellung paradigm from cognitive neuroscience in LLMs. In this paper we\npresent the novel Only Connect Wall (OCW) dataset and report results from our\nevaluation of selected pre-trained language models and LLMs on creative problem\nsolving tasks like grouping clue words by heterogeneous connections, and\nidentifying correct open knowledge domain connections in respective groups. We\nsynthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to\nfurther analyze our red-herrings hypothesis in language models. The code and\nlink to the dataset are available at https://github.com/TaatiTeam/OCW.\n","authors":["Saeid Naeini","Raeid Saqur","Mozhgan Saeidi","John Giorgi","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2306.11167v2.pdf","comment":"V2: with added OCW-Randomized and OCW-WordNet results in Section 4.3\n (added). 22 pages with Appendix"},{"id":"http://arxiv.org/abs/2308.14650v1","updated":"2023-08-28T15:22:15Z","published":"2023-08-28T15:22:15Z","title":"Comparison of automated crater catalogs for Mars from Benedix et al.\n (2020) and Lee and Hogan (2021)","summary":" Crater mapping using neural networks and other automated methods has\nincreased recently with automated Crater Detection Algorithms (CDAs) applied to\nplanetary bodies throughout the solar system. A recent publication by Benedix\net al. (2020) showed high performance at small scales compared to similar\nautomated CDAs but with a net positive diameter bias in many crater candidates.\nI compare the publicly available catalogs from Benedix et al. (2020) and Lee &\nHogan (2021) and show that the reported performance is sensitive to the metrics\nused to test the catalogs. I show how the more permissive comparison methods\nindicate a higher CDA performance by allowing worse candidate craters to match\nground-truth craters. I show that the Benedix et al. (2020) catalog has a\nsubstantial performance loss with increasing latitude and identify an image\nprojection issue that might cause this loss. Finally, I suggest future\napplications of neural networks in generating large scientific datasets be\nvalidated using secondary networks with independent data sources or training\nmethods.\n","authors":["Christopher Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14650v1.pdf","comment":"14 pages, 6 figures. Accepted August 13th 2023"},{"id":"http://arxiv.org/abs/2308.14647v1","updated":"2023-08-28T15:19:18Z","published":"2023-08-28T15:19:18Z","title":"Edge Generation Scheduling for DAG Tasks using Deep Reinforcement\n Learning","summary":" Directed acyclic graph (DAG) tasks are currently adopted in the real-time\ndomain to model complex applications from the automotive, avionics, and\nindustrial domain that implement their functionalities through chains of\nintercommunicating tasks. This paper studies the problem of scheduling\nreal-time DAG tasks by presenting a novel schedulability test based on the\nconcept of trivial schedulability. Using this schedulability test, we propose a\nnew DAG scheduling framework (edge generation scheduling -- EGS) that attempts\nto minimize the DAG width by iteratively generating edges while guaranteeing\nthe deadline constraint. We study how to efficiently solve the problem of\ngenerating edges by developing a deep reinforcement learning algorithm combined\nwith a graph representation neural network to learn an efficient edge\ngeneration policy for EGS. We evaluate the effectiveness of the proposed\nalgorithm by comparing it with state-of-the-art DAG scheduling heuristics and\nan optimal mixed-integer linear programming baseline. Experimental results show\nthat the proposed algorithm outperforms the state-of-the-art by requiring fewer\nprocessors to schedule the same DAG tasks.\n","authors":["Binqi Sun","Mirco Theile","Ziyuan Qin","Daniele Bernardini","Debayan Roy","Andrea Bastoni","Marco Caccamo"],"pdf_url":"https://arxiv.org/pdf/2308.14647v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.14644v1","updated":"2023-08-28T15:16:35Z","published":"2023-08-28T15:16:35Z","title":"Human Comfortability Index Estimation in Industrial Human-Robot\n Collaboration Task","summary":" Fluent human-robot collaboration requires a robot teammate to understand,\nlearn, and adapt to the human's psycho-physiological state. Such collaborations\nrequire a computing system that monitors human physiological signals during\nhuman-robot collaboration (HRC) to quantitatively estimate a human's level of\ncomfort, which we have termed in this research as comfortability index (CI) and\nuncomfortability index (unCI). Subjective metrics (surprise, anxiety, boredom,\ncalmness, and comfortability) and physiological signals were collected during a\nhuman-robot collaboration experiment that varied robot behavior. The emotion\ncircumplex model is adapted to calculate the CI from the participant's\nquantitative data as well as physiological data. To estimate CI/unCI from\nphysiological signals, time features were extracted from electrocardiogram\n(ECG), galvanic skin response (GSR), and pupillometry signals. In this\nresearch, we successfully adapt the circumplex model to find the location\n(axis) of 'comfortability' and 'uncomfortability' on the circumplex model, and\nits location match with the closest emotions on the circumplex model. Finally,\nthe study showed that the proposed approach can estimate human\ncomfortability/uncomfortability from physiological signals.\n","authors":["Celal Savur","Jamison Heard","Ferat Sahin"],"pdf_url":"https://arxiv.org/pdf/2308.14644v1.pdf","comment":"Submitted to IEEE-THMS"},{"id":"http://arxiv.org/abs/2308.14642v1","updated":"2023-08-28T15:16:09Z","published":"2023-08-28T15:16:09Z","title":"Rate-Optimal Policy Optimization for Linear Markov Decision Processes","summary":" We study regret minimization in online episodic linear Markov Decision\nProcesses, and obtain rate-optimal $\\widetilde O (\\sqrt K)$ regret where $K$\ndenotes the number of episodes. Our work is the first to establish the optimal\n(w.r.t.~$K$) rate of convergence in the stochastic setting with bandit feedback\nusing a policy optimization based approach, and the first to establish the\noptimal (w.r.t.~$K$) rate in the adversarial setup with full information\nfeedback, for which no algorithm with an optimal rate guarantee is currently\nknown.\n","authors":["Uri Sherman","Alon Cohen","Tomer Koren","Yishay Mansour"],"pdf_url":"https://arxiv.org/pdf/2308.14642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14634v1","updated":"2023-08-28T15:04:16Z","published":"2023-08-28T15:04:16Z","title":"Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance","summary":" We propose the use of conversational GPT models for easy and quick few-shot\ntext classification in the financial domain using the Banking77 dataset. Our\napproach involves in-context learning with GPT-3.5 and GPT-4, which minimizes\nthe technical expertise required and eliminates the need for expensive GPU\ncomputing while yielding quick and accurate results. Additionally, we fine-tune\nother pre-trained, masked language models with SetFit, a recent contrastive\nlearning technique, to achieve state-of-the-art results both in full-data and\nfew-shot settings. Our findings show that querying GPT-3.5 and GPT-4 can\noutperform fine-tuned, non-generative models even with fewer examples. However,\nsubscription fees associated with these solutions may be considered costly for\nsmall organizations. Lastly, we find that generative models perform better on\nthe given task when shown representative samples selected by a human expert\nrather than when shown random ones. We conclude that a) our proposed methods\noffer a practical solution for few-shot tasks in datasets with limited label\navailability, and b) our state-of-the-art results can inspire future work in\nthe area.\n","authors":["Lefteris Loukas","Ilias Stogiannidis","Prodromos Malakasiotis","Stavros Vassos"],"pdf_url":"https://arxiv.org/pdf/2308.14634v1.pdf","comment":"Early pre-print; Accepted at the 5th FinNLP workshop @ IJCAI-2023"},{"id":"http://arxiv.org/abs/2308.14632v1","updated":"2023-08-28T14:57:29Z","published":"2023-08-28T14:57:29Z","title":"Comparing AutoML and Deep Learning Methods for Condition Monitoring\n using Realistic Validation Scenarios","summary":" This study extensively compares conventional machine learning methods and\ndeep learning for condition monitoring tasks using an AutoML toolbox. The\nexperiments reveal consistent high accuracy in random K-fold cross-validation\nscenarios across all tested models. However, when employing leave-one-group-out\n(LOGO) cross-validation on the same datasets, no clear winner emerges,\nindicating the presence of domain shift in real-world scenarios. Additionally,\nthe study assesses the scalability and interpretability of conventional methods\nand neural networks. Conventional methods offer explainability with their\nmodular structure aiding feature identification. In contrast, neural networks\nrequire specialized interpretation techniques like occlusion maps to visualize\nimportant regions in the input data. Finally, the paper highlights the\nsignificance of feature selection, particularly in condition monitoring tasks\nwith limited class variations. Low-complexity models prove sufficient for such\ntasks, as only a few features from the input signal are typically needed. In\nsummary, these findings offer crucial insights into the strengths and\nlimitations of various approaches, providing valuable benchmarks and\nidentifying the most suitable methods for condition monitoring applications,\nthereby enhancing their applicability in real-world scenarios.\n","authors":["Payman Goodarzi","Andreas Schütze","Tizian Schneider"],"pdf_url":"https://arxiv.org/pdf/2308.14632v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2308.14626v1","updated":"2023-08-28T14:48:49Z","published":"2023-08-28T14:48:49Z","title":"VesselShot: Few-shot learning for cerebral blood vessel segmentation","summary":" Angiography is widely used to detect, diagnose, and treat cerebrovascular\ndiseases. While numerous techniques have been proposed to segment the vascular\nnetwork from different imaging modalities, deep learning (DL) has emerged as a\npromising approach. However, existing DL methods often depend on proprietary\ndatasets and extensive manual annotation. Moreover, the availability of\npre-trained networks specifically for medical domains and 3D volumes is\nlimited. To overcome these challenges, we propose a few-shot learning approach\ncalled VesselShot for cerebrovascular segmentation. VesselShot leverages\nknowledge from a few annotated support images and mitigates the scarcity of\nlabeled data and the need for extensive annotation in cerebral blood vessel\nsegmentation. We evaluated the performance of VesselShot using the publicly\navailable TubeTK dataset for the segmentation task, achieving a mean Dice\ncoefficient (DC) of 0.62(0.03).\n","authors":["Mumu Aktar","Hassan Rivaz","Marta Kersten-Oertel","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.14626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02054v2","updated":"2023-08-28T14:38:33Z","published":"2023-05-03T11:39:31Z","title":"Map-based Experience Replay: A Memory-Efficient Solution to Catastrophic\n Forgetting in Reinforcement Learning","summary":" Deep Reinforcement Learning agents often suffer from catastrophic forgetting,\nforgetting previously found solutions in parts of the input space when training\non new data. Replay Memories are a common solution to the problem,\ndecorrelating and shuffling old and new training samples. They naively store\nstate transitions as they come in, without regard for redundancy. We introduce\na novel cognitive-inspired replay memory approach based on the\nGrow-When-Required (GWR) self-organizing network, which resembles a map-based\nmental model of the world. Our approach organizes stored transitions into a\nconcise environment-model-like network of state-nodes and transition-edges,\nmerging similar samples to reduce the memory size and increase pair-wise\ndistance among samples, which increases the relevancy of each sample. Overall,\nour paper shows that map-based experience replay allows for significant memory\nreduction with only small performance decreases.\n","authors":["Muhammad Burhan Hafez","Tilman Immisch","Tom Weber","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2305.02054v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.05151v3","updated":"2023-08-28T14:38:11Z","published":"2022-11-09T19:02:40Z","title":"QuadConv: Quadrature-Based Convolutions with Applications to Non-Uniform\n PDE Data Compression","summary":" We present a new convolution layer for deep learning architectures which we\ncall QuadConv -- an approximation to continuous convolution via quadrature. Our\noperator is developed explicitly for use on non-uniform, mesh-based data, and\naccomplishes this by learning a continuous kernel that can be sampled at\narbitrary locations. Moreover, the construction of our operator admits an\nefficient implementation which we detail and construct. As an experimental\nvalidation of our operator, we consider the task of compressing partial\ndifferential equation (PDE) simulation data from fixed meshes. We show that\nQuadConv can match the performance of standard discrete convolutions on uniform\ngrid data by comparing a QuadConv autoencoder (QCAE) to a standard\nconvolutional autoencoder (CAE). Further, we show that the QCAE can maintain\nthis accuracy even on non-uniform data. In both cases, QuadConv also\noutperforms alternative unstructured convolution methods such as graph\nconvolution.\n","authors":["Kevin Doherty","Cooper Simpson","Stephen Becker","Alireza Doostan"],"pdf_url":"https://arxiv.org/pdf/2211.05151v3.pdf","comment":"26 pages, 18 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.19442v4","updated":"2023-08-28T14:29:19Z","published":"2023-05-30T22:30:30Z","title":"SimFBO: Towards Simple, Flexible and Communication-efficient Federated\n Bilevel Learning","summary":" Federated bilevel optimization (FBO) has shown great potential recently in\nmachine learning and edge computing due to the emerging nested optimization\nstructure in meta-learning, fine-tuning, hyperparameter tuning, etc. However,\nexisting FBO algorithms often involve complicated computations and require\nmultiple sub-loops per iteration, each of which contains a number of\ncommunication rounds. In this paper, we propose a simple and flexible FBO\nframework named SimFBO, which is easy to implement without sub-loops, and\nincludes a generalized server-side aggregation and update for improving\ncommunication efficiency. We further propose System-level heterogeneity robust\nFBO (ShroFBO) as a variant of SimFBO with stronger resilience to heterogeneous\nlocal computation. We show that SimFBO and ShroFBO provably achieve a linear\nconvergence speedup with partial client participation and client sampling\nwithout replacement, as well as improved sample and communication complexities.\nExperiments demonstrate the effectiveness of the proposed methods over existing\nFBO algorithms.\n","authors":["Yifan Yang","Peiyao Xiao","Kaiyi Ji"],"pdf_url":"https://arxiv.org/pdf/2305.19442v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14608v1","updated":"2023-08-28T14:23:04Z","published":"2023-08-28T14:23:04Z","title":"AI in the Gray: Exploring Moderation Policies in Dialogic Large Language\n Models vs. Human Answers in Controversial Topics","summary":" The introduction of ChatGPT and the subsequent improvement of Large Language\nModels (LLMs) have prompted more and more individuals to turn to the use of\nChatBots, both for information and assistance with decision-making. However,\nthe information the user is after is often not formulated by these ChatBots\nobjectively enough to be provided with a definite, globally accepted answer.\n Controversial topics, such as \"religion\", \"gender identity\", \"freedom of\nspeech\", and \"equality\", among others, can be a source of conflict as partisan\nor biased answers can reinforce preconceived notions or promote disinformation.\nBy exposing ChatGPT to such debatable questions, we aim to understand its level\nof awareness and if existing models are subject to socio-political and/or\neconomic biases. We also aim to explore how AI-generated answers compare to\nhuman ones. For exploring this, we use a dataset of a social media platform\ncreated for the purpose of debating human-generated claims on polemic subjects\namong users, dubbed Kialo.\n Our results show that while previous versions of ChatGPT have had important\nissues with controversial topics, more recent versions of ChatGPT\n(gpt-3.5-turbo) are no longer manifesting significant explicit biases in\nseveral knowledge areas. In particular, it is well-moderated regarding economic\naspects. However, it still maintains degrees of implicit libertarian leaning\ntoward right-winged ideals which suggest the need for increased moderation from\nthe socio-political point of view. In terms of domain knowledge on\ncontroversial topics, with the exception of the \"Philosophical\" category,\nChatGPT is performing well in keeping up with the collective human level of\nknowledge. Finally, we see that sources of Bing AI have slightly more tendency\nto the center when compared to human answers. All the analyses we make are\ngeneralizable to other types of biases and domains.\n","authors":["Vahid Ghafouri","Vibhor Agarwal","Yong Zhang","Nishanth Sastry","Jose Such","Guillermo Suarez-Tangil"],"pdf_url":"https://arxiv.org/pdf/2308.14608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14606v1","updated":"2023-08-28T14:20:53Z","published":"2023-08-28T14:20:53Z","title":"On the Tradeoff between Privacy Preservation and Byzantine-Robustness in\n Decentralized Learning","summary":" This paper jointly considers privacy preservation and Byzantine-robustness in\ndecentralized learning. In a decentralized network, honest-but-curious agents\nfaithfully follow the prescribed algorithm, but expect to infer their\nneighbors' private data from messages received during the learning process,\nwhile dishonest-and-Byzantine agents disobey the prescribed algorithm, and\ndeliberately disseminate wrong messages to their neighbors so as to bias the\nlearning process. For this novel setting, we investigate a generic\nprivacy-preserving and Byzantine-robust decentralized stochastic gradient\ndescent (SGD) framework, in which Gaussian noise is injected to preserve\nprivacy and robust aggregation rules are adopted to counteract Byzantine\nattacks. We analyze its learning error and privacy guarantee, discovering an\nessential tradeoff between privacy preservation and Byzantine-robustness in\ndecentralized learning -- the learning error caused by defending against\nByzantine attacks is exacerbated by the Gaussian noise added to preserve\nprivacy. Numerical experiments are conducted and corroborate our theoretical\nfindings.\n","authors":["Haoxiang Ye","Heng Zhu","Qing Ling"],"pdf_url":"https://arxiv.org/pdf/2308.14606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14602v1","updated":"2023-08-28T14:12:52Z","published":"2023-08-28T14:12:52Z","title":"Recent Progress in Energy Management of Connected Hybrid Electric\n Vehicles Using Reinforcement Learning","summary":" The growing adoption of hybrid electric vehicles (HEVs) presents a\ntransformative opportunity for revolutionizing transportation energy systems.\nThe shift towards electrifying transportation aims to curb environmental\nconcerns related to fossil fuel consumption. This necessitates efficient energy\nmanagement systems (EMS) to optimize energy efficiency. The evolution of EMS\nfrom HEVs to connected hybrid electric vehicles (CHEVs) represent a pivotal\nshift. For HEVs, EMS now confronts the intricate energy cooperation\nrequirements of CHEVs, necessitating advanced algorithms for route\noptimization, charging coordination, and load distribution. Challenges persist\nin both domains, including optimal energy utilization for HEVs, and cooperative\neco-driving control (CED) for CHEVs across diverse vehicle types. Reinforcement\nlearning (RL) stands out as a promising tool for addressing these challenges at\nhand. Specifically, within the realm of CHEVs, the application of multi-agent\nreinforcement learning (MARL) emerges as a powerful approach for effectively\ntackling the intricacies of CED control. Despite extensive research, few\nreviews span from individual vehicles to multi-vehicle scenarios. This review\nbridges the gap, highlighting challenges, advancements, and potential\ncontributions of RL-based solutions for future sustainable transportation\nsystems.\n","authors":["Min Hua","Bin Shuai","Quan Zhou","Jinhai Wang","Yinglong He","Hongming Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14601v1","updated":"2023-08-28T14:12:25Z","published":"2023-08-28T14:12:25Z","title":"Fairness Through Domain Awareness: Mitigating Popularity Bias For Music\n Discovery","summary":" As online music platforms grow, music recommender systems play a vital role\nin helping users navigate and discover content within their vast musical\ndatabases. At odds with this larger goal, is the presence of popularity bias,\nwhich causes algorithmic systems to favor mainstream content over, potentially\nmore relevant, but niche items. In this work we explore the intrinsic\nrelationship between music discovery and popularity bias. To mitigate this\nissue we propose a domain-aware, individual fairness-based approach which\naddresses popularity bias in graph neural network (GNNs) based recommender\nsystems. Our approach uses individual fairness to reflect a ground truth\nlistening experience, i.e., if two songs sound similar, this similarity should\nbe reflected in their representations. In doing so, we facilitate meaningful\nmusic discovery that is robust to popularity bias and grounded in the music\ndomain. We apply our BOOST methodology to two discovery based tasks, performing\nrecommendations at both the playlist level and user level. Then, we ground our\nevaluation in the cold start setting, showing that our approach outperforms\nexisting fairness benchmarks in both performance and recommendation of\nlesser-known content. Finally, our analysis explains why our proposed\nmethodology is a novel and promising approach to mitigating popularity bias and\nimproving the discovery of new and niche content in music recommender systems.\n","authors":["Rebecca Salganik","Fernando Diaz","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2308.14601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12561v2","updated":"2023-08-28T14:10:50Z","published":"2022-12-23T19:37:39Z","title":"An active learning method for solving competitive multi-agent\n decision-making and control problems","summary":" We propose a scheme based on active learning to reconstruct private\nstrategies executed by a population of interacting agents and predict an exact\noutcome of the underlying multi-agent interaction process, here identified as a\nstationary action profile. We envision a scenario where an external observer,\nendowed with a learning procedure, can make queries and observe the agents'\nreactions through private action-reaction mappings, whose collective fixed\npoint corresponds to a stationary profile. By iteratively collecting sensible\ndata and updating parametric estimates of the action-reaction mappings, we\nestablish sufficient conditions to assess the asymptotic properties of the\nproposed active learning methodology so that, if convergence happens, it can\nonly be towards a stationary action profile. This fact yields two main\nconsequences: i) learning locally-exact surrogates of the action-reaction\nmappings allows the external observer to succeed in its prediction task, and\nii) working with assumptions so general that a stationary profile is not even\nguaranteed to exist, the established sufficient conditions hence act also as\ncertificates for the existence of such a desirable profile. Extensive numerical\nsimulations involving typical competitive multi-agent control and\ndecision-making problems illustrate the practical effectiveness of the proposed\nlearning-based approach.\n","authors":["Filippo Fabiani","Alberto Bemporad"],"pdf_url":"https://arxiv.org/pdf/2212.12561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.13368v2","updated":"2023-08-28T14:09:47Z","published":"2022-12-27T05:51:54Z","title":"Deep Reinforcement Learning for Wind and Energy Storage Coordination in\n Wholesale Energy and Ancillary Service Markets","summary":" Wind energy has been increasingly adopted to mitigate climate change.\nHowever, the variability of wind energy causes wind curtailment, resulting in\nconsiderable economic losses for wind farm owners. Wind curtailment can be\nreduced using battery energy storage systems (BESS) as onsite backup sources.\nYet, this auxiliary role may significantly weaken the economic potential of\nBESS in energy trading. Ideal BESS scheduling should balance onsite wind\ncurtailment reduction and market bidding, but practical implementation is\nchallenging due to coordination complexity and the stochastic nature of energy\nprices and wind generation. We investigate the joint-market bidding strategy of\na co-located wind-battery system in the spot and Regulation Frequency Control\nAncillary Service markets. We propose a novel deep reinforcement learning-based\napproach that decouples the system's market participation into two related\nMarkov decision processes for each facility, enabling the BESS to absorb onsite\nwind curtailment while performing joint-market bidding to maximize overall\noperational revenues. Using realistic wind farm data, we validated the\ncoordinated bidding strategy, with outcomes surpassing the optimization-based\nbenchmark in terms of higher revenue by approximately 25\\% and more wind\ncurtailment reduction by 2.3 times. Our results show that joint-market bidding\ncan significantly improve the financial performance of wind-battery systems\ncompared to participating in each market separately. Simulations also show that\nusing curtailed wind generation as a power source for charging the BESS can\nlead to additional financial gains. The successful implementation of our\nalgorithm would encourage co-location of generation and storage assets to\nunlock wider system benefits.\n","authors":["Jinhao Li","Changlong Wang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2212.13368v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14597v1","updated":"2023-08-28T14:09:02Z","published":"2023-08-28T14:09:02Z","title":"Adversarial Attacks on Foundational Vision Models","summary":" Rapid progress is being made in developing large, pretrained, task-agnostic\nfoundational vision models such as CLIP, ALIGN, DINOv2, etc. In fact, we are\napproaching the point where these models do not have to be finetuned\ndownstream, and can simply be used in zero-shot or with a lightweight probing\nhead. Critically, given the complexity of working at this scale, there is a\nbottleneck where relatively few organizations in the world are executing the\ntraining then sharing the models on centralized platforms such as HuggingFace\nand torch.hub. The goal of this work is to identify several key adversarial\nvulnerabilities of these models in an effort to make future designs more\nrobust. Intuitively, our attacks manipulate deep feature representations to\nfool an out-of-distribution (OOD) detector which will be required when using\nthese open-world-aware models to solve closed-set downstream tasks. Our methods\nreliably make in-distribution (ID) images (w.r.t. a downstream task) be\npredicted as OOD and vice versa while existing in extremely\nlow-knowledge-assumption threat models. We show our attacks to be potent in\nwhitebox and blackbox settings, as well as when transferred across foundational\nmodel types (e.g., attack DINOv2 with CLIP)! This work is only just the\nbeginning of a long journey towards adversarially robust foundational vision\nmodels.\n","authors":["Nathan Inkawhich","Gwendolyn McDonald","Ryan Luley"],"pdf_url":"https://arxiv.org/pdf/2308.14597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14596v1","updated":"2023-08-28T14:08:42Z","published":"2023-08-28T14:08:42Z","title":"LatentDR: Improving Model Generalization Through Sample-Aware Latent\n Degradation and Restoration","summary":" Despite significant advances in deep learning, models often struggle to\ngeneralize well to new, unseen domains, especially when training data is\nlimited. To address this challenge, we propose a novel approach for\ndistribution-aware latent augmentation that leverages the relationships across\nsamples to guide the augmentation procedure. Our approach first degrades the\nsamples stochastically in the latent space, mapping them to augmented labels,\nand then restores the samples from their corrupted versions during training.\nThis process confuses the classifier in the degradation step and restores the\noverall class distribution of the original samples, promoting diverse\nintra-class/cross-domain variability. We extensively evaluate our approach on a\ndiverse set of datasets and tasks, including domain generalization benchmarks\nand medical imaging datasets with strong domain shift, where we show our\napproach achieves significant improvements over existing methods for latent\nspace augmentation. We further show that our method can be flexibly adapted to\nlong-tail recognition tasks, demonstrating its versatility in building more\ngeneralizable models. Code is available at\nhttps://github.com/nerdslab/LatentDR.\n","authors":["Ran Liu","Sahil Khose","Jingyun Xiao","Lakshmi Sathidevi","Keerthan Ramnath","Zsolt Kira","Eva L. Dyer"],"pdf_url":"https://arxiv.org/pdf/2308.14596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14595v1","updated":"2023-08-28T14:06:36Z","published":"2023-08-28T14:06:36Z","title":"Neural Network Training Strategy to Enhance Anomaly Detection\n Performance: A Perspective on Reconstruction Loss Amplification","summary":" Unsupervised anomaly detection (UAD) is a widely adopted approach in industry\ndue to rare anomaly occurrences and data imbalance. A desirable characteristic\nof an UAD model is contained generalization ability which excels in the\nreconstruction of seen normal patterns but struggles with unseen anomalies.\nRecent studies have pursued to contain the generalization capability of their\nUAD models in reconstruction from different perspectives, such as design of\nneural network (NN) structure and training strategy. In contrast, we note that\ncontaining of generalization ability in reconstruction can also be obtained\nsimply from steep-shaped loss landscape. Motivated by this, we propose a loss\nlandscape sharpening method by amplifying the reconstruction loss, dubbed Loss\nAMPlification (LAMP). LAMP deforms the loss landscape into a steep shape so the\nreconstruction error on unseen anomalies becomes greater. Accordingly, the\nanomaly detection performance is improved without any change of the NN\narchitecture. Our findings suggest that LAMP can be easily applied to any\nreconstruction error metrics in UAD settings where the reconstruction model is\ntrained with anomaly-free samples only.\n","authors":["YeongHyeon Park","Sungho Kang","Myung Jin Kim","Hyeonho Jeong","Hyunkyu Park","Hyeong Seok Kim","Juneho Yi"],"pdf_url":"https://arxiv.org/pdf/2308.14595v1.pdf","comment":"5 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.10842v2","updated":"2023-08-28T13:55:12Z","published":"2023-08-18T09:45:21Z","title":"Enhancing Agent Communication and Learning through Action and Language","summary":" We introduce a novel category of GC-agents capable of functioning as both\nteachers and learners. Leveraging action-based demonstrations and\nlanguage-based instructions, these agents enhance communication efficiency. We\ninvestigate the incorporation of pedagogy and pragmatism, essential elements in\nhuman communication and goal achievement, enhancing the agents' teaching and\nlearning capabilities. Furthermore, we explore the impact of combining\ncommunication modes (action and language) on learning outcomes, highlighting\nthe benefits of a multi-modal approach.\n","authors":["Hugo Caselles-Dupré","Olivier Sigaud","Mohamed Chetouani"],"pdf_url":"https://arxiv.org/pdf/2308.10842v2.pdf","comment":"IMOL workshop, Paris 2023"},{"id":"http://arxiv.org/abs/2308.14555v1","updated":"2023-08-28T13:17:39Z","published":"2023-08-28T13:17:39Z","title":"Kernel Limit of Recurrent Neural Networks Trained on Ergodic Data\n Sequences","summary":" Mathematical methods are developed to characterize the asymptotics of\nrecurrent neural networks (RNN) as the number of hidden units, data samples in\nthe sequence, hidden state updates, and training steps simultaneously grow to\ninfinity. In the case of an RNN with a simplified weight matrix, we prove the\nconvergence of the RNN to the solution of an infinite-dimensional ODE coupled\nwith the fixed point of a random algebraic equation. The analysis requires\naddressing several challenges which are unique to RNNs. In typical mean-field\napplications (e.g., feedforward neural networks), discrete updates are of\nmagnitude $\\mathcal{O}(\\frac{1}{N})$ and the number of updates is\n$\\mathcal{O}(N)$. Therefore, the system can be represented as an Euler\napproximation of an appropriate ODE/PDE, which it will converge to as $N\n\\rightarrow \\infty$. However, the RNN hidden layer updates are\n$\\mathcal{O}(1)$. Therefore, RNNs cannot be represented as a discretization of\nan ODE/PDE and standard mean-field techniques cannot be applied. Instead, we\ndevelop a fixed point analysis for the evolution of the RNN memory states, with\nconvergence estimates in terms of the number of update steps and the number of\nhidden units. The RNN hidden layer is studied as a function in a Sobolev space,\nwhose evolution is governed by the data sequence (a Markov chain), the\nparameter updates, and its dependence on the RNN hidden layer at the previous\ntime step. Due to the strong correlation between updates, a Poisson equation\nmust be used to bound the fluctuations of the RNN around its limit equation.\nThese mathematical methods give rise to the neural tangent kernel (NTK) limits\nfor RNNs trained on data sequences as the number of data samples and size of\nthe neural network grow to infinity.\n","authors":["Samuel Chun-Hei Lam","Justin Sirignano","Konstantinos Spiliopoulos"],"pdf_url":"https://arxiv.org/pdf/2308.14555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.05249v2","updated":"2023-08-28T13:00:38Z","published":"2022-05-11T03:36:04Z","title":"Secure & Private Federated Neuroimaging","summary":" The amount of biomedical data continues to grow rapidly. However, collecting\ndata from multiple sites for joint analysis remains challenging due to\nsecurity, privacy, and regulatory concerns. To overcome this challenge, we use\nFederated Learning, which enables distributed training of neural network models\nover multiple data sources without sharing data. Each site trains the neural\nnetwork over its private data for some time, then shares the neural network\nparameters (i.e., weights, gradients) with a Federation Controller, which in\nturn aggregates the local models, sends the resulting community model back to\neach site, and the process repeats. Our Federated Learning architecture,\nMetisFL, provides strong security and privacy. First, sample data never leaves\na site. Second, neural network parameters are encrypted before transmission and\nthe global neural model is computed under fully-homomorphic encryption.\nFinally, we use information-theoretic methods to limit information leakage from\nthe neural model to prevent a curious site from performing model inversion or\nmembership attacks. We present a thorough evaluation of the performance of\nsecure, private federated learning in neuroimaging tasks, including for\npredicting Alzheimer's disease and estimating BrainAGE from magnetic resonance\nimaging (MRI) studies, in challenging, heterogeneous federated environments\nwhere sites have different amounts of data and statistical distributions.\n","authors":["Dimitris Stripelis","Umang Gupta","Hamza Saleem","Nikhil Dhinagar","Tanmay Ghai","Rafael Chrysovalantis Anastasiou","Armaghan Asghar","Greg Ver Steeg","Srivatsan Ravi","Muhammad Naveed","Paul M. Thompson","Jose Luis Ambite"],"pdf_url":"https://arxiv.org/pdf/2205.05249v2.pdf","comment":"18 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.03854v3","updated":"2023-08-28T12:50:34Z","published":"2023-07-07T22:00:31Z","title":"inTformer: A Time-Embedded Attention-Based Transformer for Crash\n Likelihood Prediction at Intersections Using Connected Vehicle Data","summary":" The real-time crash likelihood prediction model is an essential component of\nthe proactive traffic safety management system. Over the years, numerous\nstudies have attempted to construct a crash likelihood prediction model in\norder to enhance traffic safety, but mostly on freeways. In the majority of the\nexisting studies, researchers have primarily employed a deep learning-based\nframework to identify crash potential. Lately, Transformer has emerged as a\npotential deep neural network that fundamentally operates through\nattention-based mechanisms. Transformer has several functional benefits over\nextant deep learning models such as LSTM, CNN, etc. Firstly, Transformer can\nreadily handle long-term dependencies in a data sequence. Secondly,\nTransformers can parallelly process all elements in a data sequence during\ntraining. Finally, a Transformer does not have the vanishing gradient issue.\nRealizing the immense possibility of Transformers, this paper proposes\ninTersection-Transformer (inTformer), a time-embedded attention-based\nTransformer model that can effectively predict intersection crash likelihood in\nreal-time. The proposed model was evaluated using connected vehicle data\nextracted from Signal Analytics Platform. Acknowledging the complex traffic\noperation mechanism at intersection, this study developed zone-specific models\nby dividing the intersection region into two distinct zones:\nwithin-intersection and approach zone. The best inTformer models in\n'within-intersection,' and 'approach' zone achieved a sensitivity of 73%, and\n70%, respectively. The zone-level models were also compared to earlier studies\non crash likelihood prediction at intersections and with several established\ndeep learning models trained on the same connected vehicle dataset.\n","authors":["B M Tazbiul Hassan Anik","Zubayer Islam","Mohamed Abdel-Aty","Ling Wang"],"pdf_url":"https://arxiv.org/pdf/2307.03854v3.pdf","comment":"29 pages, 10 figures, 8 tables"},{"id":"http://arxiv.org/abs/2308.14536v1","updated":"2023-08-28T12:47:41Z","published":"2023-08-28T12:47:41Z","title":"Spoken Language Intelligence of Large Language Models for Language\n Learning","summary":" People have long hoped for a conversational system that can assist in\nreal-life situations, and recent progress on large language models (LLMs) is\nbringing this idea closer to reality. While LLMs are often impressive in\nperformance, their efficacy in real-world scenarios that demand expert\nknowledge remains unclear. LLMs are believed to hold the most potential and\nvalue in education, especially in the development of Artificial intelligence\n(AI) based virtual teachers capable of facilitating language learning. Our\nfocus is centered on evaluating the efficacy of LLMs in the realm of education,\nspecifically in the areas of spoken language learning which encompass\nphonetics, phonology, and second language acquisition. We introduce a new\nmultiple-choice question dataset to evaluate the effectiveness of LLMs in the\naforementioned scenarios, including understanding and application of spoken\nlanguage knowledge. In addition, we investigate the influence of various\nprompting techniques such as zero- and few-shot method (prepending the question\nwith question-answer exemplars), chain-of-thought (CoT, think step-by-step),\nin-domain exampler and external tools (Google, Wikipedia). We conducted\nlarge-scale evaluation on popular LLMs (20 distinct models) using these\nmethods. We achieved significant performance improvements compared to the\nzero-shot baseline in the practical questions reasoning (GPT-3.5, 49.1% ->\n63.1%; LLaMA2-70B-Chat, 42.2% -> 48.6%). We found that models of different\nsizes have good understanding of concepts in phonetics, phonology, and second\nlanguage acquisition, but show limitations in reasoning for real-world\nproblems. Additionally, we also explore preliminary findings on conversational\ncommunication.\n","authors":["Linkai Peng","Baorian Nuchged","Yingming Gao"],"pdf_url":"https://arxiv.org/pdf/2308.14536v1.pdf","comment":"28 pages, 7 figures, Preprint"},{"id":"http://arxiv.org/abs/2308.14522v1","updated":"2023-08-28T12:17:51Z","published":"2023-08-28T12:17:51Z","title":"Large Graph Models: A Perspective","summary":" Large models have emerged as the most recent groundbreaking achievements in\nartificial intelligence, and particularly machine learning. However, when it\ncomes to graphs, large models have not achieved the same level of success as in\nother fields, such as natural language processing and computer vision. In order\nto promote applying large models for graphs forward, we present a perspective\npaper to discuss the challenges and opportunities associated with developing\nlarge graph models. First, we discuss the desired characteristics of large\ngraph models. Then, we present detailed discussions from three key\nperspectives: representation basis, graph data, and graph models. In each\ncategory, we provide a brief overview of recent advances and highlight the\nremaining challenges together with our visions. Finally, we discuss valuable\napplications of large graph models. We believe this perspective paper is able\nto encourage further investigations into large graph models, ultimately pushing\nus one step closer towards artificial general intelligence (AGI).\n","authors":["Ziwei Zhang","Haoyang Li","Zeyang Zhang","Yijian Qin","Xin Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14522v1.pdf","comment":"Preliminary version. Comments are welcome"},{"id":"http://arxiv.org/abs/2308.14521v1","updated":"2023-08-28T12:13:36Z","published":"2023-08-28T12:13:36Z","title":"Context-Aware Composition of Agent Policies by Markov Decision Process\n Entity Embeddings and Agent Ensembles","summary":" Computational agents support humans in many areas of life and are therefore\nfound in heterogeneous contexts. This means that agents operate in rapidly\nchanging environments and can be confronted with huge state and action spaces.\nIn order to perform services and carry out activities in a goal-oriented\nmanner, agents require prior knowledge and therefore have to develop and pursue\ncontext-dependent policies. The problem is that prescribing policies in advance\nis limited and inflexible, especially in dynamically changing environments.\nMoreover, the context of an agent determines its choice of actions. Since the\nenvironments in which agents operate can be stochastic and complex in terms of\nthe number of states and feasible actions, activities are usually modelled in a\nsimplified way by Markov decision processes so that agents with reinforcement\nlearning are able to learn policies that help to capture the context and act\naccordingly to optimally perform activities. However, training policies for all\npossible contexts using reinforcement learning is time-consuming. A requirement\nand challenge for agents is to learn strategies quickly and respond immediately\nin cross-context environments and applications. In this work, we propose a\nnovel simulation-based approach that enables a) the representation of\nheterogeneous contexts through knowledge graphs and entity embeddings and b)\nthe context-aware composition of policies on demand by ensembles of agents\nrunning in parallel. The evaluation we performed on the \"Virtual Home\" dataset\nindicates that agents that need to seamlessly switch between different\ncontexts, can request on-the-fly composed policies that lead to the successful\ncompletion of context-appropriate activities without having to learn these\npolicies in lengthy training steps and episodes, in contrast to agents that\napply reinforcement learning.\n","authors":["Nicole Merkle","Ralf Mikut"],"pdf_url":"https://arxiv.org/pdf/2308.14521v1.pdf","comment":"29 pages, 11 figures, 9 tables, 3 listings, Submitted to Semantic Web\n Journal, Under revision for re-submission to Semantic Web Journal"},{"id":"http://arxiv.org/abs/2201.08110v2","updated":"2023-08-28T12:04:46Z","published":"2022-01-20T10:57:20Z","title":"NNP/MM: Accelerating molecular dynamics simulations with machine\n learning potentials and molecular mechanic","summary":" Machine learning potentials have emerged as a means to enhance the accuracy\nof biomolecular simulations. However, their application is constrained by the\nsignificant computational cost arising from the vast number of parameters\ncompared to traditional molecular mechanics. To tackle this issue, we introduce\nan optimized implementation of the hybrid method (NNP/MM), which combines\nneural network potentials (NNP) and molecular mechanics (MM). This approach\nmodels a portion of the system, such as a small molecule, using NNP while\nemploying MM for the remaining system to boost efficiency. By conducting\nmolecular dynamics (MD) simulations on various protein-ligand complexes and\nmetadynamics (MTD) simulations on a ligand, we showcase the capabilities of our\nimplementation of NNP/MM. It has enabled us to increase the simulation speed by\n5 times and achieve a combined sampling of one microsecond for each complex,\nmarking the longest simulations ever reported for this class of simulation.\n","authors":["Raimondas Galvelis","Alejandro Varela-Rial","Stefan Doerr","Roberto Fino","Peter Eastman","Thomas E. Markland","John D. Chodera","Gianni De Fabritiis"],"pdf_url":"https://arxiv.org/pdf/2201.08110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14516v1","updated":"2023-08-28T12:03:03Z","published":"2023-08-28T12:03:03Z","title":"Prediction of Tourism Flow with Sparse Geolocation Data","summary":" Modern tourism in the 21st century is facing numerous challenges. Among these\nthe rapidly growing number of tourists visiting space-limited regions like\nhistorical cities, museums and bottlenecks such as bridges is one of the\nbiggest. In this context, a proper and accurate prediction of tourism volume\nand tourism flow within a certain area is important and critical for visitor\nmanagement tasks such as sustainable treatment of the environment and\nprevention of overcrowding. Static flow control methods like conventional\nlow-level controllers or limiting access to overcrowded venues could not solve\nthe problem yet. In this paper, we empirically evaluate the performance of\nstate-of-the-art deep-learning methods such as RNNs, GNNs, and Transformers as\nwell as the classic statistical ARIMA method. Granular limited data supplied by\na tourism region is extended by exogenous data such as geolocation trajectories\nof individual tourists, weather and holidays. In the field of visitor flow\nprediction with sparse data, we are thereby capable of increasing the accuracy\nof our predictions, incorporating modern input feature handling as well as\nmapping geolocation data on top of discrete POI data.\n","authors":["Julian Lemmel","Zahra Babaiee","Marvin Kleinlehner","Ivan Majic","Philipp Neubauer","Johannes Scholz","Radu Grosu","Sophie A. Neubauer"],"pdf_url":"https://arxiv.org/pdf/2308.14516v1.pdf","comment":"Accepted for publication at the proceedings of the 5th International\n Data Science Conference - iDSC2023. arXiv admin note: substantial text\n overlap with arXiv:2206.13274"},{"id":"http://arxiv.org/abs/2308.14507v1","updated":"2023-08-28T11:49:23Z","published":"2023-08-28T11:49:23Z","title":"Spectral Estimators for Structured Generalized Linear Models via\n Approximate Message Passing","summary":" We consider the problem of parameter estimation from observations given by a\ngeneralized linear model. Spectral methods are a simple yet effective approach\nfor estimation: they estimate the parameter via the principal eigenvector of a\nmatrix obtained by suitably preprocessing the observations. Despite their wide\nuse, a rigorous performance characterization of spectral estimators, as well as\na principled way to preprocess the data, is available only for unstructured\n(i.e., i.i.d. Gaussian and Haar) designs. In contrast, real-world design\nmatrices are highly structured and exhibit non-trivial correlations. To address\nthis problem, we consider correlated Gaussian designs which capture the\nanisotropic nature of the measurements via a feature covariance matrix\n$\\Sigma$. Our main result is a precise asymptotic characterization of the\nperformance of spectral estimators in this setting. This then allows to\nidentify the optimal preprocessing that minimizes the number of samples needed\nto meaningfully estimate the parameter. Remarkably, such an optimal spectral\nestimator depends on $\\Sigma$ only through its normalized trace, which can be\nconsistently estimated from the data. Numerical results demonstrate the\nadvantage of our principled approach over previous heuristic methods.\n Existing analyses of spectral estimators crucially rely on the rotational\ninvariance of the design matrix. This key assumption does not hold for\ncorrelated Gaussian designs. To circumvent this difficulty, we develop a novel\nstrategy based on designing and analyzing an approximate message passing\nalgorithm whose fixed point coincides with the desired spectral estimator. Our\nmethodology is general, and opens the way to the precise characterization of\nspiked matrices and of the corresponding spectral methods in a variety of\nsettings.\n","authors":["Yihan Zhang","Hong Chang Ji","Ramji Venkataramanan","Marco Mondelli"],"pdf_url":"https://arxiv.org/pdf/2308.14507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14486v1","updated":"2023-08-28T10:59:05Z","published":"2023-08-28T10:59:05Z","title":"Rebalancing Social Feed to Minimize Polarization and Disagreement","summary":" Social media have great potential for enabling public discourse on important\nsocietal issues. However, adverse effects, such as polarization and echo\nchambers, greatly impact the benefits of social media and call for algorithms\nthat mitigate these effects. In this paper, we propose a novel problem\nformulation aimed at slightly nudging users' social feeds in order to strike a\nbalance between relevance and diversity, thus mitigating the emergence of\npolarization, without lowering the quality of the feed. Our approach is based\non re-weighting the relative importance of the accounts that a user follows, so\nas to calibrate the frequency with which the content produced by various\naccounts is shown to the user. We analyze the convexity properties of the\nproblem, demonstrating the non-matrix convexity of the objective function and\nthe convexity of the feasible set. To efficiently address the problem, we\ndevelop a scalable algorithm based on projected gradient descent. We also prove\nthat our problem statement is a proper generalization of the undirected-case\nproblem so that our method can also be adopted for undirected social networks.\nAs a baseline for comparison in the undirected case, we develop a semidefinite\nprogramming approach, which provides the optimal solution. Through extensive\nexperiments on synthetic and real-world datasets, we validate the effectiveness\nof our approach, which outperforms non-trivial baselines, underscoring its\nability to foster healthier and more cohesive online communities.\n","authors":["Federico Cinus","Aristides Gionis","Francesco Bonchi"],"pdf_url":"https://arxiv.org/pdf/2308.14486v1.pdf","comment":"Accepted for publication at ACM CIKM 2023"},{"id":"http://arxiv.org/abs/2303.10058v2","updated":"2023-08-28T10:46:22Z","published":"2023-03-17T15:38:39Z","title":"No Fear of Classifier Biases: Neural Collapse Inspired Federated\n Learning with Synthetic and Fixed Classifier","summary":" Data heterogeneity is an inherent challenge that hinders the performance of\nfederated learning (FL). Recent studies have identified the biased classifiers\nof local models as the key bottleneck. Previous attempts have used classifier\ncalibration after FL training, but this approach falls short in improving the\npoor feature representations caused by training-time classifier biases.\nResolving the classifier bias dilemma in FL requires a full understanding of\nthe mechanisms behind the classifier. Recent advances in neural collapse have\nshown that the classifiers and feature prototypes under perfect training\nscenarios collapse into an optimal structure called simplex equiangular tight\nframe (ETF). Building on this neural collapse insight, we propose a solution to\nthe FL's classifier bias problem by utilizing a synthetic and fixed ETF\nclassifier during training. The optimal classifier structure enables all\nclients to learn unified and optimal feature representations even under\nextremely heterogeneous data. We devise several effective modules to better\nadapt the ETF structure in FL, achieving both high generalization and\npersonalization. Extensive experiments demonstrate that our method achieves\nstate-of-the-art performances on CIFAR-10, CIFAR-100, and Tiny-ImageNet.\n","authors":["Zexi Li","Xinyi Shang","Rui He","Tao Lin","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2303.10058v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14481v1","updated":"2023-08-28T10:43:53Z","published":"2023-08-28T10:43:53Z","title":"Group Regression for Query Based Object Detection and Tracking","summary":" Group regression is commonly used in 3D object detection to predict box\nparameters of similar classes in a joint head, aiming to benefit from\nsimilarities while separating highly dissimilar classes. For query-based\nperception methods, this has, so far, not been feasible. We close this gap and\npresent a method to incorporate multi-class group regression, especially\ndesigned for the 3D domain in the context of autonomous driving, into existing\nattention and query-based perception approaches. We enhance a transformer based\njoint object detection and tracking model with this approach, and thoroughly\nevaluate its behavior and performance. For group regression, the classes of the\nnuScenes dataset are divided into six groups of similar shape and prevalence,\neach being regressed by a dedicated head. We show that the proposed method is\napplicable to many existing transformer based perception approaches and can\nbring potential benefits. The behavior of query group regression is thoroughly\nanalyzed in comparison to a unified regression head, e.g. in terms of\nclass-switching behavior and distribution of the output parameters. The\nproposed method offers many possibilities for further research, such as in the\ndirection of deep multi-hypotheses tracking.\n","authors":["Felicia Ruppel","Florian Faion","Claudius Gläser","Klaus Dietmayer"],"pdf_url":"https://arxiv.org/pdf/2308.14481v1.pdf","comment":"Accepted for publication at the 2023 26th IEEE International\n Conference on Intelligent Transportation Systems (ITSC 2023), Sep 24-28,\n 2023, in Bilbao, Spain"},{"id":"http://arxiv.org/abs/2308.14478v1","updated":"2023-08-28T10:35:04Z","published":"2023-08-28T10:35:04Z","title":"Some issues in robust clustering","summary":" Some key issues in robust clustering are discussed with focus on Gaussian\nmixture model based clustering, namely the formal definition of outliers,\nambiguity between groups of outliers and clusters, the interaction between\nrobust clustering and the estimation of the number of clusters, the essential\ndependence of (not only) robust clustering on tuning decisions, and\nshortcomings of existing measurements of cluster stability when it comes to\noutliers.\n","authors":["Christian Hennig"],"pdf_url":"https://arxiv.org/pdf/2308.14478v1.pdf","comment":"11 pages, no figures"},{"id":"http://arxiv.org/abs/2212.07524v3","updated":"2023-08-28T10:06:41Z","published":"2022-12-14T22:12:32Z","title":"Invariant Lipschitz Bandits: A Side Observation Approach","summary":" Symmetry arises in many optimization and decision-making problems, and has\nattracted considerable attention from the optimization community: By utilizing\nthe existence of such symmetries, the process of searching for optimal\nsolutions can be improved significantly. Despite its success in (offline)\noptimization, the utilization of symmetries has not been well examined within\nthe online optimization settings, especially in the bandit literature. As such,\nin this paper we study the invariant Lipschitz bandit setting, a subclass of\nthe Lipschitz bandits where the reward function and the set of arms are\npreserved under a group of transformations. We introduce an algorithm named\n\\texttt{UniformMesh-N}, which naturally integrates side observations using\ngroup orbits into the \\texttt{UniformMesh} algorithm\n(\\cite{Kleinberg2005_UniformMesh}), which uniformly discretizes the set of\narms. Using the side-observation approach, we prove an improved regret upper\nbound, which depends on the cardinality of the group, given that the group is\nfinite. We also prove a matching regret's lower bound for the invariant\nLipschitz bandit class (up to logarithmic factors). We hope that our work will\nignite further investigation of symmetry in bandit theory and sequential\ndecision-making theory in general.\n","authors":["Nam Phuong Tran","Long Tran-Thanh"],"pdf_url":"https://arxiv.org/pdf/2212.07524v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14456v1","updated":"2023-08-28T09:49:48Z","published":"2023-08-28T09:49:48Z","title":"Speech Self-Supervised Representations Benchmarking: a Case for Larger\n Probing Heads","summary":" Self-supervised learning (SSL) leverages large datasets of unlabeled speech\nto reach impressive performance with reduced amounts of annotated data. The\nhigh number of proposed approaches fostered the emergence of comprehensive\nbenchmarks that evaluate their performance on a set of downstream tasks\nexploring various aspects of the speech signal. However, while the number of\nconsidered tasks has been growing, most proposals rely upon a single downstream\narchitecture that maps the frozen SSL representations to the task labels. This\nstudy examines how benchmarking results are affected by changes in the probing\nhead architecture. Interestingly, we found that altering the downstream\narchitecture structure leads to significant fluctuations in the performance\nranking of the evaluated models. Against common practices in speech SSL\nbenchmarking, we evaluate larger-capacity probing heads, showing their impact\non performance, inference costs, generalization and multi-level feature\nexploitation.\n","authors":["Salah Zaiem","Youcef Kemiche","Titouan Parcollet","Slim Essid","Mirco Ravanelli"],"pdf_url":"https://arxiv.org/pdf/2308.14456v1.pdf","comment":"11 Pages"},{"id":"http://arxiv.org/abs/2304.14824v2","updated":"2023-08-28T09:14:34Z","published":"2023-04-28T13:06:14Z","title":"A noise-robust acoustic method for recognizing foraging activities of\n grazing cattle","summary":" Farmers must continuously improve their livestock production systems to\nremain competitive in the growing dairy market. Precision livestock farming\ntechnologies provide individualized monitoring of animals on commercial farms,\noptimizing livestock production. Continuous acoustic monitoring is a widely\naccepted sensing technique used to estimate the daily rumination and grazing\ntime budget of free-ranging cattle. However, typical environmental and natural\nnoises on pastures noticeably affect the performance limiting the practical\napplication of current acoustic methods. In this study, we present the\noperating principle and generalization capability of an acoustic method called\nNoise-Robust Foraging Activity Recognizer (NRFAR). The proposed method\ndetermines foraging activity bouts by analyzing fixed-length segments of\nidentified jaw movement events produced during grazing and rumination. The\nadditive noise robustness of the NRFAR was evaluated for several\nsignal-to-noise ratios using stationary Gaussian white noise and four different\nnonstationary natural noise sources. In noiseless conditions, NRFAR reached an\naverage balanced accuracy of 86.4%, outperforming two previous acoustic methods\nby more than 7.5%. Furthermore, NRFAR performed better than previous acoustic\nmethods in 77 of 80 evaluated noisy scenarios (53 cases with p<0.05). NRFAR has\nbeen shown to be effective in harsh free-ranging environments and could be used\nas a reliable solution to improve pasture management and monitor the health and\nwelfare of dairy cows. The instrumentation and computational algorithms\npresented in this publication are protected by a pending patent application: AR\nP20220100910. Web demo available at: https://sinc.unl.edu.ar/web-demo/nrfar\n","authors":["Luciano S. Martinez-Rau","José O. Chelotti","Mariano Ferrero","Julio R. Galli","Santiago A. Utsumi","Alejandra M. Planisich","H. Leonardo Rufiner","Leonardo L. Giovanini"],"pdf_url":"https://arxiv.org/pdf/2304.14824v2.pdf","comment":"list of used audio-clips is available in the list_audio_clips.xlsx"},{"id":"http://arxiv.org/abs/2308.14430v1","updated":"2023-08-28T09:06:32Z","published":"2023-08-28T09:06:32Z","title":"TextrolSpeech: A Text Style Control Speech Corpus With Codec Language\n Text-to-Speech Models","summary":" Recently, there has been a growing interest in the field of controllable\nText-to-Speech (TTS). While previous studies have relied on users providing\nspecific style factor values based on acoustic knowledge or selecting reference\nspeeches that meet certain requirements, generating speech solely from natural\ntext prompts has emerged as a new challenge for researchers. This challenge\narises due to the scarcity of high-quality speech datasets with natural text\nstyle prompt and the absence of advanced text-controllable TTS models. In light\nof this, 1) we propose TextrolSpeech, which is the first large-scale speech\nemotion dataset annotated with rich text attributes. The dataset comprises\n236,220 pairs of style prompt in natural text descriptions with five style\nfactors and corresponding speech samples. Through iterative experimentation, we\nintroduce a multi-stage prompt programming approach that effectively utilizes\nthe GPT model for generating natural style descriptions in large volumes. 2)\nFurthermore, to address the need for generating audio with greater style\ndiversity, we propose an efficient architecture called Salle. This architecture\ntreats text controllable TTS as a language model task, utilizing audio codec\ncodes as an intermediate representation to replace the conventional\nmel-spectrogram. Finally, we successfully demonstrate the ability of the\nproposed model by showing a comparable performance in the controllable TTS\ntask. Audio samples are available at https://sall-e.github.io/\n","authors":["Shengpeng Ji","Jialong Zuo","Minghui Fang","Ziyue Jiang","Feiyang Chen","Xinyu Duan","Baoxing Huai","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.14430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14424v1","updated":"2023-08-28T09:04:52Z","published":"2023-08-28T09:04:52Z","title":"Shielded Reinforcement Learning for Hybrid Systems","summary":" Safe and optimal controller synthesis for switched-controlled hybrid systems,\nwhich combine differential equations and discrete changes of the system's\nstate, is known to be intricately hard. Reinforcement learning has been\nleveraged to construct near-optimal controllers, but their behavior is not\nguaranteed to be safe, even when it is encouraged by reward engineering. One\nway of imposing safety to a learned controller is to use a shield, which is\ncorrect by design. However, obtaining a shield for non-linear and hybrid\nenvironments is itself intractable. In this paper, we propose the construction\nof a shield using the so-called barbaric method, where an approximate finite\nrepresentation of an underlying partition-based two-player safety game is\nextracted via systematically picked samples of the true transition function.\nWhile hard safety guarantees are out of reach, we experimentally demonstrate\nstrong statistical safety guarantees with a prototype implementation and UPPAAL\nSTRATEGO. Furthermore, we study the impact of the synthesized shield when\napplied as either a pre-shield (applied before learning a controller) or a\npost-shield (only applied after learning a controller). We experimentally\ndemonstrate superiority of the pre-shielding approach. We apply our technique\non a range of case studies, including two industrial examples, and further\nstudy post-optimization of the post-shielding approach.\n","authors":["Asger Horn Brorholt","Peter Gjøl Jensen","Kim Guldstrand Larsen","Florian Lorber","Christian Schilling"],"pdf_url":"https://arxiv.org/pdf/2308.14424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11796v2","updated":"2023-08-28T09:00:50Z","published":"2022-10-21T08:19:45Z","title":"Differentiable Constrained Imitation Learning for Robot Motion Planning\n and Control","summary":" Motion planning and control are crucial components of robotics applications\nlike automated driving. Here, spatio-temporal hard constraints like system\ndynamics and safety boundaries (e.g., obstacles) restrict the robot's motions.\nDirect methods from optimal control solve a constrained optimization problem.\nHowever, in many applications finding a proper cost function is inherently\ndifficult because of the weighting of partially conflicting objectives. On the\nother hand, Imitation Learning (IL) methods such as Behavior Cloning (BC)\nprovide an intuitive framework for learning decision-making from offline\ndemonstrations and constitute a promising avenue for planning and control in\ncomplex robot applications. Prior work primarily relied on soft constraint\napproaches, which use additional auxiliary loss terms describing the\nconstraints. However, catastrophic safety-critical failures might occur in\nout-of-distribution (OOD) scenarios. This work integrates the flexibility of IL\nwith hard constraint handling in optimal control. Our approach constitutes a\ngeneral framework for constraint robotic motion planning and control, as well\nas traffic agent simulation, whereas we focus on mobile robot and automated\ndriving applications. Hard constraints are integrated into the learning problem\nin a differentiable manner, via explicit completion and gradient-based\ncorrection. Simulated experiments of mobile robot navigation and automated\ndriving provide evidence for the performance of the proposed method.\n","authors":["Christopher Diehl","Janis Adamek","Martin Krüger","Frank Hoffmann","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2210.11796v2.pdf","comment":"International Conference on Intelligent Robots and Systems Agents4AD\n Workshop, IROS 2023"},{"id":"http://arxiv.org/abs/2210.13533v2","updated":"2023-08-28T08:58:18Z","published":"2022-10-24T18:34:24Z","title":"Sufficient Invariant Learning for Distribution Shift","summary":" Machine learning algorithms have shown remarkable performance in diverse\napplications. However, it is still challenging to guarantee performance in\ndistribution shifts when distributions of training and test datasets are\ndifferent. There have been several approaches to improve the performance in\ndistribution shift cases by learning invariant features across groups or\ndomains. However, we observe that the previous works only learn invariant\nfeatures partially. While the prior works focus on the limited invariant\nfeatures, we first raise the importance of the sufficient invariant features.\nSince only training sets are given empirically, the learned partial invariant\nfeatures from training sets might not be present in the test sets under\ndistribution shift. Therefore, the performance improvement on distribution\nshifts might be limited. In this paper, we argue that learning sufficient\ninvariant features from the training set is crucial for the distribution shift\ncase. Concretely, we newly observe the connection between a) sufficient\ninvariant features and b) flatness differences between groups or domains.\nMoreover, we propose a new algorithm, Adaptive Sharpness-aware Group\nDistributionally Robust Optimization (ASGDRO), to learn sufficient invariant\nfeatures across domains or groups. ASGDRO learns sufficient invariant features\nby seeking common flat minima across all groups or domains. Therefore, ASGDRO\nimproves the performance on diverse distribution shift cases. Besides, we\nprovide a new simple dataset, Heterogeneous-CMNIST, to diagnose whether the\nvarious algorithms learn sufficient invariant features.\n","authors":["Taero Kim","Sungjun Lim","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2210.13533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12016v2","updated":"2023-08-28T08:51:56Z","published":"2023-08-23T09:18:41Z","title":"MKL-$L_{0/1}$-SVM","summary":" This paper presents a Multiple Kernel Learning (abbreviated as MKL) framework\nfor the Support Vector Machine (SVM) with the $(0, 1)$ loss function. Some\nKKT-like first-order optimality conditions are provided and then exploited to\ndevelop a fast ADMM algorithm to solve the nonsmooth nonconvex optimization\nproblem. Numerical experiments on synthetic and real datasets show that the\nperformance of our MKL-$L_{0/1}$-SVM is comparable with the one of the leading\napproaches called SimpleMKL developed by Rakotomamonjy, Bach, Canu, and\nGrandvalet [Journal of Machine Learning Research, vol.~9, pp.~2491--2521,\n2008].\n","authors":["Bin Zhu","Yijie Shi"],"pdf_url":"https://arxiv.org/pdf/2308.12016v2.pdf","comment":"26 pages in the JMLR template, 4 figures, and 2 tables. arXiv admin\n note: substantial text overlap with arXiv:2303.04445"},{"id":"http://arxiv.org/abs/2308.14412v1","updated":"2023-08-28T08:50:12Z","published":"2023-08-28T08:50:12Z","title":"Task-Aware Machine Unlearning and Its Application in Load Forecasting","summary":" Data privacy and security have become a non-negligible factor in load\nforecasting. Previous researches mainly focus on training stage enhancement.\nHowever, once the model is trained and deployed, it may need to `forget' (i.e.,\nremove the impact of) part of training data if the data is found to be\nmalicious or as requested by the data owner. This paper introduces machine\nunlearning algorithm which is specifically designed to remove the influence of\npart of the original dataset on an already trained forecaster. However, direct\nunlearning inevitably degrades the model generalization ability. To balance\nbetween unlearning completeness and performance degradation, a\nperformance-aware algorithm is proposed by evaluating the sensitivity of local\nmodel parameter change using influence function and sample re-weighting.\nMoreover, we observe that the statistic criterion cannot fully reflect the\noperation cost of down-stream tasks. Therefore, a task-aware machine unlearning\nis proposed whose objective is a tri-level optimization with dispatch and\nredispatch problems considered. We theoretically prove the existence of the\ngradient of such objective, which is key to re-weighting the remaining samples.\nWe test the unlearning algorithms on linear and neural network load forecasters\nwith realistic load dataset. The simulation demonstrates the balance on\nunlearning completeness and operational cost. All codes can be found at\nhttps://github.com/xuwkk/task_aware_machine_unlearning.\n","authors":["Wangkun Xu","Fei Teng"],"pdf_url":"https://arxiv.org/pdf/2308.14412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14409v1","updated":"2023-08-28T08:47:06Z","published":"2023-08-28T08:47:06Z","title":"Steerable Conditional Diffusion for Out-of-Distribution Adaptation in\n Imaging Inverse Problems","summary":" Denoising diffusion models have emerged as the go-to framework for solving\ninverse problems in imaging. A critical concern regarding these models is their\nperformance on out-of-distribution (OOD) tasks, which remains an under-explored\nchallenge. Realistic reconstructions inconsistent with the measured data can be\ngenerated, hallucinating image features that are uniquely present in the\ntraining dataset. To simultaneously enforce data-consistency and leverage\ndata-driven priors, we introduce a novel sampling framework called Steerable\nConditional Diffusion. This framework adapts the denoising network specifically\nto the available measured data. Utilising our proposed method, we achieve\nsubstantial enhancements in OOD performance across diverse imaging modalities,\nadvancing the robust deployment of denoising diffusion models in real-world\napplications.\n","authors":["Riccardo Barbano","Alexander Denker","Hyungjin Chung","Tae Hoon Roh","Simon Arrdige","Peter Maass","Bangti Jin","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2308.14409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14407v1","updated":"2023-08-28T08:42:06Z","published":"2023-08-28T08:42:06Z","title":"Identifying topology of leaky photonic lattices with machine learning","summary":" We show how machine learning techniques can be applied for the classification\nof topological phases in leaky photonic lattices using limited measurement\ndata. We propose an approach based solely on bulk intensity measurements, thus\nexempt from the need for complicated phase retrieval procedures. In particular,\nwe design a fully connected neural network that accurately determines\ntopological properties from the output intensity distribution in dimerized\nwaveguide arrays with leaky channels, after propagation of a spatially\nlocalized initial excitation at a finite distance, in a setting that closely\nemulates realistic experimental conditions.\n","authors":["Ekaterina O. Smolina","Lev A. Smirnov","Daniel Leykam","Franco Nori","Daria A. Smirnova"],"pdf_url":"https://arxiv.org/pdf/2308.14407v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14400v1","updated":"2023-08-28T08:33:45Z","published":"2023-08-28T08:33:45Z","title":"Semi-Supervised Semantic Depth Estimation using Symbiotic Transformer\n and NearFarMix Augmentation","summary":" In computer vision, depth estimation is crucial for domains like robotics,\nautonomous vehicles, augmented reality, and virtual reality. Integrating\nsemantics with depth enhances scene understanding through reciprocal\ninformation sharing. However, the scarcity of semantic information in datasets\nposes challenges. Existing convolutional approaches with limited local\nreceptive fields hinder the full utilization of the symbiotic potential between\ndepth and semantics. This paper introduces a dataset-invariant semi-supervised\nstrategy to address the scarcity of semantic information. It proposes the Depth\nSemantics Symbiosis module, leveraging the Symbiotic Transformer for achieving\ncomprehensive mutual awareness by information exchange within both local and\nglobal contexts. Additionally, a novel augmentation, NearFarMix is introduced\nto combat overfitting and compensate both depth-semantic tasks by strategically\nmerging regions from two images, generating diverse and structurally consistent\nsamples with enhanced control. Extensive experiments on NYU-Depth-V2 and KITTI\ndatasets demonstrate the superiority of our proposed techniques in indoor and\noutdoor environments.\n","authors":["Md Awsafur Rahman","Shaikh Anowarul Fattah"],"pdf_url":"https://arxiv.org/pdf/2308.14400v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2202.06599v3","updated":"2023-08-28T08:27:30Z","published":"2022-02-14T10:40:51Z","title":"Multi-Atlas Segmentation and Spatial Alignment of the Human Embryo in\n First Trimester 3D Ultrasound","summary":" Segmentation and spatial alignment of ultrasound (US) imaging data acquired\nin the in first trimester are crucial for monitoring human embryonic growth and\ndevelopment throughout this crucial period of life. Current approaches are\neither manual or semi-automatic and are therefore very time-consuming and prone\nto errors. To automate these tasks, we propose a multi-atlas framework for\nautomatic segmentation and spatial alignment of the embryo using deep learning\nwith minimal supervision. Our framework learns to register the embryo to an\natlas, which consists of the US images acquired at a range of gestational age\n(GA), segmented and spatially aligned to a predefined standard orientation.\nFrom this, we can derive the segmentation of the embryo and put the embryo in\nstandard orientation. US images acquired at 8+0 till 12+6 weeks GA were used\nand eight subjects were selected as atlas. We evaluated different fusion\nstrategies to incorporate multiple atlases: 1) training the framework using\natlas images from a single subject, 2) training the framework with data of all\navailable atlases and 3) ensembling of the frameworks trained per subject. To\nevaluate the performance, we calculated the Dice score over the test set. We\nfound that training the framework using all available atlases outperformed\nensembling and gave similar results compared to the best of all frameworks\ntrained on a single subject. Furthermore, we found that selecting images from\nthe four atlases closest in GA out of all available atlases, regardless of the\nindividual quality, gave the best results with a median Dice score of 0.72. We\nconclude that our framework can accurately segment and spatially align the\nembryo in first trimester 3D US images and is robust for the variation in\nquality that existed in the available atlases.\n","authors":["W. A. P. Bastiaansen","M. Rousian","R. P. M. Steegers-Theunissen","W. J. Niessen","A. H. J. Koning","S. Klein"],"pdf_url":"https://arxiv.org/pdf/2202.06599v3.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://www.melba-journal.org/papers/2022:020.html"},{"id":"http://arxiv.org/abs/2308.13269v2","updated":"2023-08-28T08:09:52Z","published":"2023-08-25T09:42:54Z","title":"Heterogeneous Decentralized Machine Unlearning with Seed Model\n Distillation","summary":" As some recent information security legislation endowed users with\nunconditional rights to be forgotten by any trained machine learning model,\npersonalized IoT service providers have to put unlearning functionality into\ntheir consideration. The most straightforward method to unlearn users'\ncontribution is to retrain the model from the initial state, which is not\nrealistic in high throughput applications with frequent unlearning requests.\nThough some machine unlearning frameworks have been proposed to speed up the\nretraining process, they fail to match decentralized learning scenarios. In\nthis paper, we design a decentralized unlearning framework called HDUS, which\nuses distilled seed models to construct erasable ensembles for all clients.\nMoreover, the framework is compatible with heterogeneous on-device models,\nrepresenting stronger scalability in real-world applications. Extensive\nexperiments on three real-world datasets show that our HDUS achieves\nstate-of-the-art performance.\n","authors":["Guanhua Ye","Tong Chen","Quoc Viet Hung Nguyen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2308.13269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14388v1","updated":"2023-08-28T08:07:57Z","published":"2023-08-28T08:07:57Z","title":"Biclustering Methods via Sparse Penalty","summary":" In this paper, we first reviewed several biclustering methods that are used\nto identify the most significant clusters in gene expression data. Here we\nmainly focused on the SSVD(sparse SVD) method and tried a new sparse penalty\nnamed \"Prenet penalty\" which has been used only in factor analysis to gain\nsparsity. Then in the simulation study, we tried different types of generated\ndatasets (with different sparsity and dimension) and tried 1-layer\napproximation then for k-layers which shows the mixed Prenet penalty is very\neffective for non-overlapped data. Finally, we used some real gene expression\ndata to show the behavior of our methods.\n","authors":["Jiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04391v5","updated":"2023-08-28T08:02:47Z","published":"2023-02-09T01:09:57Z","title":"The Re-Label Method For Data-Centric Machine Learning","summary":" In industry deep learning application, our manually labeled data has a\ncertain number of noisy data. To solve this problem and achieve more than 90\nscore in dev dataset, we present a simple method to find the noisy data and\nre-label the noisy data by human, given the model predictions as references in\nhuman labeling. In this paper, we illustrate our idea for a broad set of deep\nlearning tasks, includes classification, sequence tagging, object detection,\nsequence generation, click-through rate prediction. The experimental results\nand human evaluation results verify our idea.\n","authors":["Tong Guo"],"pdf_url":"https://arxiv.org/pdf/2302.04391v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14380v1","updated":"2023-08-28T07:55:01Z","published":"2023-08-28T07:55:01Z","title":"Self-Supervision for Tackling Unsupervised Anomaly Detection: Pitfalls\n and Opportunities","summary":" Self-supervised learning (SSL) is a growing torrent that has recently\ntransformed machine learning and its many real world applications, by learning\non massive amounts of unlabeled data via self-generated supervisory signals.\nUnsupervised anomaly detection (AD) has also capitalized on SSL, by\nself-generating pseudo-anomalies through various data augmentation functions or\nexternal data exposure. In this vision paper, we first underline the importance\nof the choice of SSL strategies on AD performance, by presenting evidences and\nstudies from the AD literature. Equipped with the understanding that SSL incurs\nvarious hyperparameters (HPs) to carefully tune, we present recent developments\non unsupervised model selection and augmentation tuning for SSL-based AD. We\nthen highlight emerging challenges and future opportunities; on designing new\npretext tasks and augmentation functions for different data modalities,\ncreating novel model selection solutions for systematically tuning the SSL HPs,\nas well as on capitalizing on the potential of pretrained foundation models on\nAD through effective density estimation.\n","authors":["Leman Akoglu","Jaemin Yoo"],"pdf_url":"https://arxiv.org/pdf/2308.14380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14377v1","updated":"2023-08-28T07:49:30Z","published":"2023-08-28T07:49:30Z","title":"Meta Attentive Graph Convolutional Recurrent Network for Traffic\n Forecasting","summary":" Traffic forecasting is a fundamental problem in intelligent transportation\nsystems. Existing traffic predictors are limited by their expressive power to\nmodel the complex spatial-temporal dependencies in traffic data, mainly due to\nthe following limitations. Firstly, most approaches are primarily designed to\nmodel the local shared patterns, which makes them insufficient to capture the\nspecific patterns associated with each node globally. Hence, they fail to learn\neach node's unique properties and diversified patterns. Secondly, most existing\napproaches struggle to accurately model both short- and long-term dependencies\nsimultaneously. In this paper, we propose a novel traffic predictor, named Meta\nAttentive Graph Convolutional Recurrent Network (MAGCRN). MAGCRN utilizes a\nGraph Convolutional Recurrent Network (GCRN) as a core module to model local\ndependencies and improves its operation with two novel modules: 1) a\nNode-Specific Meta Pattern Learning (NMPL) module to capture node-specific\npatterns globally and 2) a Node Attention Weight Generation Module (NAWG)\nmodule to capture short- and long-term dependencies by connecting the\nnode-specific features with the ones learned initially at each time step during\nGCRN operation. Experiments on six real-world traffic datasets demonstrate that\nNMPL and NAWG together enable MAGCRN to outperform state-of-the-art baselines\non both short- and long-term predictions.\n","authors":["Adnan Zeb","Yongchao Ye","Shiyao Zhang","James J. Q. Yu"],"pdf_url":"https://arxiv.org/pdf/2308.14377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14376v1","updated":"2023-08-28T07:49:01Z","published":"2023-08-28T07:49:01Z","title":"Are Existing Out-Of-Distribution Techniques Suitable for Network\n Intrusion Detection?","summary":" Machine learning (ML) has become increasingly popular in network intrusion\ndetection. However, ML-based solutions always respond regardless of whether the\ninput data reflects known patterns, a common issue across safety-critical\napplications. While several proposals exist for detecting Out-Of-Distribution\n(OOD) in other fields, it remains unclear whether these approaches can\neffectively identify new forms of intrusions for network security. New attacks,\nnot necessarily affecting overall distributions, are not guaranteed to be\nclearly OOD as instead, images depicting new classes are in computer vision. In\nthis work, we investigate whether existing OOD detectors from other fields\nallow the identification of unknown malicious traffic. We also explore whether\nmore discriminative and semantically richer embedding spaces within models,\nsuch as those created with contrastive learning and multi-class tasks, benefit\ndetection. Our investigation covers a set of six OOD techniques that employ\ndifferent detection strategies. These techniques are applied to models trained\nin various ways and subsequently exposed to unknown malicious traffic from the\nsame and different datasets (network environments). Our findings suggest that\nexisting detectors can identify a consistent portion of new malicious traffic,\nand that improved embedding spaces enhance detection. We also demonstrate that\nsimple combinations of certain detectors can identify almost 100% of malicious\ntraffic in our tested scenarios.\n","authors":["Andrea Corsini","Shanchieh Jay Yang"],"pdf_url":"https://arxiv.org/pdf/2308.14376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14374v1","updated":"2023-08-28T07:42:26Z","published":"2023-08-28T07:42:26Z","title":"Online Continual Learning on Hierarchical Label Expansion","summary":" Continual learning (CL) enables models to adapt to new tasks and environments\nwithout forgetting previously learned knowledge. While current CL setups have\nignored the relationship between labels in the past task and the new task with\nor without small task overlaps, real-world scenarios often involve hierarchical\nrelationships between old and new tasks, posing another challenge for\ntraditional CL approaches. To address this challenge, we propose a novel\nmulti-level hierarchical class incremental task configuration with an online\nlearning constraint, called hierarchical label expansion (HLE). Our\nconfiguration allows a network to first learn coarse-grained classes, with data\nlabels continually expanding to more fine-grained classes in various hierarchy\ndepths. To tackle this new setup, we propose a rehearsal-based method that\nutilizes hierarchy-aware pseudo-labeling to incorporate hierarchical class\ninformation. Additionally, we propose a simple yet effective memory management\nand sampling strategy that selectively adopts samples of newly encountered\nclasses. Our experiments demonstrate that our proposed method can effectively\nuse hierarchy on our HLE setup to improve classification accuracy across all\nlevels of hierarchies, regardless of depth and class imbalance ratio,\noutperforming prior state-of-the-art works by significant margins while also\noutperforming them on the conventional disjoint, blurry and i-Blurry CL setups.\n","authors":["Byung Hyun Lee","Okchul Jung","Jonghyun Choi","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2308.14374v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2102.00877v2","updated":"2023-08-28T07:41:55Z","published":"2021-02-01T14:36:34Z","title":"A probabilistic Taylor expansion with Gaussian processes","summary":" We study a class of Gaussian processes for which the posterior mean, for a\nparticular choice of data, replicates a truncated Taylor expansion of any\norder. The data consist of derivative evaluations at the expansion point and\nthe prior covariance kernel belongs to the class of Taylor kernels, which can\nbe written in a certain power series form. We discuss and prove some results on\nmaximum likelihood estimation of parameters of Taylor kernels. The proposed\nframework is a special case of Gaussian process regression based on data that\nis orthogonal in the reproducing kernel Hilbert space of the covariance kernel.\n","authors":["Toni Karvonen","Jon Cockayne","Filip Tronarp","Simo Särkkä"],"pdf_url":"https://arxiv.org/pdf/2102.00877v2.pdf","comment":"To appear in Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2210.00637v4","updated":"2023-08-28T07:41:52Z","published":"2022-10-02T21:36:27Z","title":"Benign Autoencoders","summary":" Recent progress in Generative Artificial Intelligence (AI) relies on\nefficient data representations, often featuring encoder-decoder architectures.\nWe formalize the mathematical problem of finding the optimal encoder-decoder\npair and characterize its solution, which we name the \"benign autoencoder\"\n(BAE). We prove that BAE projects data onto a manifold whose dimension is the\noptimal compressibility dimension of the generative problem. We highlight\nsurprising connections between BAE and several recent developments in AI, such\nas conditional GANs, context encoders, stable diffusion, stacked autoencoders,\nand the learning capabilities of generative models. As an illustration, we show\nhow BAE can find optimal, low-dimensional latent representations that improve\nthe performance of a discriminator under a distribution shift. By compressing\n\"malignant\" data dimensions, BAE leads to smoother and more stable gradients.\n","authors":["Semyon Malamud","Teng Andrea Xu","Antoine Didisheim"],"pdf_url":"https://arxiv.org/pdf/2210.00637v4.pdf","comment":"This paper replaces and subsumes arXiv:2110.08884"},{"id":"http://arxiv.org/abs/2308.09729v3","updated":"2023-08-28T07:37:36Z","published":"2023-08-17T16:59:50Z","title":"MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large\n Language Models","summary":" LLMs usually exhibit limitations in their ability to incorporate new\nknowledge, the generation of hallucinations, and the transparency of their\ndecision-making process. In this paper, we explore how to prompt LLMs with\nknowledge graphs (KG), working as a remedy to engage LLMs with up-to-date\nknowledge and elicit the reasoning pathways from LLMs. Specifically, we build a\nprompting pipeline that endows LLMs with the capability of comprehending KG\ninputs and inferring with a combined implicit knowledge and the retrieved\nexternal knowledge. In addition, we investigate eliciting the mind map on which\nLLMs perform the reasoning and generate the answers. It is identified that the\nproduced mind map exhibits the reasoning pathways of LLMs grounded on the\nontology of knowledge, hence bringing the prospects of probing and gauging LLM\ninference in production. The experiments on three question & answering datasets\nalso show that MindMap prompting leads to a striking empirical gain. For\ninstance, prompting a GPT-3.5 with MindMap yields an overwhelming performance\nover GPT-4 consistently. We also demonstrate that with structured facts\nretrieved from KG, MindMap can outperform a series of\nprompting-with-document-retrieval methods, benefiting from more accurate,\nconcise, and comprehensive knowledge from KGs.\n","authors":["Yilin Wen","Zifeng Wang","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09729v3.pdf","comment":"7 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2209.14013v3","updated":"2023-08-28T07:32:23Z","published":"2022-09-28T11:41:38Z","title":"On the Robustness of Random Forest Against Untargeted Data Poisoning: An\n Ensemble-Based Approach","summary":" Machine learning is becoming ubiquitous. From finance to medicine, machine\nlearning models are boosting decision-making processes and even outperforming\nhumans in some tasks. This huge progress in terms of prediction quality does\nnot however find a counterpart in the security of such models and corresponding\npredictions, where perturbations of fractions of the training set (poisoning)\ncan seriously undermine the model accuracy. Research on poisoning attacks and\ndefenses received increasing attention in the last decade, leading to several\npromising solutions aiming to increase the robustness of machine learning.\nAmong them, ensemble-based defenses, where different models are trained on\nportions of the training set and their predictions are then aggregated, provide\nstrong theoretical guarantees at the price of a linear overhead. Surprisingly,\nensemble-based defenses, which do not pose any restrictions on the base model,\nhave not been applied to increase the robustness of random forest models. The\nwork in this paper aims to fill in this gap by designing and implementing a\nnovel hash-based ensemble approach that protects random forest against\nuntargeted, random poisoning attacks. An extensive experimental evaluation\nmeasures the performance of our approach against a variety of attacks, as well\nas its sustainability in terms of resource consumption and performance, and\ncompares it with a traditional monolithic model based on random forest. A final\ndiscussion presents our main findings and compares our approach with existing\npoisoning defenses targeting random forests.\n","authors":["Marco Anisetti","Claudio A. Ardagna","Alessandro Balestrucci","Nicola Bena","Ernesto Damiani","Chan Yeob Yeun"],"pdf_url":"https://arxiv.org/pdf/2209.14013v3.pdf","comment":"Accepted in IEEE Transactions on Sustainable Computing; 15 pages, 8\n figures"},{"id":"http://arxiv.org/abs/2302.02092v3","updated":"2023-08-28T07:25:10Z","published":"2023-02-04T04:52:22Z","title":"Interpolation for Robust Learning: Data Augmentation on Wasserstein\n Geodesics","summary":" We propose to study and promote the robustness of a model as per its\nperformance through the interpolation of training data distributions.\nSpecifically, (1) we augment the data by finding the worst-case Wasserstein\nbarycenter on the geodesic connecting subpopulation distributions of different\ncategories. (2) We regularize the model for smoother performance on the\ncontinuous geodesic path connecting subpopulation distributions. (3)\nAdditionally, we provide a theoretical guarantee of robustness improvement and\ninvestigate how the geodesic location and the sample size contribute,\nrespectively. Experimental validations of the proposed strategy on\n\\textit{four} datasets, including CIFAR-100 and ImageNet, establish the\nefficacy of our method, e.g., our method improves the baselines' certifiable\nrobustness on CIFAR10 up to $7.7\\%$, with $16.8\\%$ on empirical robustness on\nCIFAR-100. Our work provides a new perspective of model robustness through the\nlens of Wasserstein geodesic-based interpolation with a practical off-the-shelf\nstrategy that can be combined with existing robust training methods.\n","authors":["Jiacheng Zhu","Jielin Qiu","Aritra Guha","Zhuolin Yang","Xuanlong Nguyen","Bo Li","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2302.02092v3.pdf","comment":"34 pages, 3 figures, 18 tables"},{"id":"http://arxiv.org/abs/2308.14364v1","updated":"2023-08-28T07:23:03Z","published":"2023-08-28T07:23:03Z","title":"Target-independent XLA optimization using Reinforcement Learning","summary":" An important challenge in Machine Learning compilers like XLA is multi-pass\noptimization and analysis. There has been recent interest chiefly in XLA\ntarget-dependent optimization on the graph-level, subgraph-level, and\nkernel-level phases. We specifically focus on target-independent optimization\nXLA HLO pass ordering: our approach aims at finding the optimal sequence of\ncompiler optimization passes, which is decoupled from target-dependent\noptimization. However, there is little domain specific study in pass ordering\nfor XLA HLO. To this end, we propose introducing deep Reinforcement Learning\n(RL) based search for optimal XLA HLO pass ordering. We also propose\nenhancements to the deep RL algorithms to further improve optimal search\nperformance and open the research direction for domain-specific guidance for\nRL. We create an XLA Gym experimentation framework as a tool to enable RL\nalgorithms to interact with the compiler for passing optimizations and thereby\ntrain agents. Overall, in our experimentation we observe an average of $13.3\\%$\nimprovement in operation count reduction on a benchmark of GPT-2 training\ngraphs and $10.4\\%$ improvement on a diverse benchmark including GPT-2, BERT,\nand ResNet graphs using the proposed approach over the compiler's default phase\nordering.\n","authors":["Milan Ganai","Haichen Li","Theodore Enns","Yida Wang","Randy Huang"],"pdf_url":"https://arxiv.org/pdf/2308.14364v1.pdf","comment":"Workshop on ML for Systems @ NeurIPS 2022"},{"id":"http://arxiv.org/abs/2308.14355v1","updated":"2023-08-28T07:03:08Z","published":"2023-08-28T07:03:08Z","title":"Can Transformer and GNN Help Each Other?","summary":" Although Transformer has achieved great success in natural language process\nand computer vision, it has difficulty generalizing to medium and large-scale\ngraph data for two important reasons: (i) High complexity. (ii) Failing to\ncapture the complex and entangled structure information. In graph\nrepresentation learning, Graph Neural Networks(GNNs) can fuse the graph\nstructure and node attributes but have limited receptive fields. Therefore, we\nquestion whether can we combine Transformers and GNNs to help each other. In\nthis paper, we propose a new model named TransGNN where the Transformer layer\nand GNN layer are used alternately to improve each other. Specifically, to\nexpand the receptive field and disentangle the information aggregation from\nedges, we propose using Transformer to aggregate more relevant nodes'\ninformation to improve the message passing of GNNs. Besides, to capture the\ngraph structure information, we utilize positional encoding and make use of the\nGNN layer to fuse the structure into node attributes, which improves the\nTransformer in graph data. We also propose to sample the most relevant nodes\nfor Transformer and two efficient samples update strategies to lower the\ncomplexity. At last, we theoretically prove that TransGNN is more expressive\nthan GNNs only with extra linear complexity. The experiments on eight datasets\ncorroborate the effectiveness of TransGNN on node and graph classification\ntasks.\n","authors":["Peiyan Zhang","Yuchen Yan","Chaozhuo Li","Senzhang Wang","Xing Xie","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14352v1","updated":"2023-08-28T06:56:08Z","published":"2023-08-28T06:56:08Z","title":"EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models","summary":" Large Language Models (LLMs) such as GPTs and LLaMa have ushered in a\nrevolution in machine intelligence, owing to their exceptional capabilities in\na wide range of machine learning tasks. However, the transition of LLMs from\ndata centers to edge devices presents a set of challenges and opportunities.\nWhile this shift can enhance privacy and availability, it is hampered by the\nenormous parameter sizes of these models, leading to impractical runtime costs.\nIn light of these considerations, we introduce EdgeMoE, the first on-device\ninference engine tailored for mixture-of-expert (MoE) LLMs, a popular variant\nof sparse LLMs that exhibit nearly constant computational complexity as their\nparameter size scales. EdgeMoE achieves both memory and computational\nefficiency by strategically partitioning the model across the storage\nhierarchy. Specifically, non-expert weights are stored in the device's memory,\nwhile expert weights are kept in external storage and are fetched into memory\nonly when they are activated. This design is underpinned by a crucial insight\nthat expert weights, though voluminous, are infrequently accessed due to sparse\nactivation patterns. To further mitigate the overhead associated with expert\nI/O swapping, EdgeMoE incorporates two innovative techniques: (1) Expert-wise\nbitwidth adaptation: This method reduces the size of expert weights with an\nacceptable level of accuracy loss. (2) Expert management: It predicts the\nexperts that will be activated in advance and preloads them into the\ncompute-I/O pipeline, thus further optimizing the process. In empirical\nevaluations conducted on well-established MoE LLMs and various edge devices,\nEdgeMoE demonstrates substantial memory savings and performance improvements\nwhen compared to competitive baseline solutions.\n","authors":["Rongjie Yi","Liwei Guo","Shiyun Wei","Ao Zhou","Shangguang Wang","Mengwei Xu"],"pdf_url":"https://arxiv.org/pdf/2308.14352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.12191v2","updated":"2023-08-28T06:54:12Z","published":"2022-12-23T08:20:37Z","title":"Deep Unfolding-based Weighted Averaging for Federated Learning in\n Heterogeneous Environments","summary":" Federated learning is a collaborative model training method that iterates\nmodel updates by multiple clients and aggregation of the updates by a central\nserver. Device and statistical heterogeneity of participating clients cause\nsignificant performance degradation so that an appropriate aggregation weight\nshould be assigned to each client in the aggregation phase of the server. To\nadjust the aggregation weights, this paper employs deep unfolding, which is\nknown as the parameter tuning method that leverages both learning capability\nusing training data like deep learning and domain knowledge. This enables us to\ndirectly incorporate the heterogeneity of the environment of interest into the\ntuning of the aggregation weights. The proposed approach can be combined with\nvarious federated learning algorithms. The results of numerical experiments\nindicate that a higher test accuracy for unknown class-balanced data can be\nobtained with the proposed method than that with conventional heuristic\nweighting methods. The proposed method can handle large-scale learning models\nwith the aid of pretrained models such that it can perform practical real-world\ntasks. Convergence rate of federated learning algorithms with the proposed\nmethod is also provided in this paper.\n","authors":["Ayano Nakai-Kasai","Tadashi Wadayama"],"pdf_url":"https://arxiv.org/pdf/2212.12191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14350v1","updated":"2023-08-28T06:53:31Z","published":"2023-08-28T06:53:31Z","title":"Simple Modification of the Upper Confidence Bound Algorithm by\n Generalized Weighted Averages","summary":" The multi-armed bandit (MAB) problem is a classical problem that models\nsequential decision-making under uncertainty in reinforcement learning. In this\nstudy, we propose a new generalized upper confidence bound (UCB) algorithm\n(GWA-UCB1) by extending UCB1, which is a representative algorithm for MAB\nproblems, using generalized weighted averages, and present an effective\nalgorithm for various problem settings. GWA-UCB1 is a two-parameter\ngeneralization of the balance between exploration and exploitation in UCB1 and\ncan be implemented with a simple modification of the UCB1 formula. Therefore,\nthis algorithm can be easily applied to UCB-based reinforcement learning\nmodels. In preliminary experiments, we investigated the optimal parameters of a\nsimple generalized UCB1 (G-UCB1), prepared for comparison and GWA-UCB1, in a\nstochastic MAB problem with two arms. Subsequently, we confirmed the\nperformance of the algorithms with the investigated parameters on stochastic\nMAB problems when arm reward probabilities were sampled from uniform or normal\ndistributions and on survival MAB problems assuming more realistic situations.\nGWA-UCB1 outperformed G-UCB1, UCB1-Tuned, and Thompson sampling in most problem\nsettings and can be useful in many situations. The code is available at\nhttps://github.com/manome/python-mab.\n","authors":["Nobuhito Manome","Shuji Shinohara","Ung-il Chung"],"pdf_url":"https://arxiv.org/pdf/2308.14350v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14348v1","updated":"2023-08-28T06:48:06Z","published":"2023-08-28T06:48:06Z","title":"Label-free Deep Learning Driven Secure Access Selection in\n Space-Air-Ground Integrated Networks","summary":" In Space-air-ground integrated networks (SAGIN), the inherent openness and\nextensive broadcast coverage expose these networks to significant eavesdropping\nthreats. Considering the inherent co-channel interference due to spectrum\nsharing among multi-tier access networks in SAGIN, it can be leveraged to\nassist the physical layer security among heterogeneous transmissions. However,\nit is challenging to conduct a secrecy-oriented access strategy due to both\nheterogeneous resources and different eavesdropping models. In this paper, we\nexplore secure access selection for a scenario involving multi-mode users\ncapable of accessing satellites, unmanned aerial vehicles, or base stations in\nthe presence of eavesdroppers. Particularly, we propose a Q-network\napproximation based deep learning approach for selecting the optimal access\nstrategy for maximizing the sum secrecy rate. Meanwhile, the power optimization\nis also carried out by an unsupervised learning approach to improve the secrecy\nperformance. Remarkably, two neural networks are trained by unsupervised\nlearning and Q-network approximation which are both label-free methods without\nknowing the optimal solution as labels. Numerical results verify the efficiency\nof our proposed power optimization approach and access strategy, leading to\nenhanced secure transmission performance.\n","authors":["Zhaowei Wang","Zhisheng Yin","Xiucheng Wang","Nan Cheng","Yuan Zhang","Tom H. Luan"],"pdf_url":"https://arxiv.org/pdf/2308.14348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14343v1","updated":"2023-08-28T06:40:02Z","published":"2023-08-28T06:40:02Z","title":"Buy when? Survival machine learning model comparison for purchase timing","summary":" The value of raw data is unlocked by converting it into information and\nknowledge that drives decision-making. Machine Learning (ML) algorithms are\ncapable of analysing large datasets and making accurate predictions. Market\nsegmentation, client lifetime value, and marketing techniques have all made use\nof machine learning. This article examines marketing machine learning\ntechniques such as Support Vector Machines, Genetic Algorithms, Deep Learning,\nand K-Means. ML is used to analyse consumer behaviour, propose items, and make\nother customer choices about whether or not to purchase a product or service,\nbut it is seldom used to predict when a person will buy a product or a basket\nof products. In this paper, the survival models Kernel SVM, DeepSurv, Survival\nRandom Forest, and MTLR are examined to predict tine-purchase individual\ndecisions. Gender, Income, Location, PurchaseHistory, OnlineBehavior,\nInterests, PromotionsDiscounts and CustomerExperience all have an influence on\npurchasing time, according to the analysis. The study shows that the DeepSurv\nmodel predicted purchase completion the best. These insights assist marketers\nin increasing conversion rates.\n","authors":["Diego Vallarino"],"pdf_url":"https://arxiv.org/pdf/2308.14343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14340v1","updated":"2023-08-28T06:32:09Z","published":"2023-08-28T06:32:09Z","title":"HRGCN: Heterogeneous Graph-level Anomaly Detection with Hierarchical\n Relation-augmented Graph Neural Networks","summary":" This work considers the problem of heterogeneous graph-level anomaly\ndetection. Heterogeneous graphs are commonly used to represent behaviours\nbetween different types of entities in complex industrial systems for capturing\nas much information about the system operations as possible. Detecting\nanomalous heterogeneous graphs from a large set of system behaviour graphs is\ncrucial for many real-world applications like online web/mobile service and\ncloud access control. To address the problem, we propose HRGCN, an unsupervised\ndeep heterogeneous graph neural network, to model complex heterogeneous\nrelations between different entities in the system for effectively identifying\nthese anomalous behaviour graphs. HRGCN trains a hierarchical\nrelation-augmented Heterogeneous Graph Neural Network (HetGNN), which learns\nbetter graph representations by modelling the interactions among all the system\nentities and considering both source-to-destination entity (node) types and\ntheir relation (edge) types. Extensive evaluation on two real-world application\ndatasets shows that HRGCN outperforms state-of-the-art competing anomaly\ndetection approaches. We further present a real-world industrial case study to\njustify the effectiveness of HRGCN in detecting anomalous (e.g., congested)\nnetwork devices in a mobile communication service. HRGCN is available at\nhttps://github.com/jiaxililearn/HRGCN.\n","authors":["Jiaxi Li","Guansong Pang","Ling Chen","Mohammad-Reza Namazi-Rad"],"pdf_url":"https://arxiv.org/pdf/2308.14340v1.pdf","comment":"12 pages, 10 figures, 6 tables. Accepted"},{"id":"http://arxiv.org/abs/2308.14338v1","updated":"2023-08-28T06:31:37Z","published":"2023-08-28T06:31:37Z","title":"Fair Few-shot Learning with Auxiliary Sets","summary":" Recently, there has been a growing interest in developing machine learning\n(ML) models that can promote fairness, i.e., eliminating biased predictions\ntowards certain populations (e.g., individuals from a specific demographic\ngroup). Most existing works learn such models based on well-designed fairness\nconstraints in optimization. Nevertheless, in many practical ML tasks, only\nvery few labeled data samples can be collected, which can lead to inferior\nfairness performance. This is because existing fairness constraints are\ndesigned to restrict the prediction disparity among different sensitive groups,\nbut with few samples, it becomes difficult to accurately measure the disparity,\nthus rendering ineffective fairness optimization. In this paper, we define the\nfairness-aware learning task with limited training samples as the \\emph{fair\nfew-shot learning} problem. To deal with this problem, we devise a novel\nframework that accumulates fairness-aware knowledge across different\nmeta-training tasks and then generalizes the learned knowledge to meta-test\ntasks. To compensate for insufficient training samples, we propose an essential\nstrategy to select and leverage an auxiliary set for each meta-test task. These\nauxiliary sets contain several labeled training samples that can enhance the\nmodel performance regarding fairness in meta-test tasks, thereby allowing for\nthe transfer of learned useful fairness-oriented knowledge to meta-test tasks.\nFurthermore, we conduct extensive experiments on three real-world datasets to\nvalidate the superiority of our framework against the state-of-the-art\nbaselines.\n","authors":["Song Wang","Jing Ma","Lu Cheng","Jundong Li"],"pdf_url":"https://arxiv.org/pdf/2308.14338v1.pdf","comment":"ECAI 2023"},{"id":"http://arxiv.org/abs/2102.03895v5","updated":"2023-08-28T06:26:04Z","published":"2021-02-07T19:29:28Z","title":"Functional optimal transport: map estimation and domain adaptation for\n functional data","summary":" We introduce a formulation of optimal transport problem for distributions on\nfunction spaces, where the stochastic map between functional domains can be\npartially represented in terms of an (infinite-dimensional) Hilbert-Schmidt\noperator mapping a Hilbert space of functions to another. For numerous machine\nlearning tasks, data can be naturally viewed as samples drawn from spaces of\nfunctions, such as curves and surfaces, in high dimensions. Optimal transport\nfor functional data analysis provides a useful framework of treatment for such\ndomains. { Since probability measures in infinite dimensional spaces generally\nlack absolute continuity (that is, with respect to non-degenerate Gaussian\nmeasures), the Monge map in the standard optimal transport theory for finite\ndimensional spaces may not exist. Our approach to the optimal transport problem\nin infinite dimensions is by a suitable regularization technique -- we restrict\nthe class of transport maps to be a Hilbert-Schmidt space of operators.} To\nthis end, we develop an efficient algorithm for finding the stochastic\ntransport map between functional domains and provide theoretical guarantees on\nthe existence, uniqueness, and consistency of our estimate for the\nHilbert-Schmidt operator. We validate our method on synthetic datasets and\nexamine the functional properties of the transport map. Experiments on\nreal-world datasets of robot arm trajectories further demonstrate the\neffectiveness of our method on applications in domain adaptation.\n","authors":["Jiacheng Zhu","Aritra Guha","Dat Do","Mengdi Xu","XuanLong Nguyen","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2102.03895v5.pdf","comment":"48 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.14333v1","updated":"2023-08-28T06:22:43Z","published":"2023-08-28T06:22:43Z","title":"DiffSmooth: Certifiably Robust Learning via Diffusion Models and Local\n Smoothing","summary":" Diffusion models have been leveraged to perform adversarial purification and\nthus provide both empirical and certified robustness for a standard model. On\nthe other hand, different robustly trained smoothed models have been studied to\nimprove the certified robustness. Thus, it raises a natural question: Can\ndiffusion model be used to achieve improved certified robustness on those\nrobustly trained smoothed models? In this work, we first theoretically show\nthat recovered instances by diffusion models are in the bounded neighborhood of\nthe original instance with high probability; and the \"one-shot\" denoising\ndiffusion probabilistic models (DDPM) can approximate the mean of the generated\ndistribution of a continuous-time diffusion model, which approximates the\noriginal instance under mild conditions. Inspired by our analysis, we propose a\ncertifiably robust pipeline DiffSmooth, which first performs adversarial\npurification via diffusion models and then maps the purified instances to a\ncommon region via a simple yet effective local smoothing strategy. We conduct\nextensive experiments on different datasets and show that DiffSmooth achieves\nSOTA-certified robustness compared with eight baselines. For instance,\nDiffSmooth improves the SOTA-certified accuracy from $36.0\\%$ to $53.0\\%$ under\n$\\ell_2$ radius $1.5$ on ImageNet. The code is available at\n[https://github.com/javyduck/DiffSmooth].\n","authors":["Jiawei Zhang","Zhongzhu Chen","Huan Zhang","Chaowei Xiao","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2308.14333v1.pdf","comment":"Accepted in 32nd USENIX Security, 2023"},{"id":"http://arxiv.org/abs/2308.14328v1","updated":"2023-08-28T06:15:14Z","published":"2023-08-28T06:15:14Z","title":"Reinforcement Learning for Generative AI: A Survey","summary":" Deep Generative AI has been a long-standing essential topic in the machine\nlearning community, which can impact a number of application areas like text\ngeneration and computer vision. The major paradigm to train a generative model\nis maximum likelihood estimation, which pushes the learner to capture and\napproximate the target data distribution by decreasing the divergence between\nthe model distribution and the target distribution. This formulation\nsuccessfully establishes the objective of generative tasks, while it is\nincapable of satisfying all the requirements that a user might expect from a\ngenerative model. Reinforcement learning, serving as a competitive option to\ninject new training signals by creating new objectives that exploit novel\nsignals, has demonstrated its power and flexibility to incorporate human\ninductive bias from multiple angles, such as adversarial learning,\nhand-designed rules and learned reward model to build a performant model.\nThereby, reinforcement learning has become a trending research field and has\nstretched the limits of generative AI in both model design and application. It\nis reasonable to summarize and conclude advances in recent years with a\ncomprehensive review. Although there are surveys in different application areas\nrecently, this survey aims to shed light on a high-level review that spans a\nrange of application areas. We provide a rigorous taxonomy in this area and\nmake sufficient coverage on various models and applications. Notably, we also\nsurveyed the fast-developing large language model area. We conclude this survey\nby showing the potential directions that might tackle the limit of current\nmodels and expand the frontiers for generative AI.\n","authors":["Yuanjiang Cao","Lina Yao","Julian McAuley","Quan Z. Sheng"],"pdf_url":"https://arxiv.org/pdf/2308.14328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13268v2","updated":"2023-08-28T06:08:49Z","published":"2023-02-26T08:43:08Z","title":"Revolutionizing Genomics with Reinforcement Learning Techniques","summary":" In recent years, Reinforcement Learning (RL) has emerged as a powerful tool\nfor solving a wide range of problems, including decision-making and genomics.\nThe exponential growth of raw genomic data over the past two decades has\nexceeded the capacity of manual analysis, leading to a growing interest in\nautomatic data analysis and processing. RL algorithms are capable of learning\nfrom experience with minimal human supervision, making them well-suited for\ngenomic data analysis and interpretation. One of the key benefits of using RL\nis the reduced cost associated with collecting labeled training data, which is\nrequired for supervised learning. While there have been numerous studies\nexamining the applications of Machine Learning (ML) in genomics, this survey\nfocuses exclusively on the use of RL in various genomics research fields,\nincluding gene regulatory networks (GRNs), genome assembly, and sequence\nalignment. We present a comprehensive technical overview of existing studies on\nthe application of RL in genomics, highlighting the strengths and limitations\nof these approaches. We then discuss potential research directions that are\nworthy of future exploration, including the development of more sophisticated\nreward functions as RL heavily depends on the accuracy of the reward function,\nthe integration of RL with other machine learning techniques, and the\napplication of RL to new and emerging areas in genomics research. Finally, we\npresent our findings and conclude by summarizing the current state of the field\nand the future outlook for RL in genomics.\n","authors":["Mohsen Karami","Roohallah Alizadehsani"," Khadijeh"," Jahanian","Ahmadreza Argha","Iman Dehzangi","Hamid Alinejad-Rokny"],"pdf_url":"https://arxiv.org/pdf/2302.13268v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14322v1","updated":"2023-08-28T06:05:23Z","published":"2023-08-28T06:05:23Z","title":"Machine Unlearning Methodology base on Stochastic Teacher Network","summary":" The rise of the phenomenon of the \"right to be forgotten\" has prompted\nresearch on machine unlearning, which grants data owners the right to actively\nwithdraw data that has been used for model training, and requires the\nelimination of the contribution of that data to the model. A simple method to\nachieve this is to use the remaining data to retrain the model, but this is not\nacceptable for other data owners who continue to participate in training.\nExisting machine unlearning methods have been found to be ineffective in\nquickly removing knowledge from deep learning models. This paper proposes using\na stochastic network as a teacher to expedite the mitigation of the influence\ncaused by forgotten data on the model. We performed experiments on three\ndatasets, and the findings demonstrate that our approach can efficiently\nmitigate the influence of target data on the model within a single epoch. This\nallows for one-time erasure and reconstruction of the model, and the\nreconstruction model achieves the same performance as the retrained model.\n","authors":["Xulong Zhang","Jianzong Wang","Ning Cheng","Yifu Sun","Chuanyao Zhang","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.14322v1.pdf","comment":"Accepted by 19th International Conference on Advanced Data Mining and\n Applications. (ADMA 2023)"},{"id":"http://arxiv.org/abs/2307.03380v3","updated":"2023-08-28T05:47:12Z","published":"2023-07-07T04:20:36Z","title":"On Formal Feature Attribution and Its Approximation","summary":" Recent years have witnessed the widespread use of artificial intelligence\n(AI) algorithms and machine learning (ML) models. Despite their tremendous\nsuccess, a number of vital problems like ML model brittleness, their fairness,\nand the lack of interpretability warrant the need for the active developments\nin explainable artificial intelligence (XAI) and formal ML model verification.\nThe two major lines of work in XAI include feature selection methods, e.g.\nAnchors, and feature attribution techniques, e.g. LIME and SHAP. Despite their\npromise, most of the existing feature selection and attribution approaches are\nsusceptible to a range of critical issues, including explanation unsoundness\nand out-of-distribution sampling. A recent formal approach to XAI (FXAI)\nalthough serving as an alternative to the above and free of these issues\nsuffers from a few other limitations. For instance and besides the scalability\nlimitation, the formal approach is unable to tackle the feature attribution\nproblem. Additionally, a formal explanation despite being formally sound is\ntypically quite large, which hampers its applicability in practical settings.\nMotivated by the above, this paper proposes a way to apply the apparatus of\nformal XAI to the case of feature attribution based on formal explanation\nenumeration. Formal feature attribution (FFA) is argued to be advantageous over\nthe existing methods, both formal and non-formal. Given the practical\ncomplexity of the problem, the paper then proposes an efficient technique for\napproximating exact FFA. Finally, it offers experimental evidence of the\neffectiveness of the proposed approximate FFA in comparison to the existing\nfeature attribution algorithms not only in terms of feature importance and but\nalso in terms of their relative order.\n","authors":["Jinqiang Yu","Alexey Ignatiev","Peter J. Stuckey"],"pdf_url":"https://arxiv.org/pdf/2307.03380v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14308v1","updated":"2023-08-28T05:23:16Z","published":"2023-08-28T05:23:16Z","title":"Policy Diversity for Cooperative Agents","summary":" Standard cooperative multi-agent reinforcement learning (MARL) methods aim to\nfind the optimal team cooperative policy to complete a task. However there may\nexist multiple different ways of cooperating, which usually are very needed by\ndomain experts. Therefore, identifying a set of significantly different\npolicies can alleviate the task complexity for them. Unfortunately, there is a\ngeneral lack of effective policy diversity approaches specifically designed for\nthe multi-agent domain. In this work, we propose a method called\nMoment-Matching Policy Diversity to alleviate this problem. This method can\ngenerate different team policies to varying degrees by formalizing the\ndifference between team policies as the difference in actions of selected\nagents in different policies. Theoretically, we show that our method is a\nsimple way to implement a constrained optimization problem that regularizes the\ndifference between two trajectory distributions by using the maximum mean\ndiscrepancy. The effectiveness of our approach is demonstrated on a challenging\nteam-based shooter.\n","authors":["Mingxi Tan","Andong Tian","Ludovic Denoyer"],"pdf_url":"https://arxiv.org/pdf/2308.14308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.05490v7","updated":"2023-08-28T05:03:48Z","published":"2022-04-12T02:49:27Z","title":"Continuous-Time User Preference Modelling for Temporal Sets Prediction","summary":" Given a sequence of sets, where each set has a timestamp and contains an\narbitrary number of elements, temporal sets prediction aims to predict the\nelements in the subsequent set. Previous studies for temporal sets prediction\nmainly focus on the modelling of elements and implicitly represent each user's\npreference based on his/her interacted elements. However, user preferences are\noften continuously evolving and the evolutionary trend cannot be fully captured\nwith the indirect learning paradigm of user preferences. To this end, we\npropose a continuous-time user preference modelling framework for temporal sets\nprediction, which explicitly models the evolving preference of each user by\nmaintaining a memory bank to store the states of all the users and elements.\nSpecifically, we first construct a universal sequence by arranging all the\nuser-set interactions in a non-descending temporal order, and then\nchronologically learn from each user-set interaction. For each interaction, we\ncontinuously update the memories of the related user and elements based on\ntheir currently encoded messages and past memories. Moreover, we present a\npersonalized user behavior learning module to discover user-specific\ncharacteristics based on each user's historical sequence, which aggregates the\npreviously interacted elements from dual perspectives according to the user and\nelements. Finally, we develop a set-batch algorithm to improve the model\nefficiency, which can create time-consistent batches in advance and achieve\n3.5x and 3.0x speedups in the training and evaluation process on average.\nExperiments on four real-world datasets demonstrate the superiority of our\napproach over state-of-the-arts under both transductive and inductive settings.\nThe good interpretability of our method is also shown.\n","authors":["Le Yu","Zihang Liu","Leilei Sun","Bowen Du","Chuanren Liu","Weifeng Lv"],"pdf_url":"https://arxiv.org/pdf/2204.05490v7.pdf","comment":"Accepted by the TKDE journal"},{"id":"http://arxiv.org/abs/2308.03312v3","updated":"2023-08-28T04:53:52Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.13799v3","updated":"2023-08-28T04:52:53Z","published":"2022-02-28T13:48:41Z","title":"One-shot Ultra-high-Resolution Generative Adversarial Network That\n Synthesizes 16K Images On A Single GPU","summary":" We propose a one-shot ultra-high-resolution generative adversarial network\n(OUR-GAN) framework that generates non-repetitive 16K (16, 384 x 8, 640) images\nfrom a single training image and is trainable on a single consumer GPU. OUR-GAN\ngenerates an initial image that is visually plausible and varied in shape at\nlow resolution, and then gradually increases the resolution by adding detail\nthrough super-resolution. Since OUR-GAN learns from a real\nultra-high-resolution (UHR) image, it can synthesize large shapes with fine\ndetails and long-range coherence, which is difficult to achieve with\nconventional generative models that rely on the patch distribution learned from\nrelatively small images. OUR-GAN can synthesize high-quality 16K images with\n12.5 GB of GPU memory and 4K images with only 4.29 GB as it synthesizes a UHR\nimage part by part through seamless subregion-wise super-resolution.\nAdditionally, OUR-GAN improves visual coherence while maintaining diversity by\napplying vertical positional convolution. In experiments on the ST4K and RAISE\ndatasets, OUR-GAN exhibited improved fidelity, visual coherency, and diversity\ncompared with the baseline one-shot synthesis models. To the best of our\nknowledge, OUR-GAN is the first one-shot image synthesizer that generates\nnon-repetitive UHR images on a single consumer GPU. The synthesized image\nsamples are presented at https://our-gan.github.io.\n","authors":["Junseok Oh","Donghwee Yoon","Injung Kim"],"pdf_url":"https://arxiv.org/pdf/2202.13799v3.pdf","comment":"36 pages, 26 figures"},{"id":"http://arxiv.org/abs/2303.12091v2","updated":"2023-08-28T04:50:57Z","published":"2023-03-21T09:07:15Z","title":"Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised\n Learning","summary":" Semi-supervised learning (SSL) methods assume that labeled data, unlabeled\ndata and test data are from the same distribution. Open-set semi-supervised\nlearning (Open-set SSL) considers a more practical scenario, where unlabeled\ndata and test data contain new categories (outliers) not observed in labeled\ndata (inliers). Most previous works focused on outlier detection via binary\nclassifiers, which suffer from insufficient scalability and inability to\ndistinguish different types of uncertainty. In this paper, we propose a novel\nframework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these\nlimitations. Concretely, we first introduce evidential deep learning (EDL) as\nan outlier detector to quantify different types of uncertainty, and design\ndifferent uncertainty metrics for self-training and inference. Furthermore, we\npropose a novel adaptive negative optimization strategy, making EDL more\ntailored to the unlabeled dataset containing both inliers and outliers. As\ndemonstrated empirically, our proposed method outperforms existing\nstate-of-the-art methods across four datasets.\n","authors":["Yang Yu","Danruo Deng","Furui Liu","Yueming Jin","Qi Dou","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2303.12091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11760v2","updated":"2023-08-28T04:50:40Z","published":"2022-11-21T07:26:56Z","title":"A Low Latency Adaptive Coding Spiking Framework for Deep Reinforcement\n Learning","summary":" In recent years, spiking neural networks (SNNs) have been used in\nreinforcement learning (RL) due to their low power consumption and event-driven\nfeatures. However, spiking reinforcement learning (SRL), which suffers from\nfixed coding methods, still faces the problems of high latency and poor\nversatility. In this paper, we use learnable matrix multiplication to encode\nand decode spikes, improving the flexibility of the coders and thus reducing\nlatency. Meanwhile, we train the SNNs using the direct training method and use\ntwo different structures for online and offline RL algorithms, which gives our\nmodel a wider range of applications. Extensive experiments have revealed that\nour method achieves optimal performance with ultra-low latency (as low as 0.8%\nof other SRL methods) and excellent energy efficiency (up to 5X the DNNs) in\ndifferent algorithms and different environments.\n","authors":["Lang Qin","Rui Yan","Huajin Tang"],"pdf_url":"https://arxiv.org/pdf/2211.11760v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14853v2","updated":"2023-08-28T04:46:52Z","published":"2023-06-26T17:07:54Z","title":"Near-Optimal Nonconvex-Strongly-Convex Bilevel Optimization with Fully\n First-Order Oracles","summary":" Bilevel optimization has wide applications such as hyperparameter tuning,\nneural architecture search, and meta-learning. Designing efficient algorithms\nfor bilevel optimization is challenging because the lower-level problem defines\na feasibility set implicitly via another optimization problem. In this work, we\nconsider one tractable case when the lower-level problem is strongly convex.\nRecent works show that with a Hessian-vector product oracle, one can provably\nfind an $\\epsilon$-first-order stationary point within\n$\\tilde{\\mathcal{O}}(\\epsilon^{-2})$ oracle calls. However, Hessian-vector\nproduct may be inaccessible or expensive in practice. Kwon et al. (ICML 2023)\naddressed this issue by proposing a first-order method that can achieve the\nsame goal at a slower rate of $\\tilde{\\mathcal{O}}(\\epsilon^{-3})$. In this\nwork, we provide a tighter analysis demonstrating that this method can converge\nat the near-optimal $\\tilde {\\mathcal{O}}(\\epsilon^{-2})$ rate as second-order\nmethods. Our analysis further leads to simple first-order algorithms that\nachieve similar convergence rates for finding second-order stationary points\nand for distributed bilevel problems.\n","authors":["Lesi Chen","Yaohua Ma","Jingzhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.14853v2.pdf","comment":"slightly change the title"},{"id":"http://arxiv.org/abs/2308.14304v1","updated":"2023-08-28T04:37:38Z","published":"2023-08-28T04:37:38Z","title":"Solving Attention Kernel Regression Problem via Pre-conditioner","summary":" Large language models have shown impressive performance in many tasks. One of\nthe major features from the computation perspective is computing the attention\nmatrix. Previous works [Zandieh, Han, Daliri, and Karba 2023, Alman and Song\n2023] have formally studied the possibility and impossibility of approximating\nthe attention matrix. In this work, we define and study a new problem which is\ncalled the attention kernel regression problem. We show how to solve the\nattention kernel regression in the input sparsity time of the data matrix.\n","authors":["Zhao Song","Junze Yin","Lichen Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14295v1","updated":"2023-08-28T04:29:49Z","published":"2023-08-28T04:29:49Z","title":"Traffic Light Control with Reinforcement Learning","summary":" Traffic light control is important for reducing congestion in urban mobility\nsystems. This paper proposes a real-time traffic light control method using\ndeep Q learning. Our approach incorporates a reward function considering queue\nlengths, delays, travel time, and throughput. The model dynamically decides\nphase changes based on current traffic conditions. The training of the deep Q\nnetwork involves an offline stage from pre-generated data with fixed schedules\nand an online stage using real-time traffic data. A deep Q network structure\nwith a \"phase gate\" component is used to simplify the model's learning task\nunder different phases. A \"memory palace\" mechanism is used to address sample\nimbalance during the training process. We validate our approach using both\nsynthetic and real-world traffic flow data on a road intersecting in Hangzhou,\nChina. Results demonstrate significant performance improvements of the proposed\nmethod in reducing vehicle waiting time (57.1% to 100%), queue lengths (40.9%\nto 100%), and total travel time (16.8% to 68.0%) compared to traditional fixed\nsignal plans.\n","authors":["Taoyu Pan"],"pdf_url":"https://arxiv.org/pdf/2308.14295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13298v2","updated":"2023-08-28T04:21:38Z","published":"2023-08-25T10:47:37Z","title":"Federated Linear Bandit Learning via Over-the-Air Computation","summary":" In this paper, we investigate federated contextual linear bandit learning\nwithin a wireless system that comprises a server and multiple devices. Each\ndevice interacts with the environment, selects an action based on the received\nreward, and sends model updates to the server. The primary objective is to\nminimize cumulative regret across all devices within a finite time horizon. To\nreduce the communication overhead, devices communicate with the server via\nover-the-air computation (AirComp) over noisy fading channels, where the\nchannel noise may distort the signals. In this context, we propose a customized\nfederated linear bandits scheme, where each device transmits an analog signal,\nand the server receives a superposition of these signals distorted by channel\nnoise. A rigorous mathematical analysis is conducted to determine the regret\nbound of the proposed scheme. Both theoretical analysis and numerical\nexperiments demonstrate the competitive performance of our proposed scheme in\nterms of regret bounds in various settings.\n","authors":["Jiali Wang","Yuning Jiang","Xin Liu","Ting Wang","Yuanming Shi"],"pdf_url":"https://arxiv.org/pdf/2308.13298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14272v1","updated":"2023-08-28T03:03:03Z","published":"2023-08-28T03:03:03Z","title":"Goodhart's Law Applies to NLP's Explanation Benchmarks","summary":" Despite the rising popularity of saliency-based explanations, the research\ncommunity remains at an impasse, facing doubts concerning their purpose,\nefficacy, and tendency to contradict each other. Seeking to unite the\ncommunity's efforts around common goals, several recent works have proposed\nevaluation metrics. In this paper, we critically examine two sets of metrics:\nthe ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics,\nfocusing our inquiry on natural language processing. First, we show that we can\ninflate a model's comprehensiveness and sufficiency scores dramatically without\naltering its predictions or explanations on in-distribution test inputs. Our\nstrategy exploits the tendency for extracted explanations and their complements\nto be \"out-of-support\" relative to each other and in-distribution inputs. Next,\nwe demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple\nmethod that encodes the label, even though EVAL-X is precisely motivated to\naddress such exploits. Our results raise doubts about the ability of current\nmetrics to guide explainability research, underscoring the need for a broader\nreassessment of what precisely these metrics are intended to capture.\n","authors":["Jennifer Hsia","Danish Pruthi","Aarti Singh","Zachary C. Lipton"],"pdf_url":"https://arxiv.org/pdf/2308.14272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.01079v4","updated":"2023-08-28T02:58:42Z","published":"2022-01-04T10:49:30Z","title":"Incomplete Multi-View Weak-Label Learning","summary":" A variety of modern applications exhibit multi-view multi-label learning,\nwhere each sample has multi-view features, and multiple labels are correlated\nvia common views. Current methods usually fail to directly deal with the\nsetting where only a subset of features and labels are observed for each\nsample, and ignore the presence of noisy views and imbalanced labels in\nreal-world problems. In this paper, we propose a novel method to overcome the\nlimitations. It jointly embeds incomplete views and weak labels into a\nlow-dimensional subspace with adaptive weights, and facilitates the difference\nbetween embedding weight matrices via auto-weighted Hilbert-Schmidt\nIndependence Criterion (HSIC) to reduce the redundancy. Moreover, it adaptively\nlearns view-wise importance for embedding to detect noisy views, and mitigates\nthe label imbalance problem by focal loss. Experimental results on four\nreal-world multi-view multi-label datasets demonstrate the effectiveness of the\nproposed method.\n","authors":["Zhiwei Li","Zijian Yang","Lu Sun","Mineichi Kudo","Kego Kimura"],"pdf_url":"https://arxiv.org/pdf/2201.01079v4.pdf","comment":"6 pages, 2 figures, conference"},{"id":"http://arxiv.org/abs/2308.14267v1","updated":"2023-08-28T02:49:07Z","published":"2023-08-28T02:49:07Z","title":"Unleash Model Potential: Bootstrapped Meta Self-supervised Learning","summary":" The long-term goal of machine learning is to learn general visual\nrepresentations from a small amount of data without supervision, mimicking\nthree advantages of human cognition: i) no need for labels, ii) robustness to\ndata scarcity, and iii) learning from experience. Self-supervised learning and\nmeta-learning are two promising techniques to achieve this goal, but they both\nonly partially capture the advantages and fail to address all the problems.\nSelf-supervised learning struggles to overcome the drawbacks of data scarcity,\nwhile ignoring prior knowledge that can facilitate learning and generalization.\nMeta-learning relies on supervised information and suffers from a bottleneck of\ninsufficient learning. To address these issues, we propose a novel Bootstrapped\nMeta Self-Supervised Learning (BMSSL) framework that aims to simulate the human\nlearning process. We first analyze the close relationship between meta-learning\nand self-supervised learning. Based on this insight, we reconstruct tasks to\nleverage the strengths of both paradigms, achieving advantages i and ii.\nMoreover, we employ a bi-level optimization framework that alternates between\nsolving specific tasks with a learned ability (first level) and improving this\nability (second level), attaining advantage iii. To fully harness its power, we\nintroduce a bootstrapped target based on meta-gradient to make the model its\nown teacher. We validate the effectiveness of our approach with comprehensive\ntheoretical and empirical study.\n","authors":["Jingyao Wang","Zeen Song","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.14267v1.pdf","comment":"submitted to NIPS"},{"id":"http://arxiv.org/abs/2308.14258v1","updated":"2023-08-28T02:25:11Z","published":"2023-08-28T02:25:11Z","title":"Breaking Boundaries: Distributed Domain Decomposition with Scalable\n Physics-Informed Neural PDE Solvers","summary":" Mosaic Flow is a novel domain decomposition method designed to scale\nphysics-informed neural PDE solvers to large domains. Its unique approach\nleverages pre-trained networks on small domains to solve partial differential\nequations on large domains purely through inference, resulting in high\nreusability. This paper presents an end-to-end parallelization of Mosaic Flow,\ncombining data parallel training and domain parallelism for inference on\nlarge-scale problems. By optimizing the network architecture and data parallel\ntraining, we significantly reduce the training time for learning the Laplacian\noperator to minutes on 32 GPUs. Moreover, our distributed domain decomposition\nalgorithm enables scalable inferences for solving the Laplace equation on\ndomains 4096 times larger than the training domain, demonstrating strong\nscaling while maintaining accuracy on 32 GPUs. The reusability of Mosaic Flow,\ncombined with the improved performance achieved through the distributed-memory\nalgorithms, makes it a promising tool for modeling complex physical phenomena\nand accelerating scientific discovery.\n","authors":["Arthur Feeney","Zitong Li","Ramin Bostanabad","Aparna Chandramowlishwaran"],"pdf_url":"https://arxiv.org/pdf/2308.14258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14253v1","updated":"2023-08-28T02:10:38Z","published":"2023-08-28T02:10:38Z","title":"The Promise and Peril of Artificial Intelligence -- Violet Teaming\n Offers a Balanced Path Forward","summary":" Artificial intelligence (AI) promises immense benefits across sectors, yet\nalso poses risks from dual-use potentials, biases, and unintended behaviors.\nThis paper reviews emerging issues with opaque and uncontrollable AI systems\nand proposes an integrative framework called violet teaming to develop reliable\nand responsible AI. Violet teaming combines adversarial vulnerability probing\n(red teaming) with solutions for safety and security (blue teaming) while\nprioritizing ethics and social benefit. It emerged from AI safety research to\nmanage risks proactively by design. The paper traces the evolution of red,\nblue, and purple teaming toward violet teaming, and then discusses applying\nviolet techniques to address biosecurity risks of AI in biotechnology.\nAdditional sections review key perspectives across law, ethics, cybersecurity,\nmacrostrategy, and industry best practices essential for operationalizing\nresponsible AI through holistic technical and social considerations. Violet\nteaming provides both philosophy and method for steering AI trajectories toward\nsocietal good. With conscience and wisdom, the extraordinary capabilities of AI\ncan enrich humanity. But without adequate precaution, the risks could prove\ncatastrophic. Violet teaming aims to empower moral technology for the common\nwelfare.\n","authors":["Alexander J. Titus","Adam H. Russell"],"pdf_url":"https://arxiv.org/pdf/2308.14253v1.pdf","comment":"14 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.14250v1","updated":"2023-08-28T01:57:38Z","published":"2023-08-28T01:57:38Z","title":"Rule-Based Error Detection and Correction to Operationalize Movement\n Trajectory Classification","summary":" Classification of movement trajectories has many applications in\ntransportation. Supervised neural models represent the current\nstate-of-the-art. Recent security applications require this task to be rapidly\nemployed in environments that may differ from the data used to train such\nmodels for which there is little training data. We provide a neuro-symbolic\nrule-based framework to conduct error correction and detection of these models\nto support eventual deployment in security applications. We provide a suite of\nexperiments on several recent and state-of-the-art models and show an accuracy\nimprovement of 1.7% over the SOTA model in the case where all classes are\npresent in training and when 40% of classes are omitted from training, we\nobtain a 5.2% improvement (zero-shot) and 23.9% (few-shot) improvement over the\nSOTA model without resorting to retraining of the base model.\n","authors":["Bowen Xi","Kevin Scaria","Paulo Shakarian"],"pdf_url":"https://arxiv.org/pdf/2308.14250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11339v2","updated":"2023-08-28T01:50:00Z","published":"2023-08-22T10:36:56Z","title":"ProAgent: Building Proactive Cooperative AI with Large Language Models","summary":" Building AIs with adaptive behaviors in human-AI cooperation stands as a\npivotal focus in AGI research. Current methods for developing cooperative\nagents predominantly rely on learning-based methods, where policy\ngeneralization heavily hinges on past interactions with specific teammates.\nThese approaches constrain the agent's capacity to recalibrate its strategy\nwhen confronted with novel teammates. We propose \\textbf{ProAgent}, a novel\nframework that harnesses large language models (LLMs) to fashion a\n\\textit{pro}active \\textit{agent} empowered with the ability to anticipate\nteammates' forthcoming decisions and formulate enhanced plans for itself.\nProAgent excels at cooperative reasoning with the capacity to dynamically adapt\nits behavior to enhance collaborative efforts with teammates. Moreover, the\nProAgent framework exhibits a high degree of modularity and interpretability,\nfacilitating seamless integration to address a wide array of coordination\nscenarios. Experimental evaluations conducted within the framework of\n\\textit{Overcook-AI} unveil the remarkable performance superiority of ProAgent,\noutperforming five methods based on self-play and population-based training in\ncooperation with AI agents. Further, when cooperating with human proxy models,\nits performance exhibits an average improvement exceeding 10\\% compared to the\ncurrent state-of-the-art, COLE. The advancement was consistently observed\nacross diverse scenarios involving interactions with both AI agents of varying\ncharacteristics and human counterparts. These findings inspire future research\nfor human-robot collaborations. For a hands-on demonstration, please visit\n\\url{https://pku-proagent.github.io}.\n","authors":["Ceyao Zhang","Kaijie Yang","Siyi Hu","Zihao Wang","Guanghe Li","Yihang Sun","Cheng Zhang","Zhaowei Zhang","Anji Liu","Song-Chun Zhu","Xiaojun Chang","Junge Zhang","Feng Yin","Yitao Liang","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.11339v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04263v2","updated":"2023-08-28T01:39:00Z","published":"2023-08-08T13:59:56Z","title":"BarlowRL: Barlow Twins for Data-Efficient Reinforcement Learning","summary":" This paper introduces BarlowRL, a data-efficient reinforcement learning agent\nthat combines the Barlow Twins self-supervised learning framework with DER\n(Data-Efficient Rainbow) algorithm. BarlowRL outperforms both DER and its\ncontrastive counterpart CURL on the Atari 100k benchmark. BarlowRL avoids\ndimensional collapse by enforcing information spread to the whole space. This\nhelps RL algorithms to utilize uniformly spread state representation that\neventually results in a remarkable performance. The integration of Barlow Twins\nwith DER enhances data efficiency and achieves superior performance in the RL\ntasks. BarlowRL demonstrates the potential of incorporating self-supervised\nlearning techniques to improve RL algorithms.\n","authors":["Omer Veysel Cagatan","Baris Akgun"],"pdf_url":"https://arxiv.org/pdf/2308.04263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14245v1","updated":"2023-08-28T01:21:22Z","published":"2023-08-28T01:21:22Z","title":"A Comparison of Personalized and Generalized Approaches to Emotion\n Recognition Using Consumer Wearable Devices: Machine Learning Study","summary":" Background: Studies have shown the potential adverse health effects, ranging\nfrom headaches to cardiovascular disease, associated with long-term negative\nemotions and chronic stress. Since many indicators of stress are imperceptible\nto observers, the early detection and intervention of stress remains a pressing\nmedical need. Physiological signals offer a non-invasive method of monitoring\nemotions and are easily collected by smartwatches. Existing research primarily\nfocuses on developing generalized machine learning-based models for emotion\nclassification. Objective: We aim to study the differences between personalized\nand generalized machine learning models for three-class emotion classification\n(neutral, stress, and amusement) using wearable biosignal data. Methods: We\ndeveloped a convolutional encoder for the three-class emotion classification\nproblem using data from WESAD, a multimodal dataset with physiological signals\nfor 15 subjects. We compared the results between a subject-exclusive\ngeneralized, subject-inclusive generalized, and personalized model. Results:\nFor the three-class classification problem, our personalized model achieved an\naverage accuracy of 95.06% and F1-score of 91.71, our subject-inclusive\ngeneralized model achieved an average accuracy of 66.95% and F1-score of 42.50,\nand our subject-exclusive generalized model achieved an average accuracy of\n67.65% and F1-score of 43.05. Conclusions: Our results emphasize the need for\nincreased research in personalized emotion recognition models given that they\noutperform generalized models in certain contexts. We also demonstrate that\npersonalized machine learning models for emotion classification are viable and\ncan achieve high performance.\n","authors":["Joe Li","Peter Washington"],"pdf_url":"https://arxiv.org/pdf/2308.14245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.17262v2","updated":"2023-08-28T01:17:32Z","published":"2022-10-31T12:36:37Z","title":"QNet: A Quantum-native Sequence Encoder Architecture","summary":" This work proposes QNet, a novel sequence encoder model that entirely\ninferences on the quantum computer using a minimum number of qubits. Let $n$\nand $d$ represent the length of the sequence and the embedding size,\nrespectively. The dot-product attention mechanism requires a time complexity of\n$O(n^2 \\cdot d)$, while QNet has merely $O(n+d)$ quantum circuit depth. In\naddition, we introduce ResQNet, a quantum-classical hybrid model composed of\nseveral QNet blocks linked by residual connections, as an isomorph Transformer\nEncoder. We evaluated our work on various natural language processing tasks,\nincluding text classification, rating score prediction, and named entity\nrecognition. Our models exhibit compelling performance over classical\nstate-of-the-art models with a thousand times fewer parameters. In summary,\nthis work investigates the advantage of machine learning on near-term quantum\ncomputers in sequential data by experimenting with natural language processing\ntasks.\n","authors":["Wei Day","Hao-Sheng Chen","Min-Te Sun"],"pdf_url":"https://arxiv.org/pdf/2210.17262v2.pdf","comment":"QCE23: 2023 IEEE International Conference on Quantum Computing &\n Engineering"},{"id":"http://arxiv.org/abs/2308.13111v2","updated":"2023-08-28T00:38:43Z","published":"2023-08-24T23:06:21Z","title":"Bayesian low-rank adaptation for large language models","summary":" Parameter-efficient fine-tuning (PEFT) has emerged as a new paradigm for\ncost-efficient fine-tuning of large language models (LLMs), with low-rank\nadaptation (LoRA) being a widely adopted choice. However, fine-tuned LLMs often\nbecome overconfident especially when fine-tuned on small datasets. Bayesian\nmethods, with their inherent ability to estimate uncertainty, serve as potent\ntools to mitigate overconfidence and enhance calibration. In this work, we\nintroduce Laplace-LoRA, a straightforward yet effective Bayesian method, which\napplies the Laplace approximation to the LoRA parameters and, considerably\nboosts the calibration of fine-tuned LLMs.\n","authors":["Adam X. Yang","Maxime Robeyns","Xi Wang","Laurence Aitchison"],"pdf_url":"https://arxiv.org/pdf/2308.13111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09658v2","updated":"2023-08-28T00:36:11Z","published":"2023-03-16T21:31:55Z","title":"Energy Management of Multi-mode Plug-in Hybrid Electric Vehicle using\n Multi-agent Deep Reinforcement Learning","summary":" The recently emerging multi-mode plug-in hybrid electric vehicle (PHEV)\ntechnology is one of the pathways making contributions to decarbonization, and\nits energy management requires multiple-input and multipleoutput (MIMO)\ncontrol. At the present, the existing methods usually decouple the MIMO control\ninto singleoutput (MISO) control and can only achieve its local optimal\nperformance. To optimize the multi-mode vehicle globally, this paper studies a\nMIMO control method for energy management of the multi-mode PHEV based on\nmulti-agent deep reinforcement learning (MADRL). By introducing a relevance\nratio, a hand-shaking strategy is proposed to enable two learning agents to\nwork collaboratively under the MADRL framework using the deep deterministic\npolicy gradient (DDPG) algorithm. Unified settings for the DDPG agents are\nobtained through a sensitivity analysis of the influencing factors to the\nlearning performance. The optimal working mode for the hand-shaking strategy is\nattained through a parametric study on the relevance ratio. The advantage of\nthe proposed energy management method is demonstrated on a software-in-the-loop\ntesting platform. The result of the study indicates that the learning rate of\nthe DDPG agents is the greatest influencing factor for learning performance.\nUsing the unified DDPG settings and a relevance ratio of 0.2, the proposed\nMADRL system can save up to 4% energy compared to the single-agent learning\nsystem and up to 23.54% energy compared to the conventional rule-based system.\n","authors":["Min Hua","Cetengfei Zhang","Fanggang Zhang","Zhi Li","Xiaoli Yu","Hongming Xu","Quan Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.09658v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14239v1","updated":"2023-08-28T00:34:40Z","published":"2023-08-28T00:34:40Z","title":"Quantum Next Generation Reservoir Computing: An Efficient Quantum\n Algorithm for Forecasting Quantum Dynamics","summary":" Next Generation Reservoir Computing (NG-RC) is a modern class of model-free\nmachine learning that enables an accurate forecasting of time series data\ngenerated by dynamical systems. We demonstrate that NG-RC can accurately\npredict full many-body quantum dynamics, instead of merely concentrating on the\ndynamics of observables, which is the conventional application of reservoir\ncomputing. In addition, we apply a technique which we refer to as skipping\nahead to predict far future states accurately without the need to extract\ninformation about the intermediate states. However, adopting a classical NG-RC\nfor many-body quantum dynamics prediction is computationally prohibitive due to\nthe large Hilbert space of sample input data. In this work, we propose an\nend-to-end quantum algorithm for many-body quantum dynamics forecasting with a\nquantum computational speedup via the block-encoding technique. This proposal\npresents an efficient model-free quantum scheme to forecast quantum dynamics\ncoherently, bypassing inductive biases incurred in a model-based approach.\n","authors":["Apimuk Sornsaeng","Ninnat Dangniam","Thiparat Chotibut"],"pdf_url":"https://arxiv.org/pdf/2308.14239v1.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.14946v1","updated":"2023-08-28T23:55:23Z","published":"2023-08-28T23:55:23Z","title":"Reinforcement Learning for Sampling on Temporal Medical Imaging\n Sequences","summary":" Accelerated magnetic resonance imaging resorts to either Fourier-domain\nsubsampling or better reconstruction algorithms to deal with fewer measurements\nwhile still generating medical images of high quality. Determining the optimal\nsampling strategy given a fixed reconstruction protocol often has combinatorial\ncomplexity. In this work, we apply double deep Q-learning and REINFORCE\nalgorithms to learn the sampling strategy for dynamic image reconstruction. We\nconsider the data in the format of time series, and the reconstruction method\nis a pre-trained autoencoder-typed neural network. We present a proof of\nconcept that reinforcement learning algorithms are effective to discover the\noptimal sampling pattern which underlies the pre-trained reconstructor network\n(i.e., the dynamics in the environment). The code for replicating experiments\ncan be found at https://github.com/zhishenhuang/RLsamp.\n","authors":["Zhishen Huang"],"pdf_url":"https://arxiv.org/pdf/2308.14946v1.pdf","comment":"ICML 2023 Workshop SODS"},{"id":"http://arxiv.org/abs/2308.14945v1","updated":"2023-08-28T23:51:33Z","published":"2023-08-28T23:51:33Z","title":"Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals","summary":" We consider the problem of sampling from a distribution governed by a\npotential function. This work proposes an explicit score-based MCMC method that\nis deterministic, resulting in a deterministic evolution for particles rather\nthan a stochastic differential equation evolution. The score term is given in\nclosed form by a regularized Wasserstein proximal, using a kernel convolution\nthat is approximated by sampling. We demonstrate fast convergence on various\nproblems and show improved dimensional dependence of mixing time bounds for the\ncase of Gaussian distributions compared to the unadjusted Langevin algorithm\n(ULA) and the Metropolis-adjusted Langevin algorithm (MALA). We additionally\nderive closed form expressions for the distributions at each iterate for\nquadratic potential functions, characterizing the variance reduction. Empirical\nresults demonstrate that the particles behave in an organized manner, lying on\nlevel set contours of the potential. Moreover, the posterior mean estimator of\nthe proposed method is shown to be closer to the maximum a-posteriori estimator\ncompared to ULA and MALA, in the context of Bayesian logistic regression.\n","authors":["Hong Ye Tan","Stanley Osher","Wuchen Li"],"pdf_url":"https://arxiv.org/pdf/2308.14945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14938v1","updated":"2023-08-28T23:33:07Z","published":"2023-08-28T23:33:07Z","title":"Entropy-based Guidance of Deep Neural Networks for Accelerated\n Convergence and Improved Performance","summary":" Neural networks have dramatically increased our capacity to learn from large,\nhigh-dimensional datasets across innumerable disciplines. However, their\ndecisions are not easily interpretable, their computational costs are high, and\nbuilding and training them are uncertain processes. To add structure to these\nefforts, we derive new mathematical results to efficiently measure the changes\nin entropy as fully-connected and convolutional neural networks process data,\nand introduce entropy-based loss terms. Experiments in image compression and\nimage classification on benchmark datasets demonstrate these losses guide\nneural networks to learn rich latent data representations in fewer dimensions,\nconverge in fewer training epochs, and achieve better test metrics.\n","authors":["Mackenzie J. Meni","Ryan T. White","Michael Mayo","Kevin Pilkiewicz"],"pdf_url":"https://arxiv.org/pdf/2308.14938v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2204.01585v4","updated":"2023-08-28T23:24:59Z","published":"2022-04-04T15:33:21Z","title":"Differentially Private Sampling from Rashomon Sets, and the Universality\n of Langevin Diffusion for Convex Optimization","summary":" In this paper we provide an algorithmic framework based on Langevin diffusion\n(LD) and its corresponding discretizations that allow us to simultaneously\nobtain: i) An algorithm for sampling from the exponential mechanism, whose\nprivacy analysis does not depend on convexity and which can be stopped at\nanytime without compromising privacy, and ii) tight uniform stability\nguarantees for the exponential mechanism. As a direct consequence, we obtain\noptimal excess empirical and population risk guarantees for (strongly) convex\nlosses under both pure and approximate differential privacy (DP). The framework\nallows us to design a DP uniform sampler from the Rashomon set. Rashomon sets\nare widely used in interpretable and robust machine learning, understanding\nvariable importance, and characterizing fairness.\n","authors":["Arun Ganesh","Abhradeep Thakurta","Jalaj Upadhyay"],"pdf_url":"https://arxiv.org/pdf/2204.01585v4.pdf","comment":"Appeared in COLT 2023. For ease of presentation, some results appear\n in the previous version of this paper on arXiv (v3) that do not appear in\n this version, nor are subsumed by results in this version. Please see Section\n 1.4 for more details"},{"id":"http://arxiv.org/abs/2306.16740v3","updated":"2023-08-28T23:19:14Z","published":"2023-06-29T07:31:43Z","title":"Principles and Guidelines for Evaluating Social Robot Navigation\n Algorithms","summary":" A major challenge to deploying robots widely is navigation in human-populated\nenvironments, commonly referred to as social robot navigation. While the field\nof social navigation has advanced tremendously in recent years, the fair\nevaluation of algorithms that tackle social navigation remains hard because it\ninvolves not just robotic agents moving in static environments but also dynamic\nhuman agents and their perceptions of the appropriateness of robot behavior. In\ncontrast, clear, repeatable, and accessible benchmarks have accelerated\nprogress in fields like computer vision, natural language processing and\ntraditional robot navigation by enabling researchers to fairly compare\nalgorithms, revealing limitations of existing solutions and illuminating\npromising new directions. We believe the same approach can benefit social\nnavigation. In this paper, we pave the road towards common, widely accessible,\nand repeatable benchmarking criteria to evaluate social robot navigation. Our\ncontributions include (a) a definition of a socially navigating robot as one\nthat respects the principles of safety, comfort, legibility, politeness, social\ncompetency, agent understanding, proactivity, and responsiveness to context,\n(b) guidelines for the use of metrics, development of scenarios, benchmarks,\ndatasets, and simulators to evaluate social navigation, and (c) a design of a\nsocial navigation metrics framework to make it easier to compare results from\ndifferent simulators, robots and datasets.\n","authors":["Anthony Francis","Claudia Pérez-D'Arpino","Chengshu Li","Fei Xia","Alexandre Alahi","Rachid Alami","Aniket Bera","Abhijat Biswas","Joydeep Biswas","Rohan Chandra","Hao-Tien Lewis Chiang","Michael Everett","Sehoon Ha","Justin Hart","Jonathan P. How","Haresh Karnan","Tsang-Wei Edward Lee","Luis J. Manso","Reuth Mirksy","Sören Pirk","Phani Teja Singamaneni","Peter Stone","Ada V. Taylor","Peter Trautman","Nathan Tsoi","Marynel Vázquez","Xuesu Xiao","Peng Xu","Naoki Yokoyama","Alexander Toshev","Roberto Martín-Martín"],"pdf_url":"https://arxiv.org/pdf/2306.16740v3.pdf","comment":"42 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2308.14930v1","updated":"2023-08-28T23:08:32Z","published":"2023-08-28T23:08:32Z","title":"Application of Quantum Pre-Processing Filter for Binary Image\n Classification with Small Samples","summary":" Over the past few years, there has been significant interest in Quantum\nMachine Learning (QML) among researchers, as it has the potential to transform\nthe field of machine learning. Several models that exploit the properties of\nquantum mechanics have been developed for practical applications. In this\nstudy, we investigated the application of our previously proposed quantum\npre-processing filter (QPF) to binary image classification. We evaluated the\nQPF on four datasets: MNIST (handwritten digits), EMNIST (handwritten digits\nand alphabets), CIFAR-10 (photographic images) and GTSRB (real-life traffic\nsign images). Similar to our previous multi-class classification results, the\napplication of QPF improved the binary image classification accuracy using\nneural network against MNIST, EMNIST, and CIFAR-10 from 98.9% to 99.2%, 97.8%\nto 98.3%, and 71.2% to 76.1%, respectively, but degraded it against GTSRB from\n93.5% to 92.0%. We then applied QPF in cases using a smaller number of training\nand testing samples, i.e. 80 and 20 samples per class, respectively. In order\nto derive statistically stable results, we conducted the experiment with 100\ntrials choosing randomly different training and testing samples and averaging\nthe results. The result showed that the application of QPF did not improve the\nimage classification accuracy against MNIST and EMNIST but improved it against\nCIFAR-10 and GTSRB from 65.8% to 67.2% and 90.5% to 91.8%, respectively.\nFurther research will be conducted as part of future work to investigate the\npotential of QPF to assess the scalability of the proposed approach to larger\nand complex datasets.\n","authors":["Farina Riaz","Shahab Abdulla","Hajime Suzuki","Srinjoy Ganguly","Ravinesh C. Deo","Susan Hopkins"],"pdf_url":"https://arxiv.org/pdf/2308.14930v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14929v1","updated":"2023-08-28T23:08:15Z","published":"2023-08-28T23:08:15Z","title":"Maestro: Uncovering Low-Rank Structures via Trainable Decomposition","summary":" Deep Neural Networks (DNNs) have been a large driver and enabler for AI\nbreakthroughs in recent years. These models have been getting larger in their\nattempt to become more accurate and tackle new upcoming use-cases, including\nAR/VR and intelligent assistants. However, the training process of such large\nmodels is a costly and time-consuming process, which typically yields a single\nmodel to fit all targets. To mitigate this, various techniques have been\nproposed in the literature, including pruning, sparsification or quantization\nof the model weights and updates. While able to achieve high compression rates,\nthey often incur computational overheads or accuracy penalties. Alternatively,\nfactorization methods have been leveraged to incorporate low-rank compression\nin the training process. Similarly, such techniques (e.g.,~SVD) frequently rely\non the computationally expensive decomposition of layers and are potentially\nsub-optimal for non-linear models, such as DNNs. In this work, we take a\nfurther step in designing efficient low-rank models and propose Maestro, a\nframework for trainable low-rank layers. Instead of regularly applying a priori\ndecompositions such as SVD, the low-rank structure is built into the training\nprocess through a generalized variant of Ordered Dropout. This method imposes\nan importance ordering via sampling on the decomposed DNN structure. Our\ntheoretical analysis demonstrates that our method recovers the SVD\ndecomposition of linear mapping on uniformly distributed data and PCA for\nlinear autoencoders. We further apply our technique on DNNs and empirically\nillustrate that Maestro enables the extraction of lower footprint models that\npreserve model performance while allowing for graceful accuracy-latency\ntradeoff for the deployment to devices of different capabilities.\n","authors":["Samuel Horvath","Stefanos Laskaridis","Shashank Rajput","Hongyi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14929v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2305.17118v2","updated":"2023-08-28T22:48:46Z","published":"2023-05-26T17:39:58Z","title":"Scissorhands: Exploiting the Persistence of Importance Hypothesis for\n LLM KV Cache Compression at Test Time","summary":" Large language models(LLMs) have sparked a new wave of exciting AI\napplications. Hosting these models at scale requires significant memory\nresources. One crucial memory bottleneck for the deployment stems from the\ncontext window. It is commonly recognized that model weights are memory hungry;\nhowever, the size of key-value embedding stored during the generation process\n(KV cache) can easily surpass the model size. The enormous size of the KV cache\nputs constraints on the inference batch size, which is crucial for high\nthroughput inference workload. Inspired by an interesting observation of the\nattention scores, we hypothesize the persistence of importance: only pivotal\ntokens, which had a substantial influence at one step, will significantly\ninfluence future generations. Based on our empirical verification and\ntheoretical analysis around this hypothesis, we propose Scissorhands, a system\nthat maintains the memory usage of the KV cache at a fixed budget without\nfinetuning the model. In essence, Scissorhands manages the KV cache by storing\nthe pivotal tokens with a higher probability. We validate that Scissorhands\nreduces the inference memory usage of the KV cache by up to 5X without\ncompromising model quality. We further demonstrate that Scissorhands can be\ncombined with 4-bit quantization, traditionally used to compress model weights,\nto achieve up to 20X compression.\n","authors":["Zichang Liu","Aditya Desai","Fangshuo Liao","Weitao Wang","Victor Xie","Zhaozhuo Xu","Anastasios Kyrillidis","Anshumali Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2305.17118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14924v1","updated":"2023-08-28T22:42:51Z","published":"2023-08-28T22:42:51Z","title":"Optimal Economic Gas Turbine Dispatch with Deep Reinforcement Learning","summary":" Dispatching strategies for gas turbines (GTs) are changing in modern\nelectricity grids. A growing incorporation of intermittent renewable energy\nrequires GTs to operate more but shorter cycles and more frequently on partial\nloads. Deep reinforcement learning (DRL) has recently emerged as a tool that\ncan cope with this development and dispatch GTs economically. The key\nadvantages of DRL are a model-free optimization and the ability to handle\nuncertainties, such as those introduced by varying loads or renewable energy\nproduction. In this study, three popular DRL algorithms are implemented for an\neconomic GT dispatch problem on a case study in Alberta, Canada. We highlight\nthe benefits of DRL by incorporating an existing thermodynamic software\nprovided by Siemens Energy into the environment model and by simulating\nuncertainty via varying electricity prices, loads, and ambient conditions.\nAmong the tested algorithms and baseline methods, Deep Q-Networks (DQN)\nobtained the highest rewards while Proximal Policy Optimization (PPO) was the\nmost sample efficient. We further propose and implement a method to assign GT\noperation and maintenance cost dynamically based on operating hours and cycles.\nCompared to existing methods, our approach better approximates the true cost of\nmodern GT dispatch and hence leads to more realistic policies.\n","authors":["Manuel Sage","Martin Staniszewski","Yaoyao Fiona Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.14924v1.pdf","comment":"This work has been accepted to IFAC for publication under a Creative\n Commons Licence CC-BY-NC-ND"},{"id":"http://arxiv.org/abs/2308.14921v1","updated":"2023-08-28T22:32:05Z","published":"2023-08-28T22:32:05Z","title":"Gender bias and stereotypes in Large Language Models","summary":" Large Language Models (LLMs) have made substantial progress in the past\nseveral months, shattering state-of-the-art benchmarks in many domains. This\npaper investigates LLMs' behavior with respect to gender stereotypes, a known\nissue for prior models. We use a simple paradigm to test the presence of gender\nbias, building on but differing from WinoBias, a commonly used gender bias\ndataset, which is likely to be included in the training data of current LLMs.\nWe test four recently published LLMs and demonstrate that they express biased\nassumptions about men and women's occupations. Our contributions in this paper\nare as follows: (a) LLMs are 3-6 times more likely to choose an occupation that\nstereotypically aligns with a person's gender; (b) these choices align with\npeople's perceptions better than with the ground truth as reflected in official\njob statistics; (c) LLMs in fact amplify the bias beyond what is reflected in\nperceptions or the ground truth; (d) LLMs ignore crucial ambiguities in\nsentence structure 95% of the time in our study items, but when explicitly\nprompted, they recognize the ambiguity; (e) LLMs provide explanations for their\nchoices that are factually inaccurate and likely obscure the true reason behind\ntheir predictions. That is, they provide rationalizations of their biased\nbehavior. This highlights a key property of these models: LLMs are trained on\nimbalanced datasets; as such, even with the recent successes of reinforcement\nlearning with human feedback, they tend to reflect those imbalances back at us.\nAs with other types of societal biases, we suggest that LLMs must be carefully\ntested to ensure that they treat minoritized individuals and communities\nequitably.\n","authors":["Hadas Kotek","Rikker Dockum","David Q. Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14921v1.pdf","comment":"ACM Collective Intelligence"},{"id":"http://arxiv.org/abs/2308.14920v1","updated":"2023-08-28T22:29:57Z","published":"2023-08-28T22:29:57Z","title":"Matbench Discovery -- An evaluation framework for machine learning\n crystal stability prediction","summary":" Matbench Discovery simulates the deployment of machine learning (ML) energy\nmodels in a high-throughput search for stable inorganic crystals. We address\nthe disconnect between (i) thermodynamic stability and formation energy and\n(ii) in-domain vs out-of-distribution performance. Alongside this paper, we\npublish a Python package to aid with future model submissions and a growing\nonline leaderboard with further insights into trade-offs between various\nperformance metrics. To answer the question which ML methodology performs best\nat materials discovery, our initial release explores a variety of models\nincluding random forests, graph neural networks (GNN), one-shot predictors,\niterative Bayesian optimizers and universal interatomic potentials (UIP).\nRanked best-to-worst by their test set F1 score on thermodynamic stability\nprediction, we find CHGNet > M3GNet > MACE > ALIGNN > MEGNet > CGCNN > CGCNN+P\n> Wrenformer > BOWSR > Voronoi tessellation fingerprints with random forest.\nThe top 3 models are UIPs, the winning methodology for ML-guided materials\ndiscovery, achieving F1 scores of ~0.6 for crystal stability classification and\ndiscovery acceleration factors (DAF) of up to 5x on the first 10k most stable\npredictions compared to dummy selection from our test set. We also highlight a\nsharp disconnect between commonly used global regression metrics and more\ntask-relevant classification metrics. Accurate regressors are susceptible to\nunexpectedly high false-positive rates if those accurate predictions lie close\nto the decision boundary at 0 eV/atom above the convex hull where most\nmaterials are. Our results highlight the need to focus on classification\nmetrics that actually correlate with improved stability hit rate.\n","authors":["Janosh Riebesell","Rhys E. A. Goodall","Anubhav Jain","Philipp Benner","Kristin A. Persson","Alpha A. Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14920v1.pdf","comment":"18 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.14919v1","updated":"2023-08-28T22:29:16Z","published":"2023-08-28T22:29:16Z","title":"On Reward Structures of Markov Decision Processes","summary":" A Markov decision process can be parameterized by a transition kernel and a\nreward function. Both play essential roles in the study of reinforcement\nlearning as evidenced by their presence in the Bellman equations. In our\ninquiry of various kinds of ``costs'' associated with reinforcement learning\ninspired by the demands in robotic applications, rewards are central to\nunderstanding the structure of a Markov decision process and reward-centric\nnotions can elucidate important concepts in reinforcement learning.\nSpecifically, we studied the sample complexity of policy evaluation and\ndeveloped a novel estimator with an instance-specific error bound of\n$\\tilde{O}(\\sqrt{\\frac{\\tau_s}{n}})$ for estimating a single state value. Under\nthe online regret minimization setting, we refined the transition-based MDP\nconstant, diameter, into a reward-based constant, maximum expected hitting\ncost, and with it, provided a theoretical explanation for how a well-known\ntechnique, potential-based reward shaping, could accelerate learning with\nexpert knowledge. In an attempt to study safe reinforcement learning, we\nmodeled hazardous environments with irrecoverability and proposed a\nquantitative notion of safe learning via reset efficiency. In this setting, we\nmodified a classic algorithm to account for resets achieving promising\npreliminary numerical results. Lastly, for MDPs with multiple reward functions,\nwe developed a planning algorithm that computationally efficiently finds Pareto\noptimal stochastic policies.\n","authors":["Falcon Z. Dai"],"pdf_url":"https://arxiv.org/pdf/2308.14919v1.pdf","comment":"This PhD thesis draws heavily from arXiv:1907.02114 and\n arXiv:2002.06299"},{"id":"http://arxiv.org/abs/2308.14916v1","updated":"2023-08-28T22:26:50Z","published":"2023-08-28T22:26:50Z","title":"RecRec: Algorithmic Recourse for Recommender Systems","summary":" Recommender systems play an essential role in the choices people make in\ndomains such as entertainment, shopping, food, news, employment, and education.\nThe machine learning models underlying these recommender systems are often\nenormously large and black-box in nature for users, content providers, and\nsystem developers alike. It is often crucial for all stakeholders to understand\nthe model's rationale behind making certain predictions and recommendations.\nThis is especially true for the content providers whose livelihoods depend on\nthe recommender system. Drawing motivation from the practitioners' need, in\nthis work, we propose a recourse framework for recommender systems, targeted\ntowards the content providers. Algorithmic recourse in the recommendation\nsetting is a set of actions that, if executed, would modify the recommendations\n(or ranking) of an item in the desired manner. A recourse suggests actions of\nthe form: \"if a feature changes X to Y, then the ranking of that item for a set\nof users will change to Z.\" Furthermore, we demonstrate that RecRec is highly\neffective in generating valid, sparse, and actionable recourses through an\nempirical evaluation of recommender systems trained on three real-world\ndatasets. To the best of our knowledge, this work is the first to conceptualize\nand empirically test a generalized framework for generating recourses for\nrecommender systems.\n","authors":["Sahil Verma","Ashudeep Singh","Varich Boonsanong","John P. Dickerson","Chirag Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14916v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2206.08464v2","updated":"2023-08-28T22:09:07Z","published":"2022-06-16T22:03:35Z","title":"PRANC: Pseudo RAndom Networks for Compacting deep models","summary":" We demonstrate that a deep model can be reparametrized as a linear\ncombination of several randomly initialized and frozen deep models in the\nweight space. During training, we seek local minima that reside within the\nsubspace spanned by these random models (i.e., `basis' networks). Our\nframework, PRANC, enables significant compaction of a deep model. The model can\nbe reconstructed using a single scalar `seed,' employed to generate the\npseudo-random `basis' networks, together with the learned linear mixture\ncoefficients.\n In practical applications, PRANC addresses the challenge of efficiently\nstoring and communicating deep models, a common bottleneck in several\nscenarios, including multi-agent learning, continual learners, federated\nsystems, and edge devices, among others. In this study, we employ PRANC to\ncondense image classification models and compress images by compacting their\nassociated implicit neural networks. PRANC outperforms baselines with a large\nmargin on image classification when compressing a deep model almost $100$\ntimes. Moreover, we show that PRANC enables memory-efficient inference by\ngenerating layer-wise weights on the fly. The source code of PRANC is here:\n\\url{https://github.com/UCDvision/PRANC}\n","authors":["Parsa Nooralinejad","Ali Abbasi","Soroush Abbasi Koohpayegani","Kossar Pourahmadi Meibodi","Rana Muhammad Shahroz Khan","Soheil Kolouri","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2206.08464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00270v2","updated":"2023-08-28T21:54:46Z","published":"2022-12-31T19:08:49Z","title":"Discovery and Exploitation of Generalized Network Effects","summary":" Given a large graph with few node labels, how can we (a) identify whether\nthere is generalized network-effects (GNE) of the graph or not, (b) estimate\nGNE to explain the interrelations among node classes, and (c) exploit GNE to\nimprove downstream tasks such as predicting the unknown labels accurately and\nefficiently? The knowledge of GNE is valuable for various tasks like node\nclassification and targeted advertising. However, identifying and understanding\nGNE such as homophily, heterophily or their combination is challenging in\nreal-world graphs due to limited availability of node labels and noisy edges.\nWe propose NetEffect, a graph mining approach to address the above issues,\nenjoying the following properties: (i) Principled: a statistical test to\ndetermine the presence of GNE in a graph with few node labels; (ii) General and\nExplainable: a closed-form solution to estimate the specific type of GNE\nobserved; and (iii) Accurate and Scalable: the integration of GNE for accurate\nand fast node classification. Applied on public, real-world graphs, NetEffect\ndiscovers the unexpected absence of GNE in numerous graphs, which previously\nthought to exhibit heterophily. Further, we show that incorporating GNE is\neffective on node classification. On a large real-world graph with 1.6M nodes\nand 22.3M edges, NetEffect achieves over 7 times speedup (14 minutes vs. 2\nhours) compared to most competitors.\n","authors":["Meng-Chieh Lee","Shubhranshu Shekhar","Jaemin Yoo","Christos Faloutsos"],"pdf_url":"https://arxiv.org/pdf/2301.00270v2.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2304.12534v2","updated":"2023-08-28T21:30:20Z","published":"2023-04-25T03:00:18Z","title":"Mobilizing Personalized Federated Learning in Infrastructure-Less and\n Heterogeneous Environments via Random Walk Stochastic ADMM","summary":" This paper explores the challenges of implementing Federated Learning (FL) in\npractical scenarios featuring isolated nodes with data heterogeneity, which can\nonly be connected to the server through wireless links in an\ninfrastructure-less environment. To overcome these challenges, we propose a\nnovel mobilizing personalized FL approach, which aims to facilitate mobility\nand resilience. Specifically, we develop a novel optimization algorithm called\nRandom Walk Stochastic Alternating Direction Method of Multipliers (RWSADMM).\nRWSADMM capitalizes on the server's random movement toward clients and\nformulates local proximity among their adjacent clients based on hard\ninequality constraints rather than requiring consensus updates or introducing\nbias via regularization methods. To mitigate the computational burden on the\nclients, an efficient stochastic solver of the approximated optimization\nproblem is designed in RWSADMM, which provably converges to the stationary\npoint almost surely in expectation. Our theoretical and empirical results\ndemonstrate the provable fast convergence and substantial accuracy improvements\nachieved by RWSADMM compared to baseline methods, along with its benefits of\nreduced communication costs and enhanced scalability.\n","authors":["Ziba Parsons","Fei Dou","Houyi Du","Zheng Song","Jin Lu"],"pdf_url":"https://arxiv.org/pdf/2304.12534v2.pdf","comment":"28 pages, 7 figures, 3 tables, 1 algorithm. Proof details are\n provided in the main body of the paper"},{"id":"http://arxiv.org/abs/2308.14909v1","updated":"2023-08-28T21:25:05Z","published":"2023-08-28T21:25:05Z","title":"Pruning Self-Attention for Zero-Shot Multi-Speaker Text-to-Speech","summary":" For personalized speech generation, a neural text-to-speech (TTS) model must\nbe successfully implemented with limited data from a target speaker. To this\nend, the baseline TTS model needs to be amply generalized to out-of-domain data\n(i.e., target speaker's speech). However, approaches to address this\nout-of-domain generalization problem in TTS have yet to be thoroughly studied.\nIn this work, we propose an effective pruning method for a transformer known as\nsparse attention, to improve the TTS model's generalization abilities. In\nparticular, we prune off redundant connections from self-attention layers whose\nattention weights are below the threshold. To flexibly determine the pruning\nstrength for searching optimal degree of generalization, we also propose a new\ndifferentiable pruning method that allows the model to automatically learn the\nthresholds. Evaluations on zero-shot multi-speaker TTS verify the effectiveness\nof our method in terms of voice quality and speaker similarity.\n","authors":["Hyungchan Yoon","Changhwan Kim","Eunwoo Song","Hyun-Wook Yoon","Hong-Goo Kang"],"pdf_url":"https://arxiv.org/pdf/2308.14909v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2308.14906v1","updated":"2023-08-28T21:17:12Z","published":"2023-08-28T21:17:12Z","title":"BayOTIDE: Bayesian Online Multivariate Time series Imputation with\n functional decomposition","summary":" In real-world scenarios like traffic and energy, massive time-series data\nwith missing values and noises are widely observed, even sampled irregularly.\nWhile many imputation methods have been proposed, most of them work with a\nlocal horizon, which means models are trained by splitting the long sequence\ninto batches of fit-sized patches. This local horizon can make models ignore\nglobal trends or periodic patterns. More importantly, almost all methods assume\nthe observations are sampled at regular time stamps, and fail to handle complex\nirregular sampled time series arising from different applications. Thirdly,\nmost existing methods are learned in an offline manner. Thus, it is not\nsuitable for many applications with fast-arriving streaming data. To overcome\nthese limitations, we propose \\ours: Bayesian Online Multivariate Time series\nImputation with functional decomposition. We treat the multivariate time series\nas the weighted combination of groups of low-rank temporal factors with\ndifferent patterns. We apply a group of Gaussian Processes (GPs) with different\nkernels as functional priors to fit the factors. For computational efficiency,\nwe further convert the GPs into a state-space prior by constructing an\nequivalent stochastic differential equation (SDE), and developing a scalable\nalgorithm for online inference. The proposed method can not only handle\nimputation over arbitrary time stamps, but also offer uncertainty\nquantification and interpretability for the downstream application. We evaluate\nour method on both synthetic and real-world datasets.\n","authors":["Shikai Fang","Qingsong Wen","Shandian Zhe","Liang Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06826v2","updated":"2023-08-28T21:14:35Z","published":"2023-06-12T02:26:00Z","title":"When Do Annotator Demographics Matter? Measuring the Influence of\n Annotator Demographics with the POPQUORN Dataset","summary":" Annotators are not fungible. Their demographics, life experiences, and\nbackgrounds all contribute to how they label data. However, NLP has only\nrecently considered how annotator identity might influence their decisions.\nHere, we present POPQUORN (the POtato-Prolific dataset for QUestion-Answering,\nOffensiveness, text Rewriting, and politeness rating with demographic Nuance).\nPOPQUORN contains 45,000 annotations from 1,484 annotators, drawn from a\nrepresentative sample regarding sex, age, and race as the US population.\nThrough a series of analyses, we show that annotators' background plays a\nsignificant role in their judgments. Further, our work shows that backgrounds\nnot previously considered in NLP (e.g., education), are meaningful and should\nbe considered. Our study suggests that understanding the background of\nannotators and collecting labels from a demographically balanced pool of crowd\nworkers is important to reduce the bias of datasets. The dataset, annotator\nbackground, and annotation interface are available at\nhttps://github.com/Jiaxin-Pei/potato-prolific-dataset .\n","authors":["Jiaxin Pei","David Jurgens"],"pdf_url":"https://arxiv.org/pdf/2306.06826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14904v1","updated":"2023-08-28T21:13:04Z","published":"2023-08-28T21:13:04Z","title":"Maturity-Aware Active Learning for Semantic Segmentation with\n Hierarchically-Adaptive Sample Assessment","summary":" Active Learning (AL) for semantic segmentation is challenging due to heavy\nclass imbalance and different ways of defining \"sample\" (pixels, areas, etc.),\nleaving the interpretation of the data distribution ambiguous. We propose\n\"Maturity-Aware Distribution Breakdown-based Active Learning'' (MADBAL), an AL\nmethod that benefits from a hierarchical approach to define a multiview data\ndistribution, which takes into account the different \"sample\" definitions\njointly, hence able to select the most impactful segmentation pixels with\ncomprehensive understanding. MADBAL also features a novel uncertainty\nformulation, where AL supporting modules are included to sense the features'\nmaturity whose weighted influence continuously contributes to the uncertainty\ndetection. In this way, MADBAL makes significant performance leaps even in the\nearly AL stage, hence reducing the training burden significantly. It\noutperforms state-of-the-art methods on Cityscapes and PASCAL VOC datasets as\nverified in our extensive experiments.\n","authors":["Amirsaeed Yazdani","Xuelu Li","Vishal Monga"],"pdf_url":"https://arxiv.org/pdf/2308.14904v1.pdf","comment":"Accepted to the 34th British Machine Vision Conference (BMVC 2023)"},{"id":"http://arxiv.org/abs/2002.08907v3","updated":"2023-08-28T21:10:38Z","published":"2020-02-20T17:52:18Z","title":"Second-order Conditional Gradient Sliding","summary":" Constrained second-order convex optimization algorithms are the method of\nchoice when a high accuracy solution to a problem is needed, due to their local\nquadratic convergence. These algorithms require the solution of a constrained\nquadratic subproblem at every iteration. We present the \\emph{Second-Order\nConditional Gradient Sliding} (SOCGS) algorithm, which uses a projection-free\nalgorithm to solve the constrained quadratic subproblems inexactly. When the\nfeasible region is a polytope the algorithm converges quadratically in primal\ngap after a finite number of linearly convergent iterations. Once in the\nquadratic regime the SOCGS algorithm requires $\\mathcal{O}(\\log(\\log\n1/\\varepsilon))$ first-order and Hessian oracle calls and $\\mathcal{O}(\\log\n(1/\\varepsilon) \\log(\\log1/\\varepsilon))$ linear minimization oracle calls to\nachieve an $\\varepsilon$-optimal solution. This algorithm is useful when the\nfeasible region can only be accessed efficiently through a linear optimization\noracle, and computing first-order information of the function, although\npossible, is costly.\n","authors":["Alejandro Carderera","Sebastian Pokutta"],"pdf_url":"https://arxiv.org/pdf/2002.08907v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14902v1","updated":"2023-08-28T21:08:06Z","published":"2023-08-28T21:08:06Z","title":"Ad-Rec: Advanced Feature Interactions to Address Covariate-Shifts in\n Recommendation Networks","summary":" Recommendation models are vital in delivering personalized user experiences\nby leveraging the correlation between multiple input features. However, deep\nlearning-based recommendation models often face challenges due to evolving user\nbehaviour and item features, leading to covariate shifts. Effective\ncross-feature learning is crucial to handle data distribution drift and\nadapting to changing user behaviour. Traditional feature interaction techniques\nhave limitations in achieving optimal performance in this context.\n This work introduces Ad-Rec, an advanced network that leverages feature\ninteraction techniques to address covariate shifts. This helps eliminate\nirrelevant interactions in recommendation tasks. Ad-Rec leverages masked\ntransformers to enable the learning of higher-order cross-features while\nmitigating the impact of data distribution drift. Our approach improves model\nquality, accelerates convergence, and reduces training time, as measured by the\nArea Under Curve (AUC) metric. We demonstrate the scalability of Ad-Rec and its\nability to achieve superior model quality through comprehensive ablation\nstudies.\n","authors":["Muhammad Adnan","Yassaman Ebrahimzadeh Maboud","Divya Mahajan","Prashant J. Nair"],"pdf_url":"https://arxiv.org/pdf/2308.14902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14897v1","updated":"2023-08-28T20:46:07Z","published":"2023-08-28T20:46:07Z","title":"Statistically Efficient Variance Reduction with Double Policy Estimation\n for Off-Policy Evaluation in Sequence-Modeled Reinforcement Learning","summary":" Offline reinforcement learning aims to utilize datasets of previously\ngathered environment-action interaction records to learn a policy without\naccess to the real environment. Recent work has shown that offline\nreinforcement learning can be formulated as a sequence modeling problem and\nsolved via supervised learning with approaches such as decision transformer.\nWhile these sequence-based methods achieve competitive results over\nreturn-to-go methods, especially on tasks that require longer episodes or with\nscarce rewards, importance sampling is not considered to correct the policy\nbias when dealing with off-policy data, mainly due to the absence of behavior\npolicy and the use of deterministic evaluation policies. To this end, we\npropose DPE: an RL algorithm that blends offline sequence modeling and offline\nreinforcement learning with Double Policy Estimation (DPE) in a unified\nframework with statistically proven properties on variance reduction. We\nvalidate our method in multiple tasks of OpenAI Gym with D4RL benchmarks. Our\nmethod brings a performance improvements on selected methods which outperforms\nSOTA baselines in several tasks, demonstrating the advantages of enabling\ndouble policy estimation for sequence-modeled reinforcement learning.\n","authors":["Hanhan Zhou","Tian Lan","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2308.14897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14895v1","updated":"2023-08-28T20:32:22Z","published":"2023-08-28T20:32:22Z","title":"Conformal Meta-learners for Predictive Inference of Individual Treatment\n Effects","summary":" We investigate the problem of machine learning-based (ML) predictive\ninference on individual treatment effects (ITEs). Previous work has focused\nprimarily on developing ML-based meta-learners that can provide point estimates\nof the conditional average treatment effect (CATE); these are model-agnostic\napproaches for combining intermediate nuisance estimates to produce estimates\nof CATE. In this paper, we develop conformal meta-learners, a general framework\nfor issuing predictive intervals for ITEs by applying the standard conformal\nprediction (CP) procedure on top of CATE meta-learners. We focus on a broad\nclass of meta-learners based on two-stage pseudo-outcome regression and develop\na stochastic ordering framework to study their validity. We show that inference\nwith conformal meta-learners is marginally valid if their (pseudo outcome)\nconformity scores stochastically dominate oracle conformity scores evaluated on\nthe unobserved ITEs. Additionally, we prove that commonly used CATE\nmeta-learners, such as the doubly-robust learner, satisfy a model- and\ndistribution-free stochastic (or convex) dominance condition, making their\nconformal inferences valid for practically-relevant levels of target coverage.\nWhereas existing procedures conduct inference on nuisance parameters (i.e.,\npotential outcomes) via weighted CP, conformal meta-learners enable direct\ninference on the target parameter (ITE). Numerical experiments show that\nconformal meta-learners provide valid intervals with competitive efficiency\nwhile retaining the favorable point estimation properties of CATE\nmeta-learners.\n","authors":["Ahmed Alaa","Zaid Ahmad","Mark van der Laan"],"pdf_url":"https://arxiv.org/pdf/2308.14895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14893v1","updated":"2023-08-28T20:30:10Z","published":"2023-08-28T20:30:10Z","title":"When hard negative sampling meets supervised contrastive learning","summary":" State-of-the-art image models predominantly follow a two-stage strategy:\npre-training on large datasets and fine-tuning with cross-entropy loss. Many\nstudies have shown that using cross-entropy can result in sub-optimal\ngeneralisation and stability. While the supervised contrastive loss addresses\nsome limitations of cross-entropy loss by focusing on intra-class similarities\nand inter-class differences, it neglects the importance of hard negative\nmining. We propose that models will benefit from performance improvement by\nweighting negative samples based on their dissimilarity to positive\ncounterparts. In this paper, we introduce a new supervised contrastive learning\nobjective, SCHaNe, which incorporates hard negative sampling during the\nfine-tuning phase. Without requiring specialized architectures, additional\ndata, or extra computational resources, experimental results indicate that\nSCHaNe outperforms the strong baseline BEiT-3 in Top-1 accuracy across various\nbenchmarks, with significant gains of up to $3.32\\%$ in few-shot learning\nsettings and $3.41\\%$ in full dataset fine-tuning. Importantly, our proposed\nobjective sets a new state-of-the-art for base models on ImageNet-1k, achieving\nan 86.14\\% accuracy. Furthermore, we demonstrate that the proposed objective\nyields better embeddings and explains the improved effectiveness observed in\nour experiments.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa","Zaiqiao Meng"],"pdf_url":"https://arxiv.org/pdf/2308.14893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15786v3","updated":"2023-08-28T20:29:41Z","published":"2023-05-25T07:01:02Z","title":"Theoretical Guarantees of Learning Ensembling Strategies with\n Applications to Time Series Forecasting","summary":" Ensembling is among the most popular tools in machine learning (ML) due to\nits effectiveness in minimizing variance and thus improving generalization.\nMost ensembling methods for black-box base learners fall under the umbrella of\n\"stacked generalization,\" namely training an ML algorithm that takes the\ninferences from the base learners as input. While stacking has been widely\napplied in practice, its theoretical properties are poorly understood. In this\npaper, we prove a novel result, showing that choosing the best stacked\ngeneralization from a (finite or finite-dimensional) family of stacked\ngeneralizations based on cross-validated performance does not perform \"much\nworse\" than the oracle best. Our result strengthens and significantly extends\nthe results in Van der Laan et al. (2007). Inspired by the theoretical\nanalysis, we further propose a particular family of stacked generalizations in\nthe context of probabilistic forecasting, each one with a different sensitivity\nfor how much the ensemble weights are allowed to vary across items, timestamps\nin the forecast horizon, and quantiles. Experimental results demonstrate the\nperformance gain of the proposed method.\n","authors":["Hilaf Hasson","Danielle C. Maddix","Yuyang Wang","Gaurav Gupta","Youngsuk Park"],"pdf_url":"https://arxiv.org/pdf/2305.15786v3.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2306.04930v2","updated":"2023-08-28T20:21:59Z","published":"2023-06-08T04:24:24Z","title":"When to Show a Suggestion? Integrating Human Feedback in AI-Assisted\n Programming","summary":" AI powered code-recommendation systems, such as Copilot and CodeWhisperer,\nprovide code suggestions inside a programmer's environment (e.g., an IDE) with\nthe aim to improve their productivity. Since, in these scenarios, programmers\naccept and reject suggestions, ideally, such a system should use this feedback\nin furtherance of this goal. In this work, we leverage prior data of\nprogrammers interacting with GitHub Copilot, a system used by millions of\nprogrammers, to develop interventions that can save programmer time. We propose\na utility theory framework, which models this interaction with programmers and\ndecides which suggestions to display. Our framework Conditional suggestion\nDisplay from Human Feedback (CDHF), relies on a cascade of models that predict\nsuggestion acceptance to selectively hide suggestions reducing both latency and\nprogrammer verification time. Using data from 535 programmers, we perform a\nretrospective evaluation of CDHF and show that we can avoid displaying a\nsignificant fraction of suggestions that would have been rejected doing so\nwithout total knowledge of the suggestions themselves. We further demonstrate\nthe importance of incorporating the programmer's latent unobserved state in\ndeciding when to display suggestions through ablations on user study data.\nFinally, we showcase that using suggestion acceptance as a reward signal to\nknow which suggestions to display leads to reduced quality suggestions\nindicating an unexpected pitfall.\n","authors":["Hussein Mozannar","Gagan Bansal","Adam Fourney","Eric Horvitz"],"pdf_url":"https://arxiv.org/pdf/2306.04930v2.pdf","comment":"Previous version of these results can be found in arXiv:2210.14306"},{"id":"http://arxiv.org/abs/2305.19370v3","updated":"2023-08-28T20:13:33Z","published":"2023-05-30T19:25:51Z","title":"Blockwise Parallel Transformer for Large Context Models","summary":" Transformers have emerged as the cornerstone of state-of-the-art natural\nlanguage processing models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands posed by the\nself-attention mechanism and the large feedforward network in Transformers\nlimit their ability to handle long sequences, thereby creating challenges for\ntasks involving multiple long sequences or long-term dependencies. We present a\ndistinct approach, Blockwise Parallel Transformer (BPT), that leverages\nblockwise computation of self-attention and feedforward network fusion to\nminimize memory costs. By processing longer input sequences while maintaining\nmemory efficiency, BPT enables training sequences 32 times longer than vanilla\nTransformers and up to 4 times longer than previous memory-efficient methods.\nExtensive experiments on language modeling and reinforcement learning tasks\ndemonstrate the effectiveness of BPT in reducing memory requirements and\nimproving performance.\n","authors":["Hao Liu","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2305.19370v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.14524v1","updated":"2023-08-28T12:19:46Z","published":"2023-08-28T12:19:46Z","title":"Towards enabling reliable immersive teleoperation through Digital Twin:\n A UAV command and control use case","summary":" This paper addresses the challenging problem of enabling reliable immersive\nteleoperation in scenarios where an Unmanned Aerial Vehicle (UAV) is remotely\ncontrolled by an operator via a cellular network. Such scenarios can be quite\ncritical particularly when the UAV lacks advanced equipment (e.g., Lidar-based\nauto stop) or when the network is subject to some performance constraints\n(e.g., delay). To tackle these challenges, we propose a novel architecture\nleveraging Digital Twin (DT) technology to create a virtual representation of\nthe physical environment. This virtual environment accurately mirrors the\nphysical world, accounting for 3D surroundings, weather constraints, and\nnetwork limitations. To enhance teleoperation, the UAV in the virtual\nenvironment is equipped with advanced features that maybe absent in the real\nUAV. Furthermore, the proposed architecture introduces an intelligent logic\nthat utilizes information from both virtual and physical environments to\napprove, deny, or correct actions initiated by the UAV operator. This\nanticipatory approach helps to mitigate potential risks. Through a series of\nfield trials, we demonstrate the effectiveness of the proposed architecture in\nsignificantly improving the reliability of UAV teleoperation.\n","authors":["Nassim Sehad","Xinyi Tu","Akash Rajasekaran","Hamed Hellaoui","Riku Jäntti","Mérouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2308.14524v1.pdf","comment":"Accepted by IEEE Globecom 2023"},{"id":"http://arxiv.org/abs/2308.14480v1","updated":"2023-08-28T10:40:16Z","published":"2023-08-28T10:40:16Z","title":"Priority-Centric Human Motion Generation in Discrete Latent Space","summary":" Text-to-motion generation is a formidable task, aiming to produce human\nmotions that align with the input text while also adhering to human\ncapabilities and physical laws. While there have been advancements in diffusion\nmodels, their application in discrete spaces remains underexplored. Current\nmethods often overlook the varying significance of different motions, treating\nthem uniformly. It is essential to recognize that not all motions hold the same\nrelevance to a particular textual description. Some motions, being more salient\nand informative, should be given precedence during generation. In response, we\nintroduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which\nutilizes a Transformer-based VQ-VAE to derive a concise, discrete motion\nrepresentation, incorporating a global self-attention mechanism and a\nregularization term to counteract code collapse. We also present a motion\ndiscrete diffusion model that employs an innovative noise schedule, determined\nby the significance of each motion token within the entire motion sequence.\nThis approach retains the most salient motions during the reverse diffusion\nprocess, leading to more semantically rich and varied motions. Additionally, we\nformulate two strategies to gauge the importance of motion tokens, drawing from\nboth textual and visual indicators. Comprehensive experiments on the HumanML3D\nand KIT-ML datasets confirm that our model surpasses existing techniques in\nfidelity and diversity, particularly for intricate textual descriptions.\n","authors":["Hanyang Kong","Kehong Gong","Dongze Lian","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14480v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14395v1","updated":"2023-08-28T08:20:30Z","published":"2023-08-28T08:20:30Z","title":"UMMAFormer: A Universal Multimodal-adaptive Transformer Framework for\n Temporal Forgery Localization","summary":" The emergence of artificial intelligence-generated content (AIGC) has raised\nconcerns about the authenticity of multimedia content in various fields.\nHowever, existing research for forgery content detection has focused mainly on\nbinary classification tasks of complete videos, which has limited applicability\nin industrial settings. To address this gap, we propose UMMAFormer, a novel\nuniversal transformer framework for temporal forgery localization (TFL) that\npredicts forgery segments with multimodal adaptation. Our approach introduces a\nTemporal Feature Abnormal Attention (TFAA) module based on temporal feature\nreconstruction to enhance the detection of temporal differences. We also design\na Parallel Cross-Attention Feature Pyramid Network (PCA-FPN) to optimize the\nFeature Pyramid Network (FPN) for subtle feature enhancement. To evaluate the\nproposed method, we contribute a novel Temporal Video Inpainting Localization\n(TVIL) dataset specifically tailored for video inpainting scenes. Our\nexperiments show that our approach achieves state-of-the-art performance on\nbenchmark datasets, including Lav-DF, TVIL, and Psynd, significantly\noutperforming previous methods. The code and data are available at\nhttps://github.com/ymhzyj/UMMAFormer/.\n","authors":["Rui Zhang","Hongxia Wang","Mingshan Du","Hanqing Liu","Yang Zhou","Qiang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.14395v1.pdf","comment":"11 pages, 8 figures, 66 references. This paper has been accepted for\n ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.14316v1","updated":"2023-08-28T05:38:43Z","published":"2023-08-28T05:38:43Z","title":"UniPT: Universal Parallel Tuning for Transfer Learning with Efficient\n Parameter and Memory","summary":" Fine-tuning pre-trained models has emerged as a powerful technique in\nnumerous domains, owing to its ability to leverage enormous pre-existing\nknowledge and achieve remarkable performance on downstream tasks. However,\nupdating the parameters of entire networks is computationally intensive.\nAlthough state-of-the-art parameter-efficient transfer learning (PETL) methods\nsignificantly reduce the trainable parameters and storage demand, almost all of\nthem still need to back-propagate the gradients through large pre-trained\nnetworks. This memory-extensive characteristic extremely limits the\napplicability of PETL methods in real-world scenarios. To this end, we propose\na new memory-efficient PETL strategy, dubbed Universal Parallel Tuning (UniPT).\nSpecifically, we facilitate the transfer process via a lightweight learnable\nparallel network, which consists of two modules: 1) A parallel interaction\nmodule that decouples the inherently sequential connections and processes the\nintermediate activations detachedly of the pre-trained network. 2) A confidence\naggregation module that learns optimal strategies adaptively for integrating\ncross-layer features. We evaluate UniPT with different backbones (e.g.,\nVSE$\\infty$, CLIP4Clip, Clip-ViL, and MDETR) on five challenging\nvision-and-language tasks (i.e., image-text retrieval, video-text retrieval,\nvisual question answering, compositional question answering, and visual\ngrounding). Extensive ablations on ten datasets have validated that our UniPT\ncan not only dramatically reduce memory consumption and outperform the best\nmemory-efficient competitor, but also achieve higher performance than existing\nPETL methods in a low-memory scenario on different architectures. Our code is\npublicly available at: https://github.com/Paranioar/UniPT.\n","authors":["Haiwen Diao","Bo Wan","Ying Zhang","Xu Jia","Huchuan Lu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14316v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.06725v2","updated":"2023-08-28T04:27:35Z","published":"2023-08-13T09:05:56Z","title":"CLE Diffusion: Controllable Light Enhancement Diffusion Model","summary":" Low light enhancement has gained increasing importance with the rapid\ndevelopment of visual creation and editing. However, most existing enhancement\nalgorithms are designed to homogeneously increase the brightness of images to a\npre-defined extent, limiting the user experience. To address this issue, we\npropose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a\nnovel diffusion framework to provide users with rich controllability. Built\nwith a conditional diffusion model, we introduce an illumination embedding to\nlet users control their desired brightness level. Additionally, we incorporate\nthe Segment-Anything Model (SAM) to enable user-friendly region\ncontrollability, where users can click on objects to specify the regions they\nwish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves\ncompetitive performance regarding quantitative metrics, qualitative results,\nand versatile controllability. Project page:\nhttps://yuyangyin.github.io/CLEDiffusion/\n","authors":["Yuyang Yin","Dejia Xu","Chuangchuang Tan","Ping Liu","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2308.06725v2.pdf","comment":"Accepted In Proceedings of the 31st ACM International Conference on\n Multimedia (MM' 23)"},{"id":"http://arxiv.org/abs/2308.14274v1","updated":"2023-08-28T03:13:27Z","published":"2023-08-28T03:13:27Z","title":"Parameter-Efficient Transfer Learning for Audio-Visual-Language Tasks","summary":" The pretrain-then-finetune paradigm has been widely used in various unimodal\nand multimodal tasks. However, finetuning all the parameters of a pre-trained\nmodel becomes prohibitive as the model size grows exponentially. To address\nthis issue, the adapter mechanism that freezes the pre-trained model and only\nfinetunes a few extra parameters is introduced and delivers promising results.\nMost studies on adapter architectures are dedicated to unimodal or bimodal\ntasks, while the adapter architectures for trimodal tasks have not been\ninvestigated yet. This paper introduces a novel Long Short-Term Trimodal\nAdapter (LSTTA) approach for video understanding tasks involving audio, visual,\nand language modalities. Based on the pre-trained from the three modalities,\nthe designed adapter module is inserted between the sequential blocks to model\nthe dense interactions across the three modalities. Specifically, LSTTA\nconsists of two types of complementary adapter modules, namely the long-term\nsemantic filtering module and the short-term semantic interaction module. The\nlong-term semantic filtering aims to characterize the temporal importance of\nthe video frames and the short-term semantic interaction module models local\ninteractions within short periods. Compared to previous state-of-the-art\ntrimodal learning methods pre-trained on a large-scale trimodal corpus, LSTTA\nis more flexible and can inherit any powerful unimodal or bimodal models.\nExperimental results on four typical trimodal learning tasks show the\neffectiveness of LSTTA over existing state-of-the-art methods.\n","authors":["Hongye Liu","Xianhai Xie","Yang Gao","Size Li","Zhou YU"],"pdf_url":"https://arxiv.org/pdf/2308.14274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14263v1","updated":"2023-08-28T02:38:17Z","published":"2023-08-28T02:38:17Z","title":"Cross-Modal Retrieval: A Systematic Review of Methods and Future\n Directions","summary":" With the exponential surge in diverse multi-modal data, traditional uni-modal\nretrieval methods struggle to meet the needs of users demanding access to data\nfrom various modalities. To address this, cross-modal retrieval has emerged,\nenabling interaction across modalities, facilitating semantic matching, and\nleveraging complementarity and consistency between different modal data.\nAlthough prior literature undertook a review of the cross-modal retrieval\nfield, it exhibits numerous deficiencies pertaining to timeliness, taxonomy,\nand comprehensiveness. This paper conducts a comprehensive review of\ncross-modal retrieval's evolution, spanning from shallow statistical analysis\ntechniques to vision-language pre-training models. Commencing with a\ncomprehensive taxonomy grounded in machine learning paradigms, mechanisms, and\nmodels, the paper then delves deeply into the principles and architectures\nunderpinning existing cross-modal retrieval methods. Furthermore, it offers an\noverview of widely used benchmarks, metrics, and performances. Lastly, the\npaper probes the prospects and challenges that confront contemporary\ncross-modal retrieval, while engaging in a discourse on potential directions\nfor further progress in the field. To facilitate the research on cross-modal\nretrieval, we develop an open-source code repository at\nhttps://github.com/BMC-SDNU/Cross-Modal-Retrieval.\n","authors":["Lei Zhu","Tianshi Wang","Fengling Li","Jingjing Li","Zheng Zhang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14263v1.pdf","comment":null}]},"2023-08-27T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.10335v2","updated":"2023-08-27T23:06:07Z","published":"2023-08-20T18:36:28Z","title":"A Study on Robustness and Reliability of Large Language Model Code\n Generation","summary":" Recently, the large language models (LLMs) have shown extraordinary ability\nin understanding natural language and generating programming code. It has been\na common practice of software engineers to consult LLMs when encountering\ncoding questions. Although efforts have been made to avoid syntax errors and\nalign the code with the intended semantics, the reliability and robustness of\nthe code generationfrom LLMs have not yet been thoroughly studied. The\nexecutable code is not equivalent to the reliable and robust code, especially\nin the context of real-world software development. The misuse of APIs in the\ngenerated code could lead to severe problem, such as resource leaks, program\ncrashes. To make things worse, the users of LLM code generation services are\nactually the developers that are most vulnerable to these code that seems right\n-- They are always novice developers that are not familiar with the APIs that\nLLMs generate code for them. Therefore, they could hardly tell the misuse in\nthe code generated by LLMs, which further facilitates the incorrect code\napplied in real-world software. Existing code evaluation benchmark and datasets\nfocus on crafting small tasks such as programming questions in coding\ninterviews, which however deviates from the problem that developers would ask\nLLM for real-world coding help. To fill the missing piece, in this work, we\npropose a dataset RobustAPI for evaluating the reliability and robustness of\ncode generated by LLMs. We collect 1208 coding questions from StackOverflow on\n24 representative Java APIs. We summarize thecommon misuse patterns of these\nAPIs and evaluate them oncurrent popular LLMs. The evaluation results show that\nevenfor GPT-4, 62% of the generated code contains API misuses,which would cause\nunexpected consequences if the code isintroduced into real-world software.\n","authors":["Li Zhong","Zilong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14217v1","updated":"2023-08-27T22:35:27Z","published":"2023-08-27T22:35:27Z","title":"Generations of Knowledge Graphs: The Crazy Ideas and the Business Impact","summary":" Knowledge Graphs (KGs) have been used to support a wide range of\napplications, from web search to personal assistant. In this paper, we describe\nthree generations of knowledge graphs: entity-based KGs, which have been\nsupporting general search and question answering (e.g., at Google and Bing);\ntext-rich KGs, which have been supporting search and recommendations for\nproducts, bio-informatics, etc. (e.g., at Amazon and Alibaba); and the emerging\nintegration of KGs and LLMs, which we call dual neural KGs. We describe the\ncharacteristics of each generation of KGs, the crazy ideas behind the scenes in\nconstructing such KGs, and the techniques developed over time to enable\nindustry impact. In addition, we use KGs as examples to demonstrate a recipe to\nevolve research ideas from innovations to production practice, and then to the\nnext level of innovations, to advance both science and business.\n","authors":["Xin Luna Dong"],"pdf_url":"https://arxiv.org/pdf/2308.14217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14199v1","updated":"2023-08-27T20:24:33Z","published":"2023-08-27T20:24:33Z","title":"Symbolic and Language Agnostic Large Language Models","summary":" We argue that the relative success of large language models (LLMs) is not a\nreflection on the symbolic vs. subsymbolic debate but a reflection on employing\nan appropriate strategy of bottom-up reverse engineering of language at scale.\nHowever, due to the subsymbolic nature of these models whatever knowledge these\nsystems acquire about language will always be buried in millions of\nmicrofeatures (weights) none of which is meaningful on its own. Moreover, and\ndue to their stochastic nature, these models will often fail in capturing\nvarious inferential aspects that are prevalent in natural language. What we\nsuggest here is employing the successful bottom-up strategy in a symbolic\nsetting, producing symbolic, language agnostic and ontologically grounded large\nlanguage models.\n","authors":["Walid S. Saba"],"pdf_url":"https://arxiv.org/pdf/2308.14199v1.pdf","comment":"4 pages - draft. arXiv admin note: substantial text overlap with\n arXiv:2306.00017"},{"id":"http://arxiv.org/abs/2308.14186v1","updated":"2023-08-27T19:22:12Z","published":"2023-08-27T19:22:12Z","title":"Empowering Cross-lingual Abilities of Instruction-tuned Large Language\n Models by Translation-following demonstrations","summary":" The language ability of Large Language Models (LLMs) is often unbalanced\ntowards English because of the imbalance in the distribution of the\npre-training data. This disparity is demanded in further fine-tuning and\naffecting the cross-lingual abilities of LLMs. In this paper, we propose to\nempower Instructiontuned LLMs (It-LLMs) in languages other than English by\nbuilding semantic alignment between them. Hence, we propose CrossAlpaca, an\nIt-LLM with cross-lingual instruction-following and Translation-following\ndemonstrations to improve semantic alignment between languages. We validate our\napproach on the multilingual Question Answering (QA) benchmarks XQUAD and MLQA\nand adapted versions of MMLU and BBH. Our models, tested over six different\nlanguages, outperform the It-LLMs tuned on monolingual data. The final results\nshow that instruction tuning on non-English data is not enough and that\nsemantic alignment can be further improved by Translation-following\ndemonstrations.\n","authors":["Leonardo Ranaldi","Giulia Pucci","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2308.14186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14182v1","updated":"2023-08-27T19:03:12Z","published":"2023-08-27T19:03:12Z","title":"Generative AI for Business Strategy: Using Foundation Models to Create\n Business Strategy Tools","summary":" Generative models (foundation models) such as LLMs (large language models)\nare having a large impact on multiple fields. In this work, we propose the use\nof such models for business decision making. In particular, we combine\nunstructured textual data sources (e.g., news data) with multiple foundation\nmodels (namely, GPT4, transformer-based Named Entity Recognition (NER) models\nand Entailment-based Zero-shot Classifiers (ZSC)) to derive IT (information\ntechnology) artifacts in the form of a (sequence of) signed business networks.\nWe posit that such artifacts can inform business stakeholders about the state\nof the market and their own positioning as well as provide quantitative\ninsights into improving their future outlook.\n","authors":["Son The Nguyen","Theja Tulabandhula"],"pdf_url":"https://arxiv.org/pdf/2308.14182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14179v1","updated":"2023-08-27T18:46:47Z","published":"2023-08-27T18:46:47Z","title":"Towards Vision-Language Mechanistic Interpretability: A Causal Tracing\n Tool for BLIP","summary":" Mechanistic interpretability seeks to understand the neural mechanisms that\nenable specific behaviors in Large Language Models (LLMs) by leveraging\ncausality-based methods. While these approaches have identified neural circuits\nthat copy spans of text, capture factual knowledge, and more, they remain\nunusable for multimodal models since adapting these tools to the\nvision-language domain requires considerable architectural changes. In this\nwork, we adapt a unimodal causal tracing tool to BLIP to enable the study of\nthe neural mechanisms underlying image-conditioned text generation. We\ndemonstrate our approach on a visual question answering dataset, highlighting\nthe causal relevance of later layer representations for all tokens.\nFurthermore, we release our BLIP causal tracing tool as open source to enable\nfurther experimentation in vision-language mechanistic interpretability by the\ncommunity. Our code is available at\nhttps://github.com/vedantpalit/Towards-Vision-Language-Mechanistic-Interpretability.\n","authors":["Vedant Palit","Rohan Pandey","Aryaman Arora","Paul Pu Liang"],"pdf_url":"https://arxiv.org/pdf/2308.14179v1.pdf","comment":"Final version for 5th Workshop on Closing the Loop Between Vision and\n Language (CLVL) @ ICCV 2023. 4 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.14149v1","updated":"2023-08-27T16:14:19Z","published":"2023-08-27T16:14:19Z","title":"Examining User-Friendly and Open-Sourced Large GPT Models: A Survey on\n Language, Multimodal, and Scientific GPT Models","summary":" Generative pre-trained transformer (GPT) models have revolutionized the field\nof natural language processing (NLP) with remarkable performance in various\ntasks and also extend their power to multimodal domains. Despite their success,\nlarge GPT models like GPT-4 face inherent limitations such as considerable\nsize, high computational requirements, complex deployment processes, and closed\ndevelopment loops. These constraints restrict their widespread adoption and\nraise concerns regarding their responsible development and usage. The need for\nuser-friendly, relatively small, and open-sourced alternative GPT models arises\nfrom the desire to overcome these limitations while retaining high performance.\nIn this survey paper, we provide an examination of alternative open-sourced\nmodels of large GPTs, focusing on user-friendly and relatively small models\nthat facilitate easier deployment and accessibility. Through this extensive\nsurvey, we aim to equip researchers, practitioners, and enthusiasts with a\nthorough understanding of user-friendly and relatively small open-sourced\nmodels of large GPTs, their current state, challenges, and future research\ndirections, inspiring the development of more efficient, accessible, and\nversatile GPT models that cater to the broader scientific community and advance\nthe field of general artificial intelligence. The source contents are\ncontinuously updating in https://github.com/GPT-Alternatives/gpt_alternatives.\n","authors":["Kaiyuan Gao","Sunan He","Zhenyu He","Jiacheng Lin","QiZhi Pei","Jie Shao","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14132v1","updated":"2023-08-27T15:20:06Z","published":"2023-08-27T15:20:06Z","title":"Detecting Language Model Attacks with Perplexity","summary":" A novel hack involving Large Language Models (LLMs) has emerged, leveraging\nadversarial suffixes to trick models into generating perilous responses. This\nmethod has garnered considerable attention from reputable media outlets such as\nthe New York Times and Wired, thereby influencing public perception regarding\nthe security and safety of LLMs. In this study, we advocate the utilization of\nperplexity as one of the means to recognize such potential attacks. The\nunderlying concept behind these hacks revolves around appending an unusually\nconstructed string of text to a harmful query that would otherwise be blocked.\nThis maneuver confuses the protective mechanisms and tricks the model into\ngenerating a forbidden response. Such scenarios could result in providing\ndetailed instructions to a malicious user for constructing explosives or\norchestrating a bank heist. Our investigation demonstrates the feasibility of\nemploying perplexity, a prevalent natural language processing metric, to detect\nthese adversarial tactics before generating a forbidden response. By evaluating\nthe perplexity of queries with and without such adversarial suffixes using an\nopen-source LLM, we discovered that nearly 90 percent were above a perplexity\nof 1000. This contrast underscores the efficacy of perplexity for detecting\nthis type of exploit.\n","authors":["Gabriel Alon","Michael Kamfonas"],"pdf_url":"https://arxiv.org/pdf/2308.14132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14120v1","updated":"2023-08-27T14:28:38Z","published":"2023-08-27T14:28:38Z","title":"Empowering Clinicians and Democratizing Data Science: Large Language\n Models Automate Machine Learning for Clinical Studies","summary":" A knowledge gap persists between Machine Learning (ML) developers (e.g., data\nscientists) and practitioners (e.g., clinicians), hampering the full\nutilization of ML for clinical data analysis. We investigated the potential of\nthe chatGPT Code Interpreter (CI), an extension of GPT-4, to bridge this gap\nand perform ML analyses efficiently. Real-world clinical datasets and study\ndetails from large trials across various medical specialties were presented to\nchatGPT CI without specific guidance. ChatGPT CI autonomously developed\nstate-of-the-art ML models based on the original study's training data to\npredict clinical outcomes such as cancer development, cancer progression,\ndisease complications, or biomarkers such as pathogenic gene sequences.\nStrikingly, these ML models matched or outperformed their published\ncounterparts. We conclude that chatGPT CI offers a promising avenue to\ndemocratize ML in medicine, making advanced analytics accessible to non-ML\nexperts and promoting broader applications in medical research and practice.\n","authors":["Soroosh Tayebi Arasteh","Tianyu Han","Mahshad Lotfinia","Christiane Kuhl","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.14120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14115v1","updated":"2023-08-27T14:14:28Z","published":"2023-08-27T14:14:28Z","title":"Situated Natural Language Explanations","summary":" Natural language is among the most accessible tools for explaining decisions\nto humans, and large pretrained language models (PLMs) have demonstrated\nimpressive abilities to generate coherent natural language explanations (NLE).\nThe existing NLE research perspectives do not take the audience into account.\nAn NLE can have high textual quality, but it might not accommodate audiences'\nneeds and preference. To address this limitation, we propose an alternative\nperspective, situated NLE, including a situated generation framework and a\nsituated evaluation framework. On the generation side, we propose simple prompt\nengineering methods that adapt the NLEs to situations. In human studies, the\nannotators preferred the situated NLEs. On the evaluation side, we set up\nautomated evaluation scores in lexical, semantic, and pragmatic categories. The\nscores can be used to select the most suitable prompts to generate NLEs.\nSituated NLE provides a perspective to conduct further research on automatic\nNLE generations.\n","authors":["Zining Zhu","Haoming Jiang","Jingfeng Yang","Sreyashi Nag","Chao Zhang","Jie Huang","Yifan Gao","Frank Rudzicz","Bing Yin"],"pdf_url":"https://arxiv.org/pdf/2308.14115v1.pdf","comment":"A previous version was presented in ACL 2023 NLRSE workshop"},{"id":"http://arxiv.org/abs/2308.14089v1","updated":"2023-08-27T12:24:39Z","published":"2023-08-27T12:24:39Z","title":"MedAlign: A Clinician-Generated Dataset for Instruction Following with\n Electronic Medical Records","summary":" The ability of large language models (LLMs) to follow natural language\ninstructions with human-level fluency suggests many opportunities in healthcare\nto reduce administrative burden and improve quality of care. However,\nevaluating LLMs on realistic text generation tasks for healthcare remains\nchallenging. Existing question answering datasets for electronic health record\n(EHR) data fail to capture the complexity of information needs and\ndocumentation burdens experienced by clinicians. To address these challenges,\nwe introduce MedAlign, a benchmark dataset of 983 natural language instructions\nfor EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes\nclinician-written reference responses for 303 instructions, and provides 276\nlongitudinal EHRs for grounding instruction-response pairs. We used MedAlign to\nevaluate 6 general domain LLMs, having clinicians rank the accuracy and quality\nof each LLM response. We found high error rates, ranging from 35% (GPT-4) to\n68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k\ncontext lengths for GPT-4. Finally, we report correlations between clinician\nrankings and automated natural language generation metrics as a way to rank\nLLMs without human review. We make MedAlign available under a research data use\nagreement to enable LLM evaluations on tasks aligned with clinician needs and\npreferences.\n","authors":["Scott L. Fleming","Alejandro Lozano","William J. Haberkorn","Jenelle A. Jindal","Eduardo P. Reis","Rahul Thapa","Louis Blankemeier","Julian Z. Genkins","Ethan Steinberg","Ashwin Nayak","Birju S. Patel","Chia-Chun Chiang","Alison Callahan","Zepeng Huo","Sergios Gatidis","Scott J. Adams","Oluseyi Fayanju","Shreya J. Shah","Thomas Savage","Ethan Goh","Akshay S. Chaudhari","Nima Aghaeepour","Christopher Sharp","Michael A. Pfeffer","Percy Liang","Jonathan H. Chen","Keith E. Morse","Emma P. Brunskill","Jason A. Fries","Nigam H. Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14077v1","updated":"2023-08-27T11:51:27Z","published":"2023-08-27T11:51:27Z","title":"An Analysis of On-the-fly Determinization of Finite-state Automata","summary":" In this paper we establish an abstraction of on-the-fly determinization of\nfinite-state automata using transition monoids and demonstrate how it can be\napplied to bound the asymptotics. We present algebraic and combinatorial\nproperties that are sufficient for a polynomial state complexity of the\ndeterministic automaton constructed on-the-fly. A special case of our findings\nis that automata with many non-deterministic transitions almost always admit a\ndeterminization of polynomial complexity. Furthermore, we extend our ideas to\nweighted finite-state automata.\n","authors":["Ivan Baburin","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2308.14077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07711v3","updated":"2023-08-27T11:21:38Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v3.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2308.14034v1","updated":"2023-08-27T07:53:00Z","published":"2023-08-27T07:53:00Z","title":"Confucius: Iterative Tool Learning from Introspection Feedback by\n Easy-to-Difficult Curriculum","summary":" Augmenting large language models (LLMs) with external tools has emerged as a\npromising approach to extending the capability of LLMs. Although some works\nemploy open-source LLMs for the tool learning task, most of them are trained in\na controlled environment in which LLMs only learn to execute the human-provided\ntools. However, selecting proper tools from the large toolset is also a crucial\nability for the tool learning model to be applied in real-world applications.\nExisting methods usually directly employ self-instruction methods to train the\nmodel, which ignores differences in tool complexity. In this paper, we propose\nthe Confucius, a novel tool learning framework to train LLM to use complicated\ntools in real-world scenarios, which contains two main phases: (1) We first\npropose a multi-stage learning method to teach the LLM to use various tools\nfrom an easy-to-difficult curriculum; (2) thenceforth, we propose the Iterative\nSelf-instruct from Introspective Feedback (ISIF) to dynamically construct the\ndataset to improve the ability to use the complicated tool. Extensive\nexperiments conducted on both controlled and real-world settings demonstrate\nthe superiority of our tool learning framework in the real-world application\nscenarios compared to both tuning-free (e.g. ChatGPT, Claude) and tuning-based\nbaselines (e.g. GPT4Tools).\n","authors":["Shen Gao","Zhengliang Shi","Minghang Zhu","Bowen Fang","Xin Xin","Pengjie Ren","Zhumin Chen","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2308.14034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06624v2","updated":"2023-08-27T02:55:36Z","published":"2023-06-11T08:53:12Z","title":"RestGPT: Connecting Large Language Models with Real-World RESTful APIs","summary":" Tool-augmented large language models (LLMs) have achieved remarkable progress\nin tackling a broad range of tasks. However, existing methods are mainly\nrestricted to specifically designed tools and fail to fulfill complex\ninstructions, having great limitations when confronted with real-world\nscenarios. In this paper, we explore a more realistic scenario by connecting\nLLMs with RESTful APIs, which adhere to the widely adopted REST software\narchitectural style for web service development. To address the practical\nchallenges of tackling complex instructions, we propose RestGPT, which exploits\nthe power of LLMs and conducts a coarse-to-fine online planning mechanism to\nenhance the abilities of task decomposition and API selection. RestGPT also\ncontains an API executor tailored for calling RESTful APIs, which can\nmeticulously formulate parameters and parse API responses. To fully evaluate\nthe performance of RestGPT, we propose RestBench, a high-quality benchmark\nwhich consists of two real-world scenarios and human-annotated instructions\nwith gold solution paths. Experiments show that RestGPT is able to achieve\nimpressive results in complex tasks and has strong robustness, which paves a\nnew way towards AGI. RestGPT and RestBench is publicly available at\nhttps://restgpt.github.io/.\n","authors":["Yifan Song","Weimin Xiong","Dawei Zhu","Wenhao Wu","Han Qian","Mingbo Song","Hailiang Huang","Cheng Li","Ke Wang","Rong Yao","Ye Tian","Sujian Li"],"pdf_url":"https://arxiv.org/pdf/2306.06624v2.pdf","comment":"Add RestBench to evaluate RestGPT"},{"id":"http://arxiv.org/abs/2209.07562v3","updated":"2023-08-27T02:42:16Z","published":"2022-09-15T19:01:21Z","title":"TwHIN-BERT: A Socially-Enriched Pre-trained Language Model for\n Multilingual Tweet Representations at Twitter","summary":" Pre-trained language models (PLMs) are fundamental for natural language\nprocessing applications. Most existing PLMs are not tailored to the noisy\nuser-generated text on social media, and the pre-training does not factor in\nthe valuable social engagement logs available in a social network. We present\nTwHIN-BERT, a multilingual language model productionized at Twitter, trained on\nin-domain data from the popular social network. TwHIN-BERT differs from prior\npre-trained language models as it is trained with not only text-based\nself-supervision, but also with a social objective based on the rich social\nengagements within a Twitter heterogeneous information network (TwHIN). Our\nmodel is trained on 7 billion tweets covering over 100 distinct languages,\nproviding a valuable representation to model short, noisy, user-generated text.\nWe evaluate our model on various multilingual social recommendation and\nsemantic understanding tasks and demonstrate significant metric improvement\nover established pre-trained language models. We open-source TwHIN-BERT and our\ncurated hashtag prediction and social engagement benchmark datasets to the\nresearch community.\n","authors":["Xinyang Zhang","Yury Malkov","Omar Florez","Serim Park","Brian McWilliams","Jiawei Han","Ahmed El-Kishky"],"pdf_url":"https://arxiv.org/pdf/2209.07562v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01488v3","updated":"2023-08-27T02:38:30Z","published":"2022-12-02T23:43:18Z","title":"Event knowledge in large language models: the gap between the impossible\n and the unlikely","summary":" Word co-occurrence patterns in language corpora contain a surprising amount\nof conceptual knowledge. Large language models (LLMs), trained to predict words\nin context, leverage these patterns to achieve impressive performance on\ndiverse semantic tasks requiring world knowledge. An important but understudied\nquestion about LLMs' semantic abilities is whether they acquire generalized\nknowledge of common events. Here, we test whether five pre-trained LLMs (from\n2018's BERT to 2023's MPT) assign higher likelihood to plausible descriptions\nof agent-patient interactions than to minimally different implausible versions\nof the same event. Using three curated sets of minimal sentence pairs (total\nn=1,215), we found that pre-trained LLMs possess substantial event knowledge,\noutperforming other distributional language models. In particular, they almost\nalways assign higher likelihood to possible vs. impossible events (The teacher\nbought the laptop vs. The laptop bought the teacher). However, LLMs show less\nconsistent preferences for likely vs. unlikely events (The nanny tutored the\nboy vs. The boy tutored the nanny). In follow-up analyses, we show that (i) LLM\nscores are driven by both plausibility and surface-level sentence features,\n(ii) LLM scores generalize well across syntactic variants (active vs. passive\nconstructions) but less well across semantic variants (synonymous sentences),\n(iii) some LLM errors mirror human judgment ambiguity, and (iv) sentence\nplausibility serves as an organizing dimension in internal LLM representations.\nOverall, our results show that important aspects of event knowledge naturally\nemerge from distributional linguistic patterns, but also highlight a gap\nbetween representations of possible/impossible and likely/unlikely events.\n","authors":["Carina Kauf","Anna A. Ivanova","Giulia Rambelli","Emmanuele Chersoni","Jingyuan Selena She","Zawad Chowdhury","Evelina Fedorenko","Alessandro Lenci"],"pdf_url":"https://arxiv.org/pdf/2212.01488v3.pdf","comment":"The two lead authors have contributed equally to this work"},{"id":"http://arxiv.org/abs/2302.03162v3","updated":"2023-08-27T00:13:09Z","published":"2023-02-06T23:42:03Z","title":"Protecting Language Generation Models via Invisible Watermarking","summary":" Language generation models have been an increasingly powerful enabler for\nmany applications. Many such models offer free or affordable API access, which\nmakes them potentially vulnerable to model extraction attacks through\ndistillation. To protect intellectual property (IP) and ensure fair use of\nthese models, various techniques such as lexical watermarking and synonym\nreplacement have been proposed. However, these methods can be nullified by\nobvious countermeasures such as \"synonym randomization\". To address this issue,\nwe propose GINSEW, a novel method to protect text generation models from being\nstolen through distillation. The key idea of our method is to inject secret\nsignals into the probability vector of the decoding steps for each target\ntoken. We can then detect the secret message by probing a suspect model to tell\nif it is distilled from the protected one. Experimental results show that\nGINSEW can effectively identify instances of IP infringement with minimal\nimpact on the generation quality of protected APIs. Our method demonstrates an\nabsolute improvement of 19 to 29 points on mean average precision (mAP) in\ndetecting suspects compared to previous methods against watermark removal\nattacks.\n","authors":["Xuandong Zhao","Yu-Xiang Wang","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2302.03162v3.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.14763v1","updated":"2023-08-27T07:35:30Z","published":"2023-08-27T07:35:30Z","title":"VoiceBank-2023: A Multi-Speaker Mandarin Speech Corpus for Constructing\n Personalized TTS Systems for the Speech Impaired","summary":" Services of personalized TTS systems for the Mandarin-speaking speech\nimpaired are rarely mentioned. Taiwan started the VoiceBanking project in 2020,\naiming to build a complete set of services to deliver personalized Mandarin TTS\nsystems to amyotrophic lateral sclerosis patients. This paper reports the\ncorpus design, corpus recording, data purging and correction for the corpus,\nand evaluations of the developed personalized TTS systems, for the VoiceBanking\nproject. The developed corpus is named after the VoiceBank-2023 speech corpus\nbecause of its release year. The corpus contains 29.78 hours of utterances with\nprompts of short paragraphs and common phrases spoken by 111 native Mandarin\nspeakers. The corpus is labeled with information about gender, degree of speech\nimpairment, types of users, transcription, SNRs, and speaking rates. The\nVoiceBank-2023 is available by request for non-commercial use and welcomes all\nparties to join the VoiceBanking project to improve the services for the speech\nimpaired.\n","authors":["Jia-Jyu Su","Pang-Chen Liao","Yen-Ting Lin","Wu-Hao Li","Guan-Ting Liou","Cheng-Che Kao","Wei-Cheng Chen","Jen-Chieh Chiang","Wen-Yang Chang","Pin-Han Lin","Chen-Yu Chiang"],"pdf_url":"https://arxiv.org/pdf/2308.14763v1.pdf","comment":"submitted to 26th International Conference of the ORIENTAL-COCOSDA"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.14221v1","updated":"2023-08-27T22:45:24Z","published":"2023-08-27T22:45:24Z","title":"High-Resolution Document Shadow Removal via A Large-Scale Real-World\n Dataset and A Frequency-Aware Shadow Erasing Net","summary":" Shadows often occur when we capture the documents with casual equipment,\nwhich influences the visual quality and readability of the digital copies.\nDifferent from the algorithms for natural shadow removal, the algorithms in\ndocument shadow removal need to preserve the details of fonts and figures in\nhigh-resolution input. Previous works ignore this problem and remove the\nshadows via approximate attention and small datasets, which might not work in\nreal-world situations. We handle high-resolution document shadow removal\ndirectly via a larger-scale real-world dataset and a carefully designed\nfrequency-aware network. As for the dataset, we acquire over 7k couples of\nhigh-resolution (2462 x 3699) images of real-world document pairs with various\nsamples under different lighting circumstances, which is 10 times larger than\nexisting datasets. As for the design of the network, we decouple the\nhigh-resolution images in the frequency domain, where the low-frequency details\nand high-frequency boundaries can be effectively learned via the carefully\ndesigned network structure. Powered by our network and dataset, the proposed\nmethod clearly shows a better performance than previous methods in terms of\nvisual quality and numerical results. The code, models, and dataset are\navailable at: https://github.com/CXH-Research/DocShadow-SD7K\n","authors":["Zinuo Li","Xuhang Chen","Chi-Man Pun","Xiaodong Cun"],"pdf_url":"https://arxiv.org/pdf/2308.14221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14213v1","updated":"2023-08-27T22:07:42Z","published":"2023-08-27T22:07:42Z","title":"Post-Hoc Explainability of BI-RADS Descriptors in a Multi-task Framework\n for Breast Cancer Detection and Segmentation","summary":" Despite recent medical advancements, breast cancer remains one of the most\nprevalent and deadly diseases among women. Although machine learning-based\nComputer-Aided Diagnosis (CAD) systems have shown potential to assist\nradiologists in analyzing medical images, the opaque nature of the\nbest-performing CAD systems has raised concerns about their trustworthiness and\ninterpretability. This paper proposes MT-BI-RADS, a novel explainable deep\nlearning approach for tumor detection in Breast Ultrasound (BUS) images. The\napproach offers three levels of explanations to enable radiologists to\ncomprehend the decision-making process in predicting tumor malignancy. Firstly,\nthe proposed model outputs the BI-RADS categories used for BUS image analysis\nby radiologists. Secondly, the model employs multi-task learning to\nconcurrently segment regions in images that correspond to tumors. Thirdly, the\nproposed approach outputs quantified contributions of each BI-RADS descriptor\ntoward predicting the benign or malignant class using post-hoc explanations\nwith Shapley Values.\n","authors":["Mohammad Karimzadeh","Aleksandar Vakanski","Min Xian","Boyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14213v1.pdf","comment":"11 pages, 5 figures. Published at 2023 IEEE Workshop on MLSP"},{"id":"http://arxiv.org/abs/2308.05695v2","updated":"2023-08-27T22:05:35Z","published":"2023-08-10T16:57:14Z","title":"Masked Diffusion as Self-supervised Representation Learner","summary":" Denoising diffusion probabilistic models have recently demonstrated\nstate-of-the-art generative performance and been used as strong pixel-level\nrepresentation learners. This paper decomposes the interrelation between the\ngenerative capability and representation learning ability inherent in diffusion\nmodels. We present masked diffusion model (MDM), a scalable self-supervised\nrepresentation learner that substitutes the conventional additive Gaussian\nnoise of traditional diffusion with a masking mechanism. Our proposed approach\nconvincingly surpasses prior benchmarks, demonstrating remarkable advancements\nin both medical and natural image semantic segmentation tasks, particularly\nwithin the context of few-shot scenario.\n","authors":["Zixuan Pan","Jianxu Chen","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2308.05695v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14212v1","updated":"2023-08-27T22:02:41Z","published":"2023-08-27T22:02:41Z","title":"Exploring the Transfer Learning Capabilities of CLIP in Domain\n Generalization for Diabetic Retinopathy","summary":" Diabetic Retinopathy (DR), a leading cause of vision impairment, requires\nearly detection and treatment. Developing robust AI models for DR\nclassification holds substantial potential, but a key challenge is ensuring\ntheir generalization in unfamiliar domains with varying data distributions. To\naddress this, our paper investigates cross-domain generalization, also known as\ndomain generalization (DG), within the context of DR classification. DG, a\nchallenging problem in the medical domain, is complicated by the difficulty of\ngathering labeled data across different domains, such as patient demographics\nand disease stages. Some recent studies have shown the effectiveness of using\nCLIP to handle the DG problem in natural images. In this study, we investigate\nCLIP's transfer learning capabilities and its potential for cross-domain\ngeneralization in diabetic retinopathy (DR) classification. We carry out\ncomprehensive experiments to assess the efficacy and potential of CLIP in\naddressing DG for DR classification. Further, we introduce a multi-modal\nfine-tuning strategy named Context Optimization with Learnable Visual Tokens\n(CoOpLVT), which enhances context optimization by conditioning on visual\nfeatures. Our findings demonstrate that the proposed method increases the\nF1-score by 1.8% over the baseline, thus underlining its promise for effective\nDG in DR classification. Our code is publicly available at\nhttps://github.com/Sanoojan/CLIP-DRDG.\n","authors":["Sanoojan Baliah","Fadillah A. Maani","Santosh Sanjeev","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2308.14212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00646v2","updated":"2023-08-27T20:24:37Z","published":"2022-11-01T00:40:09Z","title":"Learning Melanocytic Cell Masks from Adjacent Stained Tissue","summary":" Melanoma is one of the most aggressive forms of skin cancer, causing a large\nproportion of skin cancer deaths. However, melanoma diagnoses by pathologists\nshows low interrater reliability. As melanoma is a cancer of the melanocyte,\nthere is a clear need to develop a melanocytic cell segmentation tool that is\nagnostic to pathologist variability and automates pixel-level annotation.\nGigapixel-level pathologist labeling, however, is impractical. Herein, we\npropose a means to train deep neural networks for melanocytic cell segmentation\nfrom hematoxylin and eosin (H&E) stained slides using paired\nimmunohistochemical (IHC) slides of adjacent tissue sections, achieving a mean\nIOU of 0.64 despite imperfect ground-truth labels.\n","authors":["Mikio Tada","Maria L. Wei","Michael J. Keiser"],"pdf_url":"https://arxiv.org/pdf/2211.00646v2.pdf","comment":"{Medical Image Learning with Limited & Noisy Data Workshop at MICCAI\n 2022"},{"id":"http://arxiv.org/abs/2308.14191v1","updated":"2023-08-27T19:44:44Z","published":"2023-08-27T19:44:44Z","title":"SketchDreamer: Interactive Text-Augmented Creative Sketch Ideation","summary":" Artificial Intelligence Generated Content (AIGC) has shown remarkable\nprogress in generating realistic images. However, in this paper, we take a step\n\"backward\" and address AIGC for the most rudimentary visual modality of human\nsketches. Our objective is on the creative nature of sketches, and that\ncreative sketching should take the form of an interactive process. We further\nenable text to drive the sketch ideation process, allowing creativity to be\nfreely defined, while simultaneously tackling the challenge of \"I can't\nsketch\". We present a method to generate controlled sketches using a\ntext-conditioned diffusion model trained on pixel representations of images.\nOur proposed approach, referred to as SketchDreamer, integrates a\ndifferentiable rasteriser of Bezier curves that optimises an initial input to\ndistil abstract semantic knowledge from a pretrained diffusion model. We\nutilise Score Distillation Sampling to learn a sketch that aligns with a given\ncaption, which importantly enable both text and sketch to interact with the\nideation process. Our objective is to empower non-professional users to create\nsketches and, through a series of optimisation processes, transform a narrative\ninto a storyboard by expanding the text prompt while making minor adjustments\nto the sketch input. Through this work, we hope to aspire the way we create\nvisual content, democratise the creative process, and inspire further research\nin enhancing human creativity in AIGC. The code is available at\n\\url{https://github.com/WinKawaks/SketchDreamer}.\n","authors":["Zhiyu Qu","Tao Xiang","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2308.14191v1.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2308.14190v1","updated":"2023-08-27T19:43:43Z","published":"2023-08-27T19:43:43Z","title":"Score-Based Generative Models for PET Image Reconstruction","summary":" Score-based generative models have demonstrated highly promising results for\nmedical image reconstruction tasks in magnetic resonance imaging or computed\ntomography. However, their application to Positron Emission Tomography (PET) is\nstill largely unexplored. PET image reconstruction involves a variety of\nchallenges, including Poisson noise with high variance and a wide dynamic\nrange. To address these challenges, we propose several PET-specific adaptations\nof score-based generative models. The proposed framework is developed for both\n2D and 3D PET. In addition, we provide an extension to guided reconstruction\nusing magnetic resonance images. We validate the approach through extensive 2D\nand 3D $\\textit{in-silico}$ experiments with a model trained on\npatient-realistic data without lesions, and evaluate on data without lesions as\nwell as out-of-distribution data with lesions. This demonstrates the proposed\nmethod's robustness and significant potential for improved PET reconstruction.\n","authors":["Imraj RD Singh","Alexander Denker","Riccardo Barbano","Željko Kereta","Bangti Jin","Kris Thielemans","Peter Maass","Simon Arridge"],"pdf_url":"https://arxiv.org/pdf/2308.14190v1.pdf","comment":"35 pages, 16 figures, submitted to Journal of Machine Learning for\n Biomedical Imaging (MELBA)"},{"id":"http://arxiv.org/abs/2212.02053v3","updated":"2023-08-27T19:41:53Z","published":"2022-12-05T06:14:23Z","title":"Day2Dark: Pseudo-Supervised Activity Recognition beyond Silent Daylight","summary":" This paper strives to recognize activities in the dark, as well as in the\nday. We first establish that state-of-the-art activity recognizers are\neffective during the day, but not trustworthy in the dark. The main causes are\nthe limited availability of labeled dark videos to learn from, as well as the\ndistribution shift towards the lower color contrast at test-time. To compensate\nfor the lack of labeled dark videos, we introduce a pseudo-supervised learning\nscheme, which utilizes easy to obtain unlabeled and task-irrelevant dark videos\nto improve an activity recognizer in low light. As the lower color contrast\nresults in visual information loss, we further propose to incorporate the\ncomplementary activity information within audio, which is invariant to\nillumination. Since the usefulness of audio and visual features differs\ndepending on the amount of illumination, we introduce our `darkness-adaptive'\naudio-visual recognizer. Experiments on EPIC-Kitchens, Kinetics-Sound, and\nCharades demonstrate our proposals are superior to image enhancement, domain\nadaptation and alternative audio-visual fusion methods, and can even improve\nrobustness to local darkness caused by occlusions. Project page:\nhttps://xiaobai1217.github.io/Day2Dark/\n","authors":["Yunhua Zhang","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2212.02053v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.14179v1","updated":"2023-08-27T18:46:47Z","published":"2023-08-27T18:46:47Z","title":"Towards Vision-Language Mechanistic Interpretability: A Causal Tracing\n Tool for BLIP","summary":" Mechanistic interpretability seeks to understand the neural mechanisms that\nenable specific behaviors in Large Language Models (LLMs) by leveraging\ncausality-based methods. While these approaches have identified neural circuits\nthat copy spans of text, capture factual knowledge, and more, they remain\nunusable for multimodal models since adapting these tools to the\nvision-language domain requires considerable architectural changes. In this\nwork, we adapt a unimodal causal tracing tool to BLIP to enable the study of\nthe neural mechanisms underlying image-conditioned text generation. We\ndemonstrate our approach on a visual question answering dataset, highlighting\nthe causal relevance of later layer representations for all tokens.\nFurthermore, we release our BLIP causal tracing tool as open source to enable\nfurther experimentation in vision-language mechanistic interpretability by the\ncommunity. Our code is available at\nhttps://github.com/vedantpalit/Towards-Vision-Language-Mechanistic-Interpretability.\n","authors":["Vedant Palit","Rohan Pandey","Aryaman Arora","Paul Pu Liang"],"pdf_url":"https://arxiv.org/pdf/2308.14179v1.pdf","comment":"Final version for 5th Workshop on Closing the Loop Between Vision and\n Language (CLVL) @ ICCV 2023. 4 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.14177v1","updated":"2023-08-27T18:38:57Z","published":"2023-08-27T18:38:57Z","title":"AIGC for Various Data Modalities: A Survey","summary":" AI-generated content (AIGC) methods aim to produce text, images, videos, 3D\nassets, and other media using AI algorithms. Due to its wide range of\napplications and the demonstrated potential of recent works, AIGC developments\nhave been attracting a lot of attention recently, and AIGC methods have been\ndeveloped for various data modalities, such as image, video, text, 3D shape (as\nvoxels, point clouds, meshes, and neural implicit fields), 3D scene, 3D human\navatar (body and head), 3D motion, and audio -- each presenting different\ncharacteristics and challenges. Furthermore, there have also been many\nsignificant developments in cross-modality AIGC methods, where generative\nmethods can receive conditioning input in one modality and produce outputs in\nanother. Examples include going from various modalities to image, video, 3D\nshape, 3D scene, 3D avatar (body and head), 3D motion (skeleton and avatar),\nand audio modalities. In this paper, we provide a comprehensive review of AIGC\nmethods across different data modalities, including both single-modal and\ncross-modality methods, highlighting the various challenges, representative\nworks, and recent technical directions in each setting. We also present\ncomparative results on several benchmark datasets in various modalities.\nMoreover, we also discuss the challenges and potential future research\ndirections.\n","authors":["Lin Geng Foo","Hossein Rahmani","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.14177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08244v2","updated":"2023-08-27T18:23:01Z","published":"2022-11-14T03:51:12Z","title":"Artificial Intelligence for Automatic Detection and Classification\n Disease on the X-Ray Images","summary":" Detecting and classifying diseases using X-ray images is one of the more\nchallenging core tasks in the medical and research world. Due to the recent\nhigh interest in radiological images and AI, early detection of diseases in\nX-ray images has become notably more essential to prevent further spreading and\nflatten the curve. Innovations and revolutions of Computer Vision with Deep\nlearning methods offer great promise for fast and accurate diagnosis of\nscreening and detection from chest X-ray images (CXR). This work presents rapid\ndetection of diseases in the lung using the efficient Deep learning pre-trained\nRepVGG algorithm for deep feature extraction and classification. We used X-ray\nimages as an example to show the model's efficiency. To perform this task, we\nclassify X-Ray images into Covid-19, Pneumonia, and Normal X-Ray images. Employ\nROI object to improve the detection accuracy for lung extraction, followed by\ndata pre-processing and augmentation. We are applying Artificial Intelligence\ntechnology for automatic highlighted detection of affected areas of people's\nlungs. Based on the X-Ray images, an algorithm was developed that classifies\nX-Ray images with height accuracy and power faster thanks to the architecture\ntransformation of the model. We compared deep learning frameworks' accuracy and\ndetection of disease. The study shows the high power of deep learning methods\nfor X-ray images based on COVID-19 detection utilizing chest X-rays. The\nproposed framework offers better diagnostic accuracy by comparing popular deep\nlearning models, i.e., VGG, ResNet50, inceptionV3, DenseNet, and\nInceptionResnetV2.\n","authors":["Liora Mayats-Alpay"],"pdf_url":"https://arxiv.org/pdf/2211.08244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.00785v5","updated":"2023-08-27T18:01:10Z","published":"2022-01-03T18:05:52Z","title":"Implicit Autoencoder for Point-Cloud Self-Supervised Representation\n Learning","summary":" This paper advocates the use of implicit surface representation in\nautoencoder-based self-supervised 3D representation learning. The most popular\nand accessible 3D representation, i.e., point clouds, involves discrete samples\nof the underlying continuous 3D surface. This discretization process introduces\nsampling variations on the 3D shape, making it challenging to develop\ntransferable knowledge of the true 3D geometry. In the standard autoencoding\nparadigm, the encoder is compelled to encode not only the 3D geometry but also\ninformation on the specific discrete sampling of the 3D shape into the latent\ncode. This is because the point cloud reconstructed by the decoder is\nconsidered unacceptable unless there is a perfect mapping between the original\nand the reconstructed point clouds. This paper introduces the Implicit\nAutoEncoder (IAE), a simple yet effective method that addresses the sampling\nvariation issue by replacing the commonly-used point-cloud decoder with an\nimplicit decoder. The implicit decoder reconstructs a continuous representation\nof the 3D shape, independent of the imperfections in the discrete samples.\nExtensive experiments demonstrate that the proposed IAE achieves\nstate-of-the-art performance across various self-supervised learning\nbenchmarks.\n","authors":["Siming Yan","Zhenpei Yang","Haoxiang Li","Chen Song","Li Guan","Hao Kang","Gang Hua","Qixing Huang"],"pdf_url":"https://arxiv.org/pdf/2201.00785v5.pdf","comment":"Published in ICCV 2023. The code is available at\n https://github.com/SimingYan/IAE"},{"id":"http://arxiv.org/abs/2308.14161v1","updated":"2023-08-27T17:44:25Z","published":"2023-08-27T17:44:25Z","title":"Intergrated Segmentation and Detection Models for Dentex Challenge 2023","summary":" Dental panoramic x-rays are commonly used in dental diagnosing. With the\ndevelopment of deep learning, auto detection of diseases from dental panoramic\nx-rays can help dentists to diagnose diseases more efficiently.The Dentex\nChallenge 2023 is a competition for automatic detection of abnormal teeth along\nwith their enumeration ids from dental panoramic x-rays. In this paper, we\npropose a method integrating segmentation and detection models to detect\nabnormal teeth as well as obtain their enumeration ids.Our codes are available\nat https://github.com/xyzlancehe/DentexSegAndDet.\n","authors":["Lanshan He","Yusheng Liu","Lisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14160v1","updated":"2023-08-27T17:30:56Z","published":"2023-08-27T17:30:56Z","title":"A Unified Transformer-based Network for multimodal Emotion Recognition","summary":" The development of transformer-based models has resulted in significant\nadvances in addressing various vision and NLP-based research challenges.\nHowever, the progress made in transformer-based methods has not been\neffectively applied to biosensing research. This paper presents a novel Unified\nBiosensor-Vision Multi-modal Transformer-based (UBVMT) method to classify\nemotions in an arousal-valence space by combining a 2D representation of an\nECG/PPG signal with the face information. To achieve this goal, we first\ninvestigate and compare the unimodal emotion recognition performance of three\nimage-based representations of the ECG/PPG signal. We then present our UBVMT\nnetwork which is trained to perform emotion recognition by combining the 2D\nimage-based representation of the ECG/PPG signal and the facial expression\nfeatures. Our unified transformer model consists of homogeneous transformer\nblocks that take as an input the 2D representation of the ECG/PPG signal and\nthe corresponding face frame for emotion representation learning with minimal\nmodality-specific design. Our UBVMT model is trained by reconstructing masked\npatches of video frames and 2D images of ECG/PPG signals, and contrastive\nmodeling to align face and ECG/PPG data. Extensive experiments on the\nMAHNOB-HCI and DEAP datasets show that our Unified UBVMT-based model produces\ncomparable results to the state-of-the-art techniques.\n","authors":["Kamran Ali","Charles E. Hughes"],"pdf_url":"https://arxiv.org/pdf/2308.14160v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2204.05905v2","updated":"2023-08-27T16:55:27Z","published":"2022-04-12T16:05:10Z","title":"Few-shot Forgery Detection via Guided Adversarial Interpolation","summary":" The increase in face manipulation models has led to a critical issue in\nsociety - the synthesis of realistic visual media. With the emergence of new\nforgery approaches at an unprecedented rate, existing forgery detection methods\nsuffer from significant performance drops when applied to unseen novel forgery\napproaches. In this work, we address the few-shot forgery detection problem by\n1) designing a comprehensive benchmark based on coverage analysis among various\nforgery approaches, and 2) proposing Guided Adversarial Interpolation (GAI).\nOur key insight is that there exist transferable distribution characteristics\nbetween majority and minority forgery classes1. Specifically, we enhance the\ndiscriminative ability against novel forgery approaches via adversarially\ninterpolating the forgery artifacts of the minority samples to the majority\nsamples under the guidance of a teacher network. Unlike the standard\nre-balancing method which usually results in over-fitting to minority classes,\nour method simultaneously takes account of the diversity of majority\ninformation as well as the significance of minority information. Extensive\nexperiments demonstrate that our GAI achieves state-of-the-art performances on\nthe established few-shot forgery detection benchmark. Notably, our method is\nalso validated to be robust to choices of majority and minority forgery\napproaches. The formal publication version is available in Pattern Recognition.\n","authors":["Haonan Qiu","Siyu Chen","Bei Gan","Kun Wang","Huafeng Shi","Jing Shao","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2204.05905v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12856v3","updated":"2023-08-27T16:46:38Z","published":"2022-08-26T20:08:40Z","title":"Local Context-Aware Active Domain Adaptation","summary":" Active Domain Adaptation (ADA) queries the labels of a small number of\nselected target samples to help adapting a model from a source domain to a\ntarget domain. The local context of queried data is important, especially when\nthe domain gap is large. However, this has not been fully explored by existing\nADA works. In this paper, we propose a Local context-aware ADA framework, named\nLADA, to address this issue. To select informative target samples, we devise a\nnovel criterion based on the local inconsistency of model predictions. Since\nthe labeling budget is usually small, fine-tuning model on only queried data\ncan be inefficient. We progressively augment labeled target data with the\nconfident neighbors in a class-balanced manner. Experiments validate that the\nproposed criterion chooses more informative target samples than existing active\nselection strategies. Furthermore, our full method clearly surpasses recent ADA\narts on various benchmarks. Code is available at https://github.com/tsun/LADA.\n","authors":["Tao Sun","Cheng Lu","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2208.12856v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14153v1","updated":"2023-08-27T16:33:11Z","published":"2023-08-27T16:33:11Z","title":"Sparse Sampling Transformer with Uncertainty-Driven Ranking for Unified\n Removal of Raindrops and Rain Streaks","summary":" In the real world, image degradations caused by rain often exhibit a\ncombination of rain streaks and raindrops, thereby increasing the challenges of\nrecovering the underlying clean image. Note that the rain streaks and raindrops\nhave diverse shapes, sizes, and locations in the captured image, and thus\nmodeling the correlation relationship between irregular degradations caused by\nrain artifacts is a necessary prerequisite for image deraining. This paper aims\nto present an efficient and flexible mechanism to learn and model degradation\nrelationships in a global view, thereby achieving a unified removal of\nintricate rain scenes. To do so, we propose a Sparse Sampling Transformer based\non Uncertainty-Driven Ranking, dubbed UDR-S2Former. Compared to previous\nmethods, our UDR-S2Former has three merits. First, it can adaptively sample\nrelevant image degradation information to model underlying degradation\nrelationships. Second, explicit application of the uncertainty-driven ranking\nstrategy can facilitate the network to attend to degradation features and\nunderstand the reconstruction process. Finally, experimental results show that\nour UDR-S2Former clearly outperforms state-of-the-art methods for all\nbenchmarks.\n","authors":["Sixiang Chen","Tian Ye","Jinbin Bai","Erkang Chen","Jun Shi","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14153v1.pdf","comment":"Accepted by ICCV'23"},{"id":"http://arxiv.org/abs/2308.14152v1","updated":"2023-08-27T16:22:09Z","published":"2023-08-27T16:22:09Z","title":"Unaligned 2D to 3D Translation with Conditional Vector-Quantized Code\n Diffusion using Transformers","summary":" Generating 3D images of complex objects conditionally from a few 2D views is\na difficult synthesis problem, compounded by issues such as domain gap and\ngeometric misalignment. For instance, a unified framework such as Generative\nAdversarial Networks cannot achieve this unless they explicitly define both a\ndomain-invariant and geometric-invariant joint latent distribution, whereas\nNeural Radiance Fields are generally unable to handle both issues as they\noptimize at the pixel level. By contrast, we propose a simple and novel 2D to\n3D synthesis approach based on conditional diffusion with vector-quantized\ncodes. Operating in an information-rich code space enables high-resolution 3D\nsynthesis via full-coverage attention across the views. Specifically, we\ngenerate the 3D codes (e.g. for CT images) conditional on previously generated\n3D codes and the entire codebook of two 2D views (e.g. 2D X-rays). Qualitative\nand quantitative results demonstrate state-of-the-art performance over\nspecialized methods across varied evaluation criteria, including fidelity\nmetrics such as density, coverage, and distortion metrics for two complex\nvolumetric imagery datasets from in real-world scenarios.\n","authors":["Abril Corona-Figueroa","Sam Bond-Taylor","Neelanjan Bhowmik","Yona Falinie A. Gaus","Toby P. Breckon","Hubert P. H. Shum","Chris G. Willcocks"],"pdf_url":"https://arxiv.org/pdf/2308.14152v1.pdf","comment":"Camera-ready version for ICCV 2023"},{"id":"http://arxiv.org/abs/2303.09551v2","updated":"2023-08-27T15:33:19Z","published":"2023-03-16T17:59:08Z","title":"SurroundOcc: Multi-Camera 3D Occupancy Prediction for Autonomous Driving","summary":" 3D scene understanding plays a vital role in vision-based autonomous driving.\nWhile most existing methods focus on 3D object detection, they have difficulty\ndescribing real-world objects of arbitrary shapes and infinite classes. Towards\na more comprehensive perception of a 3D scene, in this paper, we propose a\nSurroundOcc method to predict the 3D occupancy with multi-camera images. We\nfirst extract multi-scale features for each image and adopt spatial 2D-3D\nattention to lift them to the 3D volume space. Then we apply 3D convolutions to\nprogressively upsample the volume features and impose supervision on multiple\nlevels. To obtain dense occupancy prediction, we design a pipeline to generate\ndense occupancy ground truth without expansive occupancy annotations.\nSpecifically, we fuse multi-frame LiDAR scans of dynamic objects and static\nscenes separately. Then we adopt Poisson Reconstruction to fill the holes and\nvoxelize the mesh to get dense occupancy labels. Extensive experiments on\nnuScenes and SemanticKITTI datasets demonstrate the superiority of our method.\nCode and dataset are available at https://github.com/weiyithu/SurroundOcc\n","authors":["Yi Wei","Linqing Zhao","Wenzhao Zheng","Zheng Zhu","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2303.09551v2.pdf","comment":"Accepted to ICCV 2023. Code is available at\n https://github.com/weiyithu/SurroundOcc"},{"id":"http://arxiv.org/abs/2308.14133v1","updated":"2023-08-27T15:21:25Z","published":"2023-08-27T15:21:25Z","title":"Cheap Lunch for Medical Image Segmentation by Fine-tuning SAM on Few\n Exemplars","summary":" The Segment Anything Model (SAM) has demonstrated remarkable capabilities of\nscaled-up segmentation models, enabling zero-shot generalization across a\nvariety of domains. By leveraging large-scale foundational models as\npre-trained models, it is a natural progression to fine-tune SAM for specific\ndomains to further enhance performances. However, the adoption of foundational\nmodels in the medical domain presents a challenge due to the difficulty and\nexpense of labeling sufficient data for adaptation within hospital systems. In\nthis paper, we introduce an efficient and practical approach for fine-tuning\nSAM using a limited number of exemplars, making it suitable for such scenarios.\nOur approach combines two established techniques from the literature: an\nexemplar-guided synthesis module and the widely recognized Low-Rank Adaptation\n(LoRA) fine-tuning strategy, serving as data-level and model-level attempts\nrespectively. Interestingly, our empirical findings suggest that SAM can be\neffectively aligned within the medical domain even with few labeled data. We\nvalidate our approach through experiments on brain tumor segmentation (BraTS)\nand multi-organ CT segmentation (Synapse). The comprehensive results underscore\nthe feasibility and effectiveness of such an approach, paving the way for the\npractical application of SAM in the medical domain.\n","authors":["Weijia Feng","Lingting Zhu","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.14133v1.pdf","comment":"Accepted by Brain Lesion (BrainLes) workshop of International\n Conference on Medical Image Computing and Computer Assisted Intervention\n (MICCAI BrainLes 2023). 10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.14126v1","updated":"2023-08-27T15:03:10Z","published":"2023-08-27T15:03:10Z","title":"Synergizing Contrastive Learning and Optimal Transport for 3D Point\n Cloud Domain Adaptation","summary":" Recently, the fundamental problem of unsupervised domain adaptation (UDA) on\n3D point clouds has been motivated by a wide variety of applications in\nrobotics, virtual reality, and scene understanding, to name a few. The point\ncloud data acquisition procedures manifest themselves as significant domain\ndiscrepancies and geometric variations among both similar and dissimilar\nclasses. The standard domain adaptation methods developed for images do not\ndirectly translate to point cloud data because of their complex geometric\nnature. To address this challenge, we leverage the idea of multimodality and\nalignment between distributions. We propose a new UDA architecture for point\ncloud classification that benefits from multimodal contrastive learning to get\nbetter class separation in both domains individually. Further, the use of\noptimal transport (OT) aims at learning source and target data distributions\njointly to reduce the cross-domain shift and provide a better alignment. We\nconduct a comprehensive empirical study on PointDA-10 and GraspNetPC-10 and\nshow that our method achieves state-of-the-art performance on GraspNetPC-10\n(with approx 4-12% margin) and best average performance on PointDA-10. Our\nablation studies and decision boundary analysis also validate the significance\nof our contrastive learning module and OT alignment.\n","authors":["Siddharth Katageri","Arkadipta De","Chaitanya Devaguptapu","VSSV Prasad","Charu Sharma","Manohar Kaul"],"pdf_url":"https://arxiv.org/pdf/2308.14126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14119v1","updated":"2023-08-27T14:25:07Z","published":"2023-08-27T14:25:07Z","title":"Semi-Supervised Learning in the Few-Shot Zero-Shot Scenario","summary":" Semi-Supervised Learning (SSL) leverages both labeled and unlabeled data to\nimprove model performance. Traditional SSL methods assume that labeled and\nunlabeled data share the same label space. However, in real-world applications,\nespecially when the labeled training set is small, there may be classes that\nare missing from the labeled set. Existing frameworks aim to either reject all\nunseen classes (open-set SSL) or to discover unseen classes by partitioning an\nunlabeled set during training (open-world SSL). In our work, we construct a\nclassifier for points from both seen and unseen classes. Our approach is based\non extending an existing SSL method, such as FlexMatch, by incorporating an\nadditional entropy loss. This enhancement allows our method to improve the\nperformance of any existing SSL method in the classification of both seen and\nunseen classes. We demonstrate large improvement gains over state-of-the-art\nSSL, open-set SSL, and open-world SSL methods, on two benchmark image\nclassification data sets, CIFAR-100 and STL-10. The gains are most pronounced\nwhen the labeled data is severely limited (1-25 labeled examples per class).\n","authors":["Noam Fluss","Guy Hacohen","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2308.14119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13608v2","updated":"2023-08-27T14:11:34Z","published":"2023-05-23T02:16:14Z","title":"VDD: Varied Drone Dataset for Semantic Segmentation","summary":" Semantic segmentation of drone images is critical to many aerial vision tasks\nas it provides essential semantic details that can compensate for the lack of\ndepth information from monocular cameras. However, maintaining high accuracy of\nsemantic segmentation models for drones requires diverse, large-scale, and\nhigh-resolution datasets, which are rare in the field of aerial image\nprocessing. Existing datasets are typically small and focus primarily on urban\nscenes, neglecting rural and industrial areas. Models trained on such datasets\nare not sufficiently equipped to handle the variety of inputs seen in drone\nimagery. In the VDD-Varied Drone Dataset, we offer a large-scale and densely\nlabeled dataset comprising 400 high-resolution images that feature carefully\nchosen scenes, camera angles, and varied light and weather conditions.\nFurthermore, we have adapted existing drone datasets to conform to our\nannotation standards and integrated them with VDD to create a dataset 1.5 times\nthe size of fine annotation of Cityscapes. We have developed a novel DeepLabT\nmodel, which combines CNN and Transformer backbones, to provide a reliable\nbaseline for semantic segmentation in drone imagery. Our experiments indicate\nthat DeepLabT performs admirably on VDD and other drone datasets. We expect\nthat our dataset will generate considerable interest in drone image\nsegmentation and serve as a foundation for other drone vision tasks. VDD is\nfreely available on our website at https://vddvdd.com .\n","authors":["Wenxiao Cai","Ke Jin","Jinyan Hou","Cong Guo","Letian Wu","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2305.13608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14113v1","updated":"2023-08-27T14:07:57Z","published":"2023-08-27T14:07:57Z","title":"Semantic-aware Consistency Network for Cloth-changing Person\n Re-Identification","summary":" Cloth-changing Person Re-Identification (CC-ReID) is a challenging task that\naims to retrieve the target person across multiple surveillance cameras when\nclothing changes might happen. Despite recent progress in CC-ReID, existing\napproaches are still hindered by the interference of clothing variations since\nthey lack effective constraints to keep the model consistently focused on\nclothing-irrelevant regions. To address this issue, we present a Semantic-aware\nConsistency Network (SCNet) to learn identity-related semantic features by\nproposing effective consistency constraints. Specifically, we generate the\nblack-clothing image by erasing pixels in the clothing area, which explicitly\nmitigates the interference from clothing variations. In addition, to fully\nexploit the fine-grained identity information, a head-enhanced attention module\nis introduced, which learns soft attention maps by utilizing the proposed\npart-based matching loss to highlight head information. We further design a\nsemantic consistency loss to facilitate the learning of high-level\nidentity-related semantic features, forcing the model to focus on semantically\nconsistent cloth-irrelevant regions. By using the consistency constraint, our\nmodel does not require any extra auxiliary segmentation module to generate the\nblack-clothing image or locate the head region during the inference stage.\nExtensive experiments on four cloth-changing person Re-ID datasets (LTCC, PRCC,\nVc-Clothes, and DeepChange) demonstrate that our proposed SCNet makes\nsignificant improvements over prior state-of-the-art approaches. Our code is\navailable at: https://github.com/Gpn-star/SCNet.\n","authors":["Peini Guo","Hong Liu","Jianbing Wu","Guoquan Wang","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14113v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2304.09807v2","updated":"2023-08-27T13:58:18Z","published":"2023-04-19T16:47:20Z","title":"VMA: Divide-and-Conquer Vectorized Map Annotation System for Large-Scale\n Driving Scene","summary":" High-definition (HD) map serves as the essential infrastructure of autonomous\ndriving. In this work, we build up a systematic vectorized map annotation\nframework (termed VMA) for efficiently generating HD map of large-scale driving\nscene. We design a divide-and-conquer annotation scheme to solve the spatial\nextensibility problem of HD map generation, and abstract map elements with a\nvariety of geometric patterns as unified point sequence representation, which\ncan be extended to most map elements in the driving scene. VMA is highly\nefficient and extensible, requiring negligible human effort, and flexible in\nterms of spatial scale and element type. We quantitatively and qualitatively\nvalidate the annotation performance on real-world urban and highway scenes, as\nwell as NYC Planimetric Database. VMA can significantly improve map generation\nefficiency and require little human effort. On average VMA takes 160min for\nannotating a scene with a range of hundreds of meters, and reduces 52.3% of the\nhuman cost, showing great application value. Code:\nhttps://github.com/hustvl/VMA.\n","authors":["Shaoyu Chen","Yunchi Zhang","Bencheng Liao","Jiafeng Xie","Tianheng Cheng","Wei Sui","Qian Zhang","Chang Huang","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.09807v2.pdf","comment":"https://github.com/hustvl/VMA"},{"id":"http://arxiv.org/abs/2308.14108v1","updated":"2023-08-27T13:50:15Z","published":"2023-08-27T13:50:15Z","title":"Depth self-supervision for single image novel view synthesis","summary":" In this paper, we tackle the problem of generating a novel image from an\narbitrary viewpoint given a single frame as input. While existing methods\noperating in this setup aim at predicting the target view depth map to guide\nthe synthesis, without explicit supervision over such a task, we jointly\noptimize our framework for both novel view synthesis and depth estimation to\nunleash the synergy between the two at its best. Specifically, a shared depth\ndecoder is trained in a self-supervised manner to predict depth maps that are\nconsistent across the source and target views. Our results demonstrate the\neffectiveness of our approach in addressing the challenges of both tasks\nallowing for higher-quality generated images, as well as more accurate depth\nfor the target viewpoint.\n","authors":["Giovanni Minelli","Matteo Poggi","Samuele Salti"],"pdf_url":"https://arxiv.org/pdf/2308.14108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1912.03623v3","updated":"2023-08-27T13:50:10Z","published":"2019-12-08T06:10:49Z","title":"Single image reflection removal via learning with multi-image\n constraints","summary":" Reflections are very common phenomena in our daily photography, which\ndistract people's attention from the scene behind the glass. The problem of\nremoving reflection artifacts is important but challenging due to its ill-posed\nnature. The traditional approaches solve an optimization problem over the\nconstraints induced from multiple images, at the expense of large computation\ncosts. Recent learning-based approaches have demonstrated a significant\nimprovement in both performance and running time for single image reflection\nremoval, but are limited as they require a large number of synthetic\nreflection/clean image pairs for direct supervision to approximate the ground\ntruth, at the risk of overfitting in the synthetic image domain and degrading\nin the real image domain. In this paper, we propose a novel learning-based\nsolution that combines the advantages of the aforementioned approaches and\novercomes their drawbacks. Our algorithm works by learning a deep neural\nnetwork to optimize the target with joint constraints enhanced among multiple\ninput images during the training phase, but is able to eliminate reflections\nonly from a single input for evaluation. Our algorithm runs in real-time and\nachieves state-of-the-art reflection removal performance on real images. We\nfurther propose a strong network backbone that disentangles the background and\nreflection information into separate latent codes, which are embedded into a\nshared one-branch deep neural network for both background and reflection\npredictions. The proposed backbone experimentally performs better than the\nother common network implementations, and provides insightful knowledge to\nunderstand the reflection removal task.\n","authors":["Yingda Yin","Qingnan Fan","Dongdong Chen","Yujie Wang","Angelica Aviles-Rivero","Ruoteng Li","Carola-Bibiane Schnlieb","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/1912.03623v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14105v1","updated":"2023-08-27T13:22:55Z","published":"2023-08-27T13:22:55Z","title":"Unified and Dynamic Graph for Temporal Character Grouping in Long Videos","summary":" Video temporal character grouping locates appearing moments of major\ncharacters within a video according to their identities. To this end, recent\nworks have evolved from unsupervised clustering to graph-based supervised\nclustering. However, graph methods are built upon the premise of fixed affinity\ngraphs, bringing many inexact connections. Besides, they extract multi-modal\nfeatures with kinds of models, which are unfriendly to deployment. In this\npaper, we present a unified and dynamic graph (UniDG) framework for temporal\ncharacter grouping. This is accomplished firstly by a unified representation\nnetwork that learns representations of multiple modalities within the same\nspace and still preserves the modality's uniqueness simultaneously. Secondly,\nwe present a dynamic graph clustering where the neighbors of different\nquantities are dynamically constructed for each node via a cyclic matching\nstrategy, leading to a more reliable affinity graph. Thirdly, a progressive\nassociation method is introduced to exploit spatial and temporal contexts among\ndifferent modalities, allowing multi-modal clustering results to be well fused.\nAs current datasets only provide pre-extracted features, we evaluate our UniDG\nmethod on a collected dataset named MTCG, which contains each character's\nappearing clips of face and body and speaking voice tracks. We also evaluate\nour key components on existing clustering and retrieval datasets to verify the\ngeneralization ability. Experimental results manifest that our method can\nachieve promising results and outperform several state-of-the-art approaches.\n","authors":["Xiujun Shu","Wei Wen","Liangsheng Xu","Mingbao Lin","Ruizhi Qiao","Taian Guo","Hanjun Li","Bei Gan","Xiao Wang","Xin Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14103v1","updated":"2023-08-27T13:17:34Z","published":"2023-08-27T13:17:34Z","title":"Towards Unified Token Learning for Vision-Language Tracking","summary":" In this paper, we present a simple, flexible and effective vision-language\n(VL) tracking pipeline, termed \\textbf{MMTrack}, which casts VL tracking as a\ntoken generation task. Traditional paradigms address VL tracking task\nindirectly with sophisticated prior designs, making them over-specialize on the\nfeatures of specific architectures or mechanisms. In contrast, our proposed\nframework serializes language description and bounding box into a sequence of\ndiscrete tokens. In this new design paradigm, all token queries are required to\nperceive the desired target and directly predict spatial coordinates of the\ntarget in an auto-regressive manner. The design without other prior modules\navoids multiple sub-tasks learning and hand-designed loss functions,\nsignificantly reducing the complexity of VL tracking modeling and allowing our\ntracker to use a simple cross-entropy loss as unified optimization objective\nfor VL tracking task. Extensive experiments on TNL2K, LaSOT, LaSOT$_{\\rm{ext}}$\nand OTB99-Lang benchmarks show that our approach achieves promising results,\ncompared to other state-of-the-arts.\n","authors":["Yaozong Zheng","Bineng Zhong","Qihua Liang","Guorong Li","Rongrong Ji","Xianxian Li"],"pdf_url":"https://arxiv.org/pdf/2308.14103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14101v1","updated":"2023-08-27T13:13:28Z","published":"2023-08-27T13:13:28Z","title":"Superpixels algorithms through network community detection","summary":" Community detection is a powerful tool from complex networks analysis that\nfinds applications in various research areas. Several image segmentation\nmethods rely for instance on community detection algorithms as a black box in\norder to compute undersegmentations, i.e. a small number of regions that\nrepresent areas of interest of the image. However, to the best of our\nknowledge, the efficiency of such an approach w.r.t. superpixels, that aim at\nrepresenting the image at a smaller level while preserving as much as possible\noriginal information, has been neglected so far. The only related work seems to\nbe the one by Liu et. al. (IET Image Processing, 2022) that developed a\nsuperpixels algorithm using a so-called modularity maximization approach,\nleading to relevant results. We follow this line of research by studying the\nefficiency of superpixels computed by state-of-the-art community detection\nalgorithms on a 4-connected pixel graph, so-called pixel-grid. We first detect\ncommunities on such a graph and then apply a simple merging procedure that\nallows to obtain the desired number of superpixels. As we shall see, such\nmethods result in the computation of relevant superpixels as emphasized by both\nqualitative and quantitative experiments, according to different widely-used\nmetrics based on ground-truth comparison or on superpixels only. We observe\nthat the choice of the community detection algorithm has a great impact on the\nnumber of communities and hence on the merging procedure. Similarly, small\nvariations on the pixel-grid may provide different results from both\nqualitative and quantitative viewpoints. For the sake of completeness, we\ncompare our results with those of several state-of-the-art superpixels\nalgorithms as computed by Stutz et al. (Computer Vision and Image\nUnderstanding, 2018).\n","authors":["Anthony Perez"],"pdf_url":"https://arxiv.org/pdf/2308.14101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14100v1","updated":"2023-08-27T13:07:44Z","published":"2023-08-27T13:07:44Z","title":"Rethinking Exemplars for Continual Semantic Segmentation in Endoscopy\n Scenes: Entropy-based Mini-Batch Pseudo-Replay","summary":" Endoscopy is a widely used technique for the early detection of diseases or\nrobotic-assisted minimally invasive surgery (RMIS). Numerous deep learning\n(DL)-based research works have been developed for automated diagnosis or\nprocessing of endoscopic view. However, existing DL models may suffer from\ncatastrophic forgetting. When new target classes are introduced over time or\ncross institutions, the performance of old classes may suffer severe\ndegradation. More seriously, data privacy and storage issues may lead to the\nunavailability of old data when updating the model. Therefore, it is necessary\nto develop a continual learning (CL) methodology to solve the problem of\ncatastrophic forgetting in endoscopic image segmentation. To tackle this, we\npropose a Endoscopy Continual Semantic Segmentation (EndoCSS) framework that\ndoes not involve the storage and privacy issues of exemplar data. The framework\nincludes a mini-batch pseudo-replay (MB-PR) mechanism and a self-adaptive noisy\ncross-entropy (SAN-CE) loss. The MB-PR strategy circumvents privacy and storage\nissues by generating pseudo-replay images through a generative model.\nMeanwhile, the MB-PR strategy can also correct the model deviation to the\nreplay data and current training data, which is aroused by the significant\ndifference in the amount of current and replay images. Therefore, the model can\nperform effective representation learning on both new and old tasks. SAN-CE\nloss can help model fitting by adjusting the model's output logits, and also\nimprove the robustness of training. Extensive continual semantic segmentation\n(CSS) experiments on public datasets demonstrate that our method can robustly\nand effectively address the catastrophic forgetting brought by class increment\nin endoscopy scenes. The results show that our framework holds excellent\npotential for real-world deployment in a streaming learning manner.\n","authors":["Guankun Wang","Long Bai","Yanan Wu","Tong Chen","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.14100v1.pdf","comment":"Accepted by Computers in Biology and Medicine"},{"id":"http://arxiv.org/abs/2307.00724v3","updated":"2023-08-27T12:49:57Z","published":"2023-07-03T03:09:44Z","title":"LXL: LiDAR Excluded Lean 3D Object Detection with 4D Imaging Radar and\n Camera Fusion","summary":" As an emerging technology and a relatively affordable device, the 4D imaging\nradar has already been confirmed effective in performing 3D object detection in\nautonomous driving. Nevertheless, the sparsity and noisiness of 4D radar point\nclouds hinder further performance improvement, and in-depth studies about its\nfusion with other modalities are lacking. On the other hand, as a new image\nview transformation strategy, \"sampling\" has been applied in a few image-based\ndetectors and shown to outperform the widely applied \"depth-based splatting\"\nproposed in Lift-Splat-Shoot (LSS), even without image depth prediction.\nHowever, the potential of \"sampling\" is not fully unleashed. In this paper, we\ninvestigate the \"sampling\" view transformation strategy on the camera and 4D\nimaging radar fusion-based 3D object detection. In the proposed LiDAR Excluded\nLean (LXL) model, predicted image depth distribution maps and radar 3D\noccupancy grids are generated from image perspective view (PV) features and\nradar bird's eye view (BEV) features, respectively. They are sent to the core\nof LXL, called \"radar occupancy-assisted depth-based sampling\", to aid image\nview transformation. Introducing image depths and radar information enhances\nthe \"sampling\" strategy and leads to more accurate view transformation.\nExperiments on VoD and TJ4DRadSet datasets show that the proposed method\noutperforms the state-of-the-art 3D object detection methods by a significant\nmargin without bells and whistles. Ablation studies demonstrate that our method\nperforms the best among different enhancement settings.\n","authors":["Weiyi Xiong","Jianan Liu","Tao Huang","Qing-Long Han","Yuxuan Xia","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2307.00724v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14087v1","updated":"2023-08-27T12:20:28Z","published":"2023-08-27T12:20:28Z","title":"A comprehensive review on Plant Leaf Disease detection using Deep\n learning","summary":" Leaf disease is a common fatal disease for plants. Early diagnosis and\ndetection is necessary in order to improve the prognosis of leaf diseases\naffecting plant. For predicting leaf disease, several automated systems have\nalready been developed using different plant pathology imaging modalities. This\npaper provides a systematic review of the literature on leaf disease-based\nmodels for the diagnosis of various plant leaf diseases via deep learning. The\nadvantages and limitations of different deep learning models including Vision\nTransformer (ViT), Deep convolutional neural network (DCNN), Convolutional\nneural network (CNN), Residual Skip Network-based Super-Resolution for Leaf\nDisease Detection (RSNSR-LDD), Disease Detection Network (DDN), and YOLO (You\nonly look once) are described in this review. The review also shows that the\nstudies related to leaf disease detection applied different deep learning\nmodels to a number of publicly available datasets. For comparing the\nperformance of the models, different metrics such as accuracy, precision,\nrecall, etc. were used in the existing studies.\n","authors":["Sumaya Mustofa","Md Mehedi Hasan Munna","Yousuf Rayhan Emon","Golam Rabbany","Md Taimur Ahad"],"pdf_url":"https://arxiv.org/pdf/2308.14087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14084v1","updated":"2023-08-27T12:12:27Z","published":"2023-08-27T12:12:27Z","title":"Practical Edge Detection via Robust Collaborative Learning","summary":" Edge detection, as a core component in a wide range of visionoriented tasks,\nis to identify object boundaries and prominent edges in natural images. An edge\ndetector is desired to be both efficient and accurate for practical use. To\nachieve the goal, two key issues should be concerned: 1) How to liberate deep\nedge models from inefficient pre-trained backbones that are leveraged by most\nexisting deep learning methods, for saving the computational cost and cutting\nthe model size; and 2) How to mitigate the negative influence from noisy or\neven wrong labels in training data, which widely exist in edge detection due to\nthe subjectivity and ambiguity of annotators, for the robustness and accuracy.\nIn this paper, we attempt to simultaneously address the above problems via\ndeveloping a collaborative learning based model, termed PEdger. The principle\nbehind our PEdger is that, the information learned from different training\nmoments and heterogeneous (recurrent and non recurrent in this work)\narchitectures, can be assembled to explore robust knowledge against noisy\nannotations, even without the help of pre-training on extra data. Extensive\nablation studies together with quantitative and qualitative experimental\ncomparisons on the BSDS500 and NYUD datasets are conducted to verify the\neffectiveness of our design, and demonstrate its superiority over other\ncompetitors in terms of accuracy, speed, and model size. Codes can be found at\nhttps://github.co/ForawardStar/PEdger.\n","authors":["Yuanbin Fu","Xiaojie Guo"],"pdf_url":"https://arxiv.org/pdf/2308.14084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14083v1","updated":"2023-08-27T12:08:49Z","published":"2023-08-27T12:08:49Z","title":"4D Myocardium Reconstruction with Decoupled Motion and Shape Model","summary":" Estimating the shape and motion state of the myocardium is essential in\ndiagnosing cardiovascular diseases.However, cine magnetic resonance (CMR)\nimaging is dominated by 2D slices, whose large slice spacing challenges\ninter-slice shape reconstruction and motion acquisition.To address this\nproblem, we propose a 4D reconstruction method that decouples motion and shape,\nwhich can predict the inter-/intra- shape and motion estimation from a given\nsparse point cloud sequence obtained from limited slices. Our framework\ncomprises a neural motion model and an end-diastolic (ED) shape model. The\nimplicit ED shape model can learn a continuous boundary and encourage the\nmotion model to predict without the supervision of ground truth deformation,\nand the motion model enables canonical input of the shape model by deforming\nany point from any phase to the ED phase. Additionally, the constructed\nED-space enables pre-training of the shape model, thereby guiding the motion\nmodel and addressing the issue of data scarcity. We propose the first 4D\nmyocardial dataset as we know and verify our method on the proposed, public,\nand cross-modal datasets, showing superior reconstruction performance and\nenabling various clinical applications.\n","authors":["Xiaohan Yuan","Cong Liu","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14083v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14082v1","updated":"2023-08-27T12:01:11Z","published":"2023-08-27T12:01:11Z","title":"Reconstructing Interacting Hands with Interaction Prior from Monocular\n Images","summary":" Reconstructing interacting hands from monocular images is indispensable in\nAR/VR applications. Most existing solutions rely on the accurate localization\nof each skeleton joint. However, these methods tend to be unreliable due to the\nsevere occlusion and confusing similarity among adjacent hand parts. This also\ndefies human perception because humans can quickly imitate an interaction\npattern without localizing all joints. Our key idea is to first construct a\ntwo-hand interaction prior and recast the interaction reconstruction task as\nthe conditional sampling from the prior. To expand more interaction states, a\nlarge-scale multimodal dataset with physical plausibility is proposed. Then a\nVAE is trained to further condense these interaction patterns as latent codes\nin a prior distribution. When looking for image cues that contribute to\ninteraction prior sampling, we propose the interaction adjacency heatmap (IAH).\nCompared with a joint-wise heatmap for localization, IAH assigns denser visible\nfeatures to those invisible joints. Compared with an all-in-one visible\nheatmap, it provides more fine-grained local interaction information in each\ninteraction region. Finally, the correlations between the extracted features\nand corresponding interaction codes are linked by the ViT module. Comprehensive\nevaluations on benchmark datasets have verified the effectiveness of this\nframework. The code and dataset are publicly available at\nhttps://github.com/binghui-z/InterPrior_pytorch\n","authors":["Binghui Zuo","Zimeng Zhao","Wenqian Sun","Wei Xie","Zhou Xue","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14082v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14081v1","updated":"2023-08-27T11:57:56Z","published":"2023-08-27T11:57:56Z","title":"U-SEANNet: A Simple, Efficient and Applied U-Shaped Network for\n Diagnosing Nasal Diseases from Nasal Endoscopic Images","summary":" Utilizing deep learning (DL) models to improve the early diagnosis of nasal\ndiseases from nasal endoscopic images holds paramount importance. However, the\nlack of available datasets stymies advancements in this field. Furthermore,\nexisting models fail to strike a good trade-off between model diagnosis\nperformance, model complexity and parameter size, rendering them unsuitable for\npractical application. To bridge these gaps, we created the first large-scale\nnasal endoscopy dataset, named 7-NasEID, comprising 11,352 images that span six\nnasal diseases and normal samples. Building on this, we proposed U-SEANNet, an\ninnovative architecture, underpinned by depth-wise separable convolutions.\nAdditionally, to augment its discernment capabilities for subtle variations in\ninput images, we further proposed the Global-Local Channel Feature Fusion\nModule, enabling the U-SEANNet to focus salient channel features from both\nglobal and local contexts. Notably, U-SEANNet's parameter size and GFLOPs are\nonly 0.78M and 0.21, respectively. Employing the 7-NasalEID, we conducted the\nfive-fold cross-validation on U-SEANNet, juxtaposing its performance against\nseventeen renowned architectures. The experimental results suggest U-SEANNet as\nthe state-of-the-art (SOTA) model, achieves an accuracy of 93.58%, sensitivity\nof 90.17%, and specificity of 91.27%. These findings demonstrate U-SEANNet's\nprodigious potential for diagnosing nasal diseases in practical use, providing\nthe development of efficacy nasal diseases diagnosis tools with a new insight.\n","authors":["Yubiao Yue","Jun Xue","Haihua Liang","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.14081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08648v3","updated":"2023-08-27T11:52:50Z","published":"2023-06-14T17:28:45Z","title":"SimpleMapping: Real-Time Visual-Inertial Dense Mapping with Deep\n Multi-View Stereo","summary":" We present a real-time visual-inertial dense mapping method capable of\nperforming incremental 3D mesh reconstruction with high quality using only\nsequential monocular images and inertial measurement unit (IMU) readings. 6-DoF\ncamera poses are estimated by a robust feature-based visual-inertial odometry\n(VIO), which also generates noisy sparse 3D map points as a by-product. We\npropose a sparse point aided multi-view stereo neural network (SPA-MVSNet) that\ncan effectively leverage the informative but noisy sparse points from the VIO\nsystem. The sparse depth from VIO is firstly completed by a single-view depth\ncompletion network. This dense depth map, although naturally limited in\naccuracy, is then used as a prior to guide our MVS network in the cost volume\ngeneration and regularization for accurate dense depth prediction. Predicted\ndepth maps of keyframe images by the MVS network are incrementally fused into a\nglobal map using TSDF-Fusion. We extensively evaluate both the proposed\nSPA-MVSNet and the entire visual-inertial dense mapping system on several\npublic datasets as well as our own dataset, demonstrating the system's\nimpressive generalization capabilities and its ability to deliver high-quality\n3D mesh reconstruction online. Our proposed dense mapping system achieves a\n39.7% improvement in F-score over existing systems when evaluated on the\nchallenging scenarios of the EuRoC dataset.\n","authors":["Yingye Xin","Xingxing Zuo","Dongyue Lu","Stefan Leutenegger"],"pdf_url":"https://arxiv.org/pdf/2306.08648v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14078v1","updated":"2023-08-27T11:52:00Z","published":"2023-08-27T11:52:00Z","title":"Sparse3D: Distilling Multiview-Consistent Diffusion for Object\n Reconstruction from Sparse Views","summary":" Reconstructing 3D objects from extremely sparse views is a long-standing and\nchallenging problem. While recent techniques employ image diffusion models for\ngenerating plausible images at novel viewpoints or for distilling pre-trained\ndiffusion priors into 3D representations using score distillation sampling\n(SDS), these methods often struggle to simultaneously achieve high-quality,\nconsistent, and detailed results for both novel-view synthesis (NVS) and\ngeometry. In this work, we present Sparse3D, a novel 3D reconstruction method\ntailored for sparse view inputs. Our approach distills robust priors from a\nmultiview-consistent diffusion model to refine a neural radiance field.\nSpecifically, we employ a controller that harnesses epipolar features from\ninput views, guiding a pre-trained diffusion model, such as Stable Diffusion,\nto produce novel-view images that maintain 3D consistency with the input. By\ntapping into 2D priors from powerful image diffusion models, our integrated\nmodel consistently delivers high-quality results, even when faced with\nopen-world objects. To address the blurriness introduced by conventional SDS,\nwe introduce the category-score distillation sampling (C-SDS) to enhance\ndetail. We conduct experiments on CO3DV2 which is a multi-view dataset of\nreal-world objects. Both quantitative and qualitative evaluations demonstrate\nthat our approach outperforms previous state-of-the-art works on the metrics\nregarding NVS and geometry reconstruction.\n","authors":["Zi-Xin Zou","Weihao Cheng","Yan-Pei Cao","Shi-Sheng Huang","Ying Shan","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14076v1","updated":"2023-08-27T11:49:46Z","published":"2023-08-27T11:49:46Z","title":"A Novel Multi-scale Attention Feature Extraction Block for Aerial Remote\n Sensing Image Classification","summary":" Classification of very high-resolution (VHR) aerial remote sensing (RS)\nimages is a well-established research area in the remote sensing community as\nit provides valuable spatial information for decision-making. Existing works on\nVHR aerial RS image classification produce an excellent classification\nperformance; nevertheless, they have a limited capability to well-represent VHR\nRS images having complex and small objects, thereby leading to performance\ninstability. As such, we propose a novel plug-and-play multi-scale attention\nfeature extraction block (MSAFEB) based on multi-scale convolution at two\nlevels with skip connection, producing discriminative/salient information at a\ndeeper/finer level. The experimental study on two benchmark VHR aerial RS image\ndatasets (AID and NWPU) demonstrates that our proposal achieves a\nstable/consistent performance (minimum standard deviation of $0.002$) and\ncompetent overall classification performance (AID: 95.85\\% and NWPU: 94.09\\%).\n","authors":["Chiranjibi Sitaula","Jagannath Aryal","Avik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.14076v1.pdf","comment":"The paper is under review in IEEE Geoscience and Remote Sensing\n Letters Journal (IEEE-GRSL). This version may be deleted and/or updated based\n on the journal's policy"},{"id":"http://arxiv.org/abs/2308.14075v1","updated":"2023-08-27T11:38:42Z","published":"2023-08-27T11:38:42Z","title":"FaceCoresetNet: Differentiable Coresets for Face Set Recognition","summary":" In set-based face recognition, we aim to compute the most discriminative\ndescriptor from an unbounded set of images and videos showing a single person.\nA discriminative descriptor balances two policies when aggregating information\nfrom a given set. The first is a quality-based policy: emphasizing high-quality\nand down-weighting low-quality images. The second is a diversity-based policy:\nemphasizing unique images in the set and down-weighting multiple occurrences of\nsimilar images as found in video clips which can overwhelm the set\nrepresentation. This work frames face-set representation as a differentiable\ncoreset selection problem. Our model learns how to select a small coreset of\nthe input set that balances quality and diversity policies using a learned\nmetric parameterized by the face quality, optimized end-to-end. The selection\nprocess is a differentiable farthest-point sampling (FPS) realized by\napproximating the non-differentiable Argmax operation with differentiable\nsampling from the Gumbel-Softmax distribution of distances. The small coreset\nis later used as queries in a self and cross-attention architecture to enrich\nthe descriptor with information from the whole set. Our model is\norder-invariant and linear in the input set size. We set a new SOTA to set face\nverification on the IJB-B and IJB-C datasets. Our code is publicly available.\n","authors":["Gil Shapira","Yosi Keller"],"pdf_url":"https://arxiv.org/pdf/2308.14075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14074v1","updated":"2023-08-27T11:37:26Z","published":"2023-08-27T11:37:26Z","title":"Nonrigid Object Contact Estimation With Regional Unwrapping Transformer","summary":" Acquiring contact patterns between hands and nonrigid objects is a common\nconcern in the vision and robotics community. However, existing learning-based\nmethods focus more on contact with rigid ones from monocular images. When\nadopting them for nonrigid contact, a major problem is that the existing\ncontact representation is restricted by the geometry of the object.\nConsequently, contact neighborhoods are stored in an unordered manner and\ncontact features are difficult to align with image cues. At the core of our\napproach lies a novel hand-object contact representation called RUPs (Region\nUnwrapping Profiles), which unwrap the roughly estimated hand-object surfaces\nas multiple high-resolution 2D regional profiles. The region grouping strategy\nis consistent with the hand kinematic bone division because they are the\nprimitive initiators for a composite contact pattern. Based on this\nrepresentation, our Regional Unwrapping Transformer (RUFormer) learns the\ncorrelation priors across regions from monocular inputs and predicts\ncorresponding contact and deformed transformations. Our experiments demonstrate\nthat the proposed framework can robustly estimate the deformed degrees and\ndeformed transformations, which makes it suitable for both nonrigid and rigid\ncontact.\n","authors":["Wei Xie","Zimeng Zhao","Shiying Li","Binghui Zuo","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14074v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14070v1","updated":"2023-08-27T11:04:26Z","published":"2023-08-27T11:04:26Z","title":"DETDet: Dual Ensemble Teeth Detection","summary":" The field of dentistry is in the era of digital transformation. Particularly,\nartificial intelligence is anticipated to play a significant role in digital\ndentistry. AI holds the potential to significantly assist dental practitioners\nand elevate diagnostic accuracy. In alignment with this vision, the 2023 MICCAI\nDENTEX challenge aims to enhance the performance of dental panoramic X-ray\ndiagnosis and enumeration through technological advancement. In response, we\nintroduce DETDet, a Dual Ensemble Teeth Detection network. DETDet encompasses\ntwo distinct modules dedicated to enumeration and diagnosis. Leveraging the\nadvantages of teeth mask data, we employ Mask-RCNN for the enumeration module.\nFor the diagnosis module, we adopt an ensemble model comprising DiffusionDet\nand DINO. To further enhance precision scores, we integrate a complementary\nmodule to harness the potential of unlabeled data. The code for our approach\nwill be made accessible at https://github.com/Bestever-choi/Evident\n","authors":["Kyoungyeon Choi","Jaewon Shin","Eunyi Lyou"],"pdf_url":"https://arxiv.org/pdf/2308.14070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14066v1","updated":"2023-08-27T10:39:33Z","published":"2023-08-27T10:39:33Z","title":"Bi-Modality Medical Image Synthesis Using Semi-Supervised Sequential\n Generative Adversarial Networks","summary":" In this paper, we propose a bi-modality medical image synthesis approach\nbased on sequential generative adversarial network (GAN) and semi-supervised\nlearning. Our approach consists of two generative modules that synthesize\nimages of the two modalities in a sequential order. A method for measuring the\nsynthesis complexity is proposed to automatically determine the synthesis order\nin our sequential GAN. Images of the modality with a lower complexity are\nsynthesized first, and the counterparts with a higher complexity are generated\nlater. Our sequential GAN is trained end-to-end in a semi-supervised manner. In\nsupervised training, the joint distribution of bi-modality images are learned\nfrom real paired images of the two modalities by explicitly minimizing the\nreconstruction losses between the real and synthetic images. To avoid\noverfitting limited training images, in unsupervised training, the marginal\ndistribution of each modality is learned based on unpaired images by minimizing\nthe Wasserstein distance between the distributions of real and fake images. We\ncomprehensively evaluate the proposed model using two synthesis tasks based on\nthree types of evaluate metrics and user studies. Visual and quantitative\nresults demonstrate the superiority of our method to the state-of-the-art\nmethods, and reasonable visual quality and clinical significance. Code is made\npublicly available at\nhttps://github.com/hustlinyi/Multimodal-Medical-Image-Synthesis.\n","authors":["Xin Yang","Yi Lin","Zhiwei Wang","Xin Li","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.14066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14064v1","updated":"2023-08-27T10:32:52Z","published":"2023-08-27T10:32:52Z","title":"Multi-model fusion for Aerial Vision and Dialog Navigation based on\n human attention aids","summary":" Drones have been widely used in many areas of our daily lives. It relieves\npeople of the burden of holding a controller all the time and makes drone\ncontrol easier to use for people with disabilities or occupied hands. However,\nthe control of aerial robots is more complicated compared to normal robots due\nto factors such as uncontrollable height. Therefore, it is crucial to develop\nan intelligent UAV that has the ability to talk to humans and follow natural\nlanguage commands. In this report, we present an aerial navigation task for the\n2023 ICCV Conversation History. Based on the AVDN dataset containing more than\n3k recorded navigation trajectories and asynchronous human-robot conversations,\nwe propose an effective method of fusion training of Human Attention Aided\nTransformer model (HAA-Transformer) and Human Attention Aided LSTM (HAA-LSTM)\nmodel, which achieves the prediction of the navigation routing points and human\nattention. The method not only achieves high SR and SPL metrics, but also shows\na 7% improvement in GP metrics compared to the baseline model.\n","authors":["Xinyi Wang","Xuan Cui","Danxu Li","Fang Liu","Licheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2308.14064v1.pdf","comment":"4 pages, 1 figures"},{"id":"http://arxiv.org/abs/2302.08715v2","updated":"2023-08-27T10:08:54Z","published":"2023-02-17T06:14:37Z","title":"EEP-3DQA: Efficient and Effective Projection-based 3D Model Quality\n Assessment","summary":" Currently, great numbers of efforts have been put into improving the\neffectiveness of 3D model quality assessment (3DQA) methods. However, little\nattention has been paid to the computational costs and inference time, which is\nalso important for practical applications. Unlike 2D media, 3D models are\nrepresented by more complicated and irregular digital formats, such as point\ncloud and mesh. Thus it is normally difficult to perform an efficient module to\nextract quality-aware features of 3D models. In this paper, we address this\nproblem from the aspect of projection-based 3DQA and develop a no-reference\n(NR) \\underline{E}fficient and \\underline{E}ffective\n\\underline{P}rojection-based \\underline{3D} Model \\underline{Q}uality\n\\underline{A}ssessment (\\textbf{EEP-3DQA}) method. The input projection images\nof EEP-3DQA are randomly sampled from the six perpendicular viewpoints of the\n3D model and are further spatially downsampled by the grid-mini patch sampling\nstrategy. Further, the lightweight Swin-Transformer tiny is utilized as the\nbackbone to extract the quality-aware features. Finally, the proposed EEP-3DQA\nand EEP-3DQA-t (tiny version) achieve the best performance than the existing\nstate-of-the-art NR-3DQA methods and even outperforms most full-reference (FR)\n3DQA methods on the point cloud and mesh quality assessment databases while\nconsuming less inference time than the compared 3DQA methods.\n","authors":["Zicheng Zhang","Wei Sun","Yingjie Zhou","Wei Lu","Yucheng Zhu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2302.08715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14061v1","updated":"2023-08-27T10:03:48Z","published":"2023-08-27T10:03:48Z","title":"Hierarchical Contrastive Learning for Pattern-Generalizable Image\n Corruption Detection","summary":" Effective image restoration with large-size corruptions, such as blind image\ninpainting, entails precise detection of corruption region masks which remains\nextremely challenging due to diverse shapes and patterns of corruptions. In\nthis work, we present a novel method for automatic corruption detection, which\nallows for blind corruption restoration without known corruption masks.\nSpecifically, we develop a hierarchical contrastive learning framework to\ndetect corrupted regions by capturing the intrinsic semantic distinctions\nbetween corrupted and uncorrupted regions. In particular, our model detects the\ncorrupted mask in a coarse-to-fine manner by first predicting a coarse mask by\ncontrastive learning in low-resolution feature space and then refines the\nuncertain area of the mask by high-resolution contrastive learning. A\nspecialized hierarchical interaction mechanism is designed to facilitate the\nknowledge propagation of contrastive learning in different scales, boosting the\nmodeling performance substantially. The detected multi-scale corruption masks\nare then leveraged to guide the corruption restoration. Detecting corrupted\nregions by learning the contrastive distinctions rather than the semantic\npatterns of corruptions, our model has well generalization ability across\ndifferent corruption patterns. Extensive experiments demonstrate following\nmerits of our model: 1) the superior performance over other methods on both\ncorruption detection and various image restoration tasks including blind\ninpainting and watermark removal, and 2) strong generalization across different\ncorruption patterns such as graffiti, random noise or other image content.\nCodes and trained weights are available at https://github.com/xyfJASON/HCL .\n","authors":["Xin Feng","Yifeng Xu","Guangming Lu","Wenjie Pei"],"pdf_url":"https://arxiv.org/pdf/2308.14061v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14058v1","updated":"2023-08-27T09:45:41Z","published":"2023-08-27T09:45:41Z","title":"Pruning the Unlabeled Data to Improve Semi-Supervised Learning","summary":" In the domain of semi-supervised learning (SSL), the conventional approach\ninvolves training a learner with a limited amount of labeled data alongside a\nsubstantial volume of unlabeled data, both drawn from the same underlying\ndistribution. However, for deep learning models, this standard practice may not\nyield optimal results. In this research, we propose an alternative perspective,\nsuggesting that distributions that are more readily separable could offer\nsuperior benefits to the learner as compared to the original distribution. To\nachieve this, we present PruneSSL, a practical technique for selectively\nremoving examples from the original unlabeled dataset to enhance its\nseparability. We present an empirical study, showing that although PruneSSL\nreduces the quantity of available training data for the learner, it\nsignificantly improves the performance of various competitive SSL algorithms,\nthereby achieving state-of-the-art results across several image classification\ntasks.\n","authors":["Guy Hacohen","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2308.14058v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.14165v1","updated":"2023-08-27T17:58:32Z","published":"2023-08-27T17:58:32Z","title":"Distributional Off-Policy Evaluation for Slate Recommendations","summary":" Recommendation strategies are typically evaluated by using previously logged\ndata, employing off-policy evaluation methods to estimate their expected\nperformance. However, for strategies that present users with slates of multiple\nitems, the resulting combinatorial action space renders many of these methods\nimpractical. Prior work has developed estimators that leverage the structure in\nslates to estimate the expected off-policy performance, but the estimation of\nthe entire performance distribution remains elusive. Estimating the complete\ndistribution allows for a more comprehensive evaluation of recommendation\nstrategies, particularly along the axes of risk and fairness that employ\nmetrics computable from the distribution. In this paper, we propose an\nestimator for the complete off-policy performance distribution for slates and\nestablish conditions under which the estimator is unbiased and consistent. This\nbuilds upon prior work on off-policy evaluation for slates and off-policy\ndistribution estimation in reinforcement learning. We validate the efficacy of\nour method empirically on synthetic data as well as on a slate recommendation\nsimulator constructed from real-world data (MovieLens-20M). Our results show a\nsignificant reduction in estimation variance and improved sample efficiency\nover prior work across a range of slate structures.\n","authors":["Shreyas Chaudhari","David Arbour","Georgios Theocharous","Nikos Vlassis"],"pdf_url":"https://arxiv.org/pdf/2308.14165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14155v1","updated":"2023-08-27T16:43:06Z","published":"2023-08-27T16:43:06Z","title":"Only Encode Once: Making Content-based News Recommender Greener","summary":" Large pretrained language models (PLM) have become de facto news encoders in\nmodern news recommender systems, due to their strong ability in comprehending\ntextual content. These huge Transformer-based architectures, when finetuned on\nrecommendation tasks, can greatly improve news recommendation performance.\nHowever, the PLM-based pretrain-finetune framework incurs high computational\ncost and energy consumption, primarily due to the extensive redundant\nprocessing of news encoding during each training epoch. In this paper, we\npropose the ``Only Encode Once'' framework for news recommendation (OLEO), by\ndecoupling news representation learning from downstream recommendation task\nlearning. The decoupled design makes content-based news recommender as green\nand efficient as id-based ones, leading to great reduction in computational\ncost and training resources. Extensive experiments show that our OLEO framework\ncan reduce carbon emissions by up to 13 times compared with the\nstate-of-the-art pretrain-finetune framework and maintain a competitive or even\nsuperior performance level. The source code is released for reproducibility.\n","authors":["Qijiong Liu","Jieming Zhu","Quanyu Dai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.14155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.00676v2","updated":"2023-08-27T16:26:57Z","published":"2022-09-01T18:24:16Z","title":"Analyzing and visualizing polarization and balance with signed networks:\n the U.S. Congress case study","summary":" Signed networks and balance theory provide a natural setting for real-world\nscenarios that show polarization dynamics, positive/negative relationships, and\npolitical partisanship. For example, they have been proven effective in\nstudying the increasing polarization of the votes in the two chambers of the\nU.S. Congress from World War II on.\n To provide further insights into this particular case study, we propose the\napplication of a pipeline to analyze and visualize a signed graph's\nconfiguration based on the exploitation of the corresponding Laplacian matrix'\nspectral properties. The overall methodology is comparable with others based on\nthe frustration index, but it has at least two main advantages: first, it\nrequires a much lower computational cost; second, it allows for a quantitative\nand visual assessment of how arbitrarily small subgraphs (even single nodes)\ncontribute to the overall balance (or unbalance) of the network.\n The proposed pipeline allows the exploration of polarization dynamics shown\nby the U.S. Congress from 1945 to 2020 at different resolution scales. In fact,\nwe are able to spot and point out the influence of some (groups of) congressmen\nin the overall balance, as well as to observe and explore polarization's\nevolution of both chambers across the years.\n","authors":["Arthur Capozzi","Alfonso Semeraro","Giancarlo Ruffo"],"pdf_url":"https://arxiv.org/pdf/2209.00676v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07711v3","updated":"2023-08-27T11:21:38Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v3.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2304.07920v2","updated":"2023-08-27T10:09:22Z","published":"2023-04-17T00:05:52Z","title":"Causal Decision Transformer for Recommender Systems via Offline\n Reinforcement Learning","summary":" Reinforcement learning-based recommender systems have recently gained\npopularity. However, the design of the reward function, on which the agent\nrelies to optimize its recommendation policy, is often not straightforward.\nExploring the causality underlying users' behavior can take the place of the\nreward function in guiding the agent to capture the dynamic interests of users.\nMoreover, due to the typical limitations of simulation environments (e.g., data\ninefficiency), most of the work cannot be broadly applied in large-scale\nsituations. Although some works attempt to convert the offline dataset into a\nsimulator, data inefficiency makes the learning process even slower. Because of\nthe nature of reinforcement learning (i.e., learning by interaction), it cannot\ncollect enough data to train during a single interaction. Furthermore,\ntraditional reinforcement learning algorithms do not have a solid capability\nlike supervised learning methods to learn from offline datasets directly. In\nthis paper, we propose a new model named the causal decision transformer for\nrecommender systems (CDT4Rec). CDT4Rec is an offline reinforcement learning\nsystem that can learn from a dataset rather than from online interaction.\nMoreover, CDT4Rec employs the transformer architecture, which is capable of\nprocessing large offline datasets and capturing both short-term and long-term\ndependencies within the data to estimate the causal relationship between\naction, state, and reward. To demonstrate the feasibility and superiority of\nour model, we have conducted experiments on six real-world offline datasets and\none online simulator.\n","authors":["Siyu Wang","Xiaocong Chen","Dietmar Jannach","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2304.07920v2.pdf","comment":"Accepted by SIGIR'23, please check the camera-ready version for more\n details such as the implementation"},{"id":"http://arxiv.org/abs/2308.14056v1","updated":"2023-08-27T09:40:52Z","published":"2023-08-27T09:40:52Z","title":"CTR is not Enough: a Novel Reinforcement Learning based Ranking Approach\n for Optimizing Session Clicks","summary":" Ranking is a crucial module using in the recommender system. In particular,\nthe ranking module using in our YoungTao recommendation scenario is to provide\nan ordered list of items to users, to maximize the click number throughout the\nrecommendation session for each user. However, we found that the traditional\nranking method for optimizing Click-Through rate(CTR) cannot address our\nranking scenario well, since it completely ignores user leaving, and CTR is the\noptimization goal for the one-step recommendation. To effectively undertake the\npurpose of our ranking module, we propose a long-term optimization goal, named\nas CTE (Click-Through quantity expectation), for explicitly taking the behavior\nof user leaving into account. Based on CTE, we propose an effective model\ntrained by reinforcement learning. Moreover, we build a simulation environment\nfrom offline log data for estimating PBR and CTR. We conduct extensive\nexperiments on offline datasets and an online e-commerce scenario in TaoBao.\nExperimental results show that our method can boost performance effectively\n","authors":["Shaowei Liu","Yangjun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.14056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14029v1","updated":"2023-08-27T07:44:33Z","published":"2023-08-27T07:44:33Z","title":"Text Matching Improves Sequential Recommendation by Reducing Popularity\n Biases","summary":" This paper proposes Text mAtching based SequenTial rEcommendation model\n(TASTE), which maps items and users in an embedding space and recommends items\nby matching their text representations. TASTE verbalizes items and user-item\ninteractions using identifiers and attributes of items. To better characterize\nuser behaviors, TASTE additionally proposes an attention sparsity method, which\nenables TASTE to model longer user-item interactions by reducing the\nself-attention computations during encoding. Our experiments show that TASTE\noutperforms the state-of-the-art methods on widely used sequential\nrecommendation datasets. TASTE alleviates the cold start problem by\nrepresenting long-tail items using full-text modeling and bringing the benefits\nof pretrained language models to recommendation systems. Our further analyses\nillustrate that TASTE significantly improves the recommendation accuracy by\nreducing the popularity bias of previous item id based recommendation models\nand returning more appropriate and text-relevant items to satisfy users. All\ncodes are available at https://github.com/OpenMatch/TASTE.\n","authors":["Zhenghao Liu","Sen Mei","Chenyan Xiong","Xiaohua Li","Shi Yu","Zhiyuan Liu","Yu Gu","Ge Yu"],"pdf_url":"https://arxiv.org/pdf/2308.14029v1.pdf","comment":"Accepted by CIKM 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.14224v1","updated":"2023-08-27T22:59:08Z","published":"2023-08-27T22:59:08Z","title":"Modeling Player Personality Factors from In-Game Behavior and Affective\n Expression","summary":" Developing a thorough understanding of the target audience (and/or single\nindividuals) is a key factor for success - which is exceptionally important and\npowerful for the domain of video games that can not only benefit from informed\ndecision making during development, but ideally even tailor game content,\ndifficulty and player experience while playing. The granular assessment of\nindividual personality and differences across players is a particularly\ndifficult endeavor, given the highly variant human nature, disagreement in\npsychological background models and because of the effortful data collection\nthat most often builds upon long, time-consuming and deterrent questionnaires.\nIn this work, we explore possibilities to predict a series of player\npersonality questionnaire metrics from recorded in-game behavior and extend\nrelated work by explicitly adding affective dialog decisions to the game\nenvironment which could elevate the model's accuracy. Using random forest\nregression, we predicted a wide variety of personality metrics from seven\nestablished questionnaires across 62 players over 60 minute gameplay of a\ncustomized version of the role-playing game Fallout: New Vegas. While some\npersonality variables could already be identified from reasonable underlying\nin-game actions and affective expressions, we did not find ways to predict\nothers or encountered questionable correlations that could not be justified by\ntheoretical background literature. Yet, building on the initial opportunities\nof this explorative study, we are striving to massively enlarge our data set to\nplayers from an ecologically valid industrial game environment and investigate\nthe performance of more sophisticated machine learning approaches.\n","authors":["Reza Habibi","Johannes Pfau","Magy Seif El-Nasr"],"pdf_url":"https://arxiv.org/pdf/2308.14224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14220v1","updated":"2023-08-27T22:42:31Z","published":"2023-08-27T22:42:31Z","title":"On Active Learning for Gaussian Process-based Global Sensitivity\n Analysis","summary":" This paper explores the application of active learning strategies to\nadaptively learn Sobol indices for global sensitivity analysis. We demonstrate\nthat active learning for Sobol indices poses unique challenges due to the\ndefinition of the Sobol index as a ratio of variances estimated from Gaussian\nprocess surrogates. Consequently, learning strategies must either focus on\nconvergence in the numerator or the denominator of this ratio. However, rapid\nconvergence in either one does not guarantee convergence in the Sobol index. We\npropose a novel strategy for active learning that focuses on resolving the main\neffects of the Gaussian process (associated with the numerator of the Sobol\nindex) and compare this with existing strategies based on convergence in the\ntotal variance (the denominator of the Sobol index). The new strategy,\nimplemented through a new learning function termed the MUSIC (minimize\nuncertainty in Sobol index convergence), generally converges in Sobol index\nerror more rapidly than the existing strategies based on the Expected\nImprovement for Global Fit (EIGF) and the Variance Improvement for Global Fit\n(VIGF). Both strategies are compared with simple sequential random sampling and\nthe MUSIC learning function generally converges most rapidly for\nlow-dimensional problems. However, for high-dimensional problems, the\nperformance is comparable to random sampling. The new learning strategy is\ndemonstrated for a practical case of adaptive experimental design for\nlarge-scale Boundary Layer Wind Tunnel experiments.\n","authors":["Mohit Chauhan","Mariel Ojeda-Tuz","Ryan Catarelli","Kurtis Gurley","Dimitrios Tsapetis","Michael D. Shields"],"pdf_url":"https://arxiv.org/pdf/2308.14220v1.pdf","comment":"31 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.14216v1","updated":"2023-08-27T22:34:10Z","published":"2023-08-27T22:34:10Z","title":"Machine Learning for Administrative Health Records: A Systematic Review\n of Techniques and Applications","summary":" Machine learning provides many powerful and effective techniques for\nanalysing heterogeneous electronic health records (EHR). Administrative Health\nRecords (AHR) are a subset of EHR collected for administrative purposes, and\nthe use of machine learning on AHRs is a growing subfield of EHR analytics.\nExisting reviews of EHR analytics emphasise that the data-modality of the EHR\nlimits the breadth of suitable machine learning techniques, and pursuable\nhealthcare applications. Despite emphasising the importance of data modality,\nthe literature fails to analyse which techniques and applications are relevant\nto AHRs. AHRs contain uniquely well-structured, categorically encoded records\nwhich are distinct from other data-modalities captured by EHRs, and they can\nprovide valuable information pertaining to how patients interact with the\nhealthcare system.\n This paper systematically reviews AHR-based research, analysing 70 relevant\nstudies and spanning multiple databases. We identify and analyse which machine\nlearning techniques are applied to AHRs and which health informatics\napplications are pursued in AHR-based research. We also analyse how these\ntechniques are applied in pursuit of each application, and identify the\nlimitations of these approaches. We find that while AHR-based studies are\ndisconnected from each other, the use of AHRs in health informatics research is\nsubstantial and accelerating. Our synthesis of these studies highlights the\nutility of AHRs for pursuing increasingly complex and diverse research\nobjectives despite a number of pervading data- and technique-based limitations.\nFinally, through our findings, we propose a set of future research directions\nthat can enhance the utility of AHR data and machine learning techniques for\nhealth informatics research.\n","authors":["Adrian Caruana","Madhushi Bandara","Katarzyna Musial","Daniel Catchpoole","Paul J. Kennedy"],"pdf_url":"https://arxiv.org/pdf/2308.14216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14215v1","updated":"2023-08-27T22:27:57Z","published":"2023-08-27T22:27:57Z","title":"TimeTrail: Unveiling Financial Fraud Patterns through Temporal\n Correlation Analysis","summary":" In the field of financial fraud detection, understanding the underlying\npatterns and dynamics is important to ensure effective and reliable systems.\nThis research introduces a new technique, \"TimeTrail,\" which employs advanced\ntemporal correlation analysis to explain complex financial fraud patterns. The\ntechnique leverages time-related insights to provide transparent and\ninterpretable explanations for fraud detection decisions, enhancing\naccountability and trust.\n The \"TimeTrail\" methodology consists of three key phases: temporal data\nenrichment, dynamic correlation analysis, and interpretable pattern\nvisualization. Initially, raw financial transaction data is enriched with\ntemporal attributes. Dynamic correlations between these attributes are then\nquantified using innovative statistical measures. Finally, a unified\nvisualization framework presents these correlations in an interpretable manner.\nTo validate the effectiveness of \"TimeTrail,\" a study is conducted on a diverse\nfinancial dataset, surrounding various fraud scenarios. Results demonstrate the\ntechnique's capability to uncover hidden temporal correlations and patterns,\nperforming better than conventional methods in both accuracy and\ninterpretability. Moreover, a case study showcasing the application of\n\"TimeTrail\" in real-world scenarios highlights its utility for fraud detection.\n","authors":["Sushrut Ghimire"],"pdf_url":"https://arxiv.org/pdf/2308.14215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14207v1","updated":"2023-08-27T21:25:45Z","published":"2023-08-27T21:25:45Z","title":"Predictive Sparse Manifold Transform","summary":" We present Predictive Sparse Manifold Transform (PSMT), a minimalistic,\ninterpretable and biologically plausible framework for learning and predicting\nnatural dynamics. PSMT incorporates two layers where the first sparse coding\nlayer represents the input sequence as sparse coefficients over an overcomplete\ndictionary and the second manifold learning layer learns a geometric embedding\nspace that captures topological similarity and dynamic temporal linearity in\nsparse coefficients. We apply PSMT on a natural video dataset and evaluate the\nreconstruction performance with respect to contextual variability, the number\nof sparse coding basis functions and training samples. We then interpret the\ndynamic topological organization in the embedding space. We next utilize PSMT\nto predict future frames compared with two baseline methods with a static\nembedding space. We demonstrate that PSMT with a dynamic embedding space can\nachieve better prediction performance compared to static baselines. Our work\nestablishes that PSMT is an efficient unsupervised generative framework for\nprediction of future visual stimuli.\n","authors":["Yujia Xie","Xinhui Li","Vince D. Calhoun"],"pdf_url":"https://arxiv.org/pdf/2308.14207v1.pdf","comment":"Paper presented at the 1st Workshop on High-dimensional Learning\n Dynamics (HLD) at the 40th International Conference on Machine Learning\n (ICML) 2023, Honolulu, Hawaii, USA\n (https://sites.google.com/view/hidimlearning), 10 pages"},{"id":"http://arxiv.org/abs/2211.00646v2","updated":"2023-08-27T20:24:37Z","published":"2022-11-01T00:40:09Z","title":"Learning Melanocytic Cell Masks from Adjacent Stained Tissue","summary":" Melanoma is one of the most aggressive forms of skin cancer, causing a large\nproportion of skin cancer deaths. However, melanoma diagnoses by pathologists\nshows low interrater reliability. As melanoma is a cancer of the melanocyte,\nthere is a clear need to develop a melanocytic cell segmentation tool that is\nagnostic to pathologist variability and automates pixel-level annotation.\nGigapixel-level pathologist labeling, however, is impractical. Herein, we\npropose a means to train deep neural networks for melanocytic cell segmentation\nfrom hematoxylin and eosin (H&E) stained slides using paired\nimmunohistochemical (IHC) slides of adjacent tissue sections, achieving a mean\nIOU of 0.64 despite imperfect ground-truth labels.\n","authors":["Mikio Tada","Maria L. Wei","Michael J. Keiser"],"pdf_url":"https://arxiv.org/pdf/2211.00646v2.pdf","comment":"{Medical Image Learning with Limited & Noisy Data Workshop at MICCAI\n 2022"},{"id":"http://arxiv.org/abs/2003.01052v6","updated":"2023-08-27T20:04:06Z","published":"2020-03-02T17:38:38Z","title":"How to choose the most appropriate centrality measure? A decision tree\n approach","summary":" Centrality metrics play a crucial role in network analysis, while the choice\nof specific measures significantly influences the accuracy of conclusions as\neach measure represents a unique concept of node importance. Among over 400\nproposed indices, selecting the most suitable ones for specific applications\nremains a challenge. Existing approaches -- model-based, data-driven, and\naxiomatic -- have limitations, requiring association with models, training\ndatasets, or restrictive axioms for each specific application. To address this,\nwe introduce the culling method, which relies on the expert concept of\ncentrality behavior on simple graphs. The culling method involves forming a set\nof candidate measures, generating a list of as small graphs as possible needed\nto distinguish the measures from each other, constructing a decision-tree\nsurvey, and identifying the measure consistent with the expert's concept. We\napply this approach to a diverse set of 40 centralities, including novel\nkernel-based indices, and combine it with the axiomatic approach. Remarkably,\nonly 13 small 1-trees are sufficient to separate all 40 measures, even for\npairs of closely related ones. By adopting simple ordinal axioms like\nSelf-consistency or Bridge axiom, the set of measures can be drastically\nreduced making the culling survey short. Applying the culling method provides\ninsightful findings on some centrality indices, such as PageRank, Bridging, and\ndissimilarity-based Eigencentrality measures, among others. The proposed\napproach offers a cost-effective solution in terms of labor and time,\ncomplementing existing methods for measure selection, and providing deeper\ninsights into the underlying mechanisms of centrality measures.\n","authors":["Pavel Chebotarev","Dmitry Gubanov"],"pdf_url":"https://arxiv.org/pdf/2003.01052v6.pdf","comment":"12 pages, 2 tables, 1 algorithm, 8 figures. Presentation has been\n improved"},{"id":"http://arxiv.org/abs/2308.14190v1","updated":"2023-08-27T19:43:43Z","published":"2023-08-27T19:43:43Z","title":"Score-Based Generative Models for PET Image Reconstruction","summary":" Score-based generative models have demonstrated highly promising results for\nmedical image reconstruction tasks in magnetic resonance imaging or computed\ntomography. However, their application to Positron Emission Tomography (PET) is\nstill largely unexplored. PET image reconstruction involves a variety of\nchallenges, including Poisson noise with high variance and a wide dynamic\nrange. To address these challenges, we propose several PET-specific adaptations\nof score-based generative models. The proposed framework is developed for both\n2D and 3D PET. In addition, we provide an extension to guided reconstruction\nusing magnetic resonance images. We validate the approach through extensive 2D\nand 3D $\\textit{in-silico}$ experiments with a model trained on\npatient-realistic data without lesions, and evaluate on data without lesions as\nwell as out-of-distribution data with lesions. This demonstrates the proposed\nmethod's robustness and significant potential for improved PET reconstruction.\n","authors":["Imraj RD Singh","Alexander Denker","Riccardo Barbano","Željko Kereta","Bangti Jin","Kris Thielemans","Peter Maass","Simon Arridge"],"pdf_url":"https://arxiv.org/pdf/2308.14190v1.pdf","comment":"35 pages, 16 figures, submitted to Journal of Machine Learning for\n Biomedical Imaging (MELBA)"},{"id":"http://arxiv.org/abs/2305.14704v3","updated":"2023-08-27T19:22:40Z","published":"2023-05-24T04:16:56Z","title":"Practical Batch Bayesian Sampling Algorithms for Online Adaptive Traffic\n Experimentation","summary":" Online controlled experiments have emerged as industry gold standard for\nassessing new web features. As new web algorithms proliferate, experimentation\nplatform faces an increasing demand on the velocity of online experiments,\nwhich encourages adaptive traffic testing methods to speed up identifying best\nvariant by efficiently allocating traffic. This paper proposed four Bayesian\nbatch bandit algorithms (NB-TS, WB-TS, NB-TTTS, WB-TTTS) for eBay's\nexperimentation platform, using summary batch statistics of a goal metric\nwithout incurring new engineering technical debts. The novel WB-TTTS, in\nparticular, demonstrates as an efficient, trustworthy and robust alternative to\nfixed horizon A/B testing. Another novel contribution is to bring\ntrustworthiness of best arm identification algorithms into evaluation criterion\nand highlight the existence of severe false positive inflation with equivalent\nbest arms. To gain the trust of experimenters, the experimentation platform\nmust consider both efficiency and trustworthiness; However, to the best of\nauthors' knowledge, trustworthiness as an important topic is rarely discussed\nin literatures of either best arm identification or multi-armed bandit. This\npaper shows that Bayesian bandits without neutral posterior reshaping,\nparticularly naive Thompson sampling (NB-TS), are untrustworthy because they\ncan always identify an arm as best from equivalent best arms. To restore\ntrustworthiness, a novel finding uncovers connections between convergence\ndistribution of posterior optimal probabilities of equivalent best arms and\nneutral posterior reshaping, which controls false positives. Lastly, this paper\npresents lessons learned from eBay's experience, as well as evaluations of the\nfour algorithms. We hope our work is useful to other industrial practitioners\nand inspire academic researchers interested in the trustworthiness of adaptive\ntraffic experimentation.\n","authors":["Zezhong Zhang","Ted Yuan"],"pdf_url":"https://arxiv.org/pdf/2305.14704v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14181v1","updated":"2023-08-27T19:01:29Z","published":"2023-08-27T19:01:29Z","title":"Topological Augmentation for Class-Imbalanced Node Classification","summary":" Class imbalance is prevalent in real-world node classification tasks and\noften biases graph learning models toward majority classes. Most existing\nstudies root from a node-centric perspective and aim to address the class\nimbalance in training data by node/class-wise reweighting or resampling. In\nthis paper, we approach the source of the class-imbalance bias from an\nunder-explored topology-centric perspective. Our investigation reveals that\nbeyond the inherently skewed training class distribution, the graph topology\nalso plays an important role in the formation of predictive bias: we identify\ntwo fundamental challenges, namely ambivalent and distant message-passing, that\ncan exacerbate the bias by aggravating majority-class over-generalization and\nminority-class misclassification. In light of these findings, we devise a\nlightweight topological augmentation method ToBA to dynamically rectify the\nnodes influenced by ambivalent/distant message-passing during graph learning,\nso as to mitigate the class-imbalance bias. We highlight that ToBA is a\nmodel-agnostic, efficient, and versatile solution that can be seamlessly\ncombined with and further boost other imbalance-handling techniques. Systematic\nexperiments validate the superior performance of ToBA in both promoting\nimbalanced node classification and mitigating the prediction bias between\ndifferent classes.\n","authors":["Zhining Liu","Zhichen Zeng","Ruizhong Qiu","Hyunsik Yoo","David Zhou","Zhe Xu","Yada Zhu","Kommy Weldemariam","Jingrui He","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2308.14181v1.pdf","comment":"19 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14175v1","updated":"2023-08-27T18:38:09Z","published":"2023-08-27T18:38:09Z","title":"Leveraging Linear Independence of Component Classifiers: Optimizing Size\n and Prediction Accuracy for Online Ensembles","summary":" Ensembles, which employ a set of classifiers to enhance classification\naccuracy collectively, are crucial in the era of big data. However, although\nthere is general agreement that the relation between ensemble size and its\nprediction accuracy, the exact nature of this relationship is still unknown. We\nintroduce a novel perspective, rooted in the linear independence of\nclassifier's votes, to analyze the interplay between ensemble size and\nprediction accuracy. This framework reveals a theoretical link, consequently\nproposing an ensemble size based on this relationship. Our study builds upon a\ngeometric framework and develops a series of theorems. These theorems clarify\nthe role of linear dependency in crafting ensembles. We present a method to\ndetermine the minimum ensemble size required to ensure a target probability of\nlinearly independent votes among component classifiers. Incorporating real and\nsynthetic datasets, our empirical results demonstrate a trend: increasing the\nnumber of classifiers enhances accuracy, as predicted by our theoretical\ninsights. However, we also identify a point of diminishing returns, beyond\nwhich additional classifiers provide diminishing improvements in accuracy.\nSurprisingly, the calculated ideal ensemble size deviates from empirical\nresults for certain datasets, emphasizing the influence of other factors. This\nstudy opens avenues for deeper investigations into the complex dynamics\ngoverning ensemble design and offers guidance for constructing efficient and\neffective ensembles in practical scenarios.\n","authors":["Enes Bektas","Fazli Can"],"pdf_url":"https://arxiv.org/pdf/2308.14175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14174v1","updated":"2023-08-27T18:35:46Z","published":"2023-08-27T18:35:46Z","title":"Integrated Approach of Gearbox Fault Diagnosis","summary":" Gearbox fault diagnosis is one of the most important parts in any industrial\nsystems. Failure of components inside gearbox can lead to a catastrophic\nfailure, uneven breakdown, and financial losses in industrial organization. In\nthat case intelligent maintenance of the gearbox comes into context. This paper\npresents an integrated gearbox fault diagnosis approach which can easily deploy\nin online condition monitoring. This work introduces a nonparametric data\npreprocessing technique i.e., calculus enhanced energy operator (CEEO) to\npreserve the characteristics frequencies in the noisy and inferred vibrational\nsignal. A set of time domain and spectral domain features are calculated from\nthe raw and CEEO vibration signal and inputted to the multiclass support vector\nmachine (MCSVM) to diagnose the faults on the system. An effective comparison\nbetween raw signal and CEEO signal are presented to show the impact of CEEO in\ngearbox fault diagnosis. The obtained results of this work look very promising\nand can be implemented in any type of industrial system due to its\nnonparametric nature.\n","authors":["Vikash Kumar","Subrata Mukherjee","Somnath Sarangi"],"pdf_url":"https://arxiv.org/pdf/2308.14174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14172v1","updated":"2023-08-27T18:28:58Z","published":"2023-08-27T18:28:58Z","title":"Hypergraph Structure Inference From Data Under Smoothness Prior","summary":" Hypergraphs are important for processing data with higher-order relationships\ninvolving more than two entities. In scenarios where explicit hypergraphs are\nnot readily available, it is desirable to infer a meaningful hypergraph\nstructure from the node features to capture the intrinsic relations within the\ndata. However, existing methods either adopt simple pre-defined rules that fail\nto precisely capture the distribution of the potential hypergraph structure, or\nlearn a mapping between hypergraph structures and node features but require a\nlarge amount of labelled data, i.e., pre-existing hypergraph structures, for\ntraining. Both restrict their applications in practical scenarios. To fill this\ngap, we propose a novel smoothness prior that enables us to design a method to\ninfer the probability for each potential hyperedge without labelled data as\nsupervision. The proposed prior indicates features of nodes in a hyperedge are\nhighly correlated by the features of the hyperedge containing them. We use this\nprior to derive the relation between the hypergraph structure and the node\nfeatures via probabilistic modelling. This allows us to develop an unsupervised\ninference method to estimate the probability for each potential hyperedge via\nsolving an optimisation problem that has an analytical solution. Experiments on\nboth synthetic and real-world data demonstrate that our method can learn\nmeaningful hypergraph structures from data more efficiently than existing\nhypergraph structure inference methods.\n","authors":["Bohan Tang","Siheng Chen","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2308.14172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08244v2","updated":"2023-08-27T18:23:01Z","published":"2022-11-14T03:51:12Z","title":"Artificial Intelligence for Automatic Detection and Classification\n Disease on the X-Ray Images","summary":" Detecting and classifying diseases using X-ray images is one of the more\nchallenging core tasks in the medical and research world. Due to the recent\nhigh interest in radiological images and AI, early detection of diseases in\nX-ray images has become notably more essential to prevent further spreading and\nflatten the curve. Innovations and revolutions of Computer Vision with Deep\nlearning methods offer great promise for fast and accurate diagnosis of\nscreening and detection from chest X-ray images (CXR). This work presents rapid\ndetection of diseases in the lung using the efficient Deep learning pre-trained\nRepVGG algorithm for deep feature extraction and classification. We used X-ray\nimages as an example to show the model's efficiency. To perform this task, we\nclassify X-Ray images into Covid-19, Pneumonia, and Normal X-Ray images. Employ\nROI object to improve the detection accuracy for lung extraction, followed by\ndata pre-processing and augmentation. We are applying Artificial Intelligence\ntechnology for automatic highlighted detection of affected areas of people's\nlungs. Based on the X-Ray images, an algorithm was developed that classifies\nX-Ray images with height accuracy and power faster thanks to the architecture\ntransformation of the model. We compared deep learning frameworks' accuracy and\ndetection of disease. The study shows the high power of deep learning methods\nfor X-ray images based on COVID-19 detection utilizing chest X-rays. The\nproposed framework offers better diagnostic accuracy by comparing popular deep\nlearning models, i.e., VGG, ResNet50, inceptionV3, DenseNet, and\nInceptionResnetV2.\n","authors":["Liora Mayats-Alpay"],"pdf_url":"https://arxiv.org/pdf/2211.08244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14165v1","updated":"2023-08-27T17:58:32Z","published":"2023-08-27T17:58:32Z","title":"Distributional Off-Policy Evaluation for Slate Recommendations","summary":" Recommendation strategies are typically evaluated by using previously logged\ndata, employing off-policy evaluation methods to estimate their expected\nperformance. However, for strategies that present users with slates of multiple\nitems, the resulting combinatorial action space renders many of these methods\nimpractical. Prior work has developed estimators that leverage the structure in\nslates to estimate the expected off-policy performance, but the estimation of\nthe entire performance distribution remains elusive. Estimating the complete\ndistribution allows for a more comprehensive evaluation of recommendation\nstrategies, particularly along the axes of risk and fairness that employ\nmetrics computable from the distribution. In this paper, we propose an\nestimator for the complete off-policy performance distribution for slates and\nestablish conditions under which the estimator is unbiased and consistent. This\nbuilds upon prior work on off-policy evaluation for slates and off-policy\ndistribution estimation in reinforcement learning. We validate the efficacy of\nour method empirically on synthetic data as well as on a slate recommendation\nsimulator constructed from real-world data (MovieLens-20M). Our results show a\nsignificant reduction in estimation variance and improved sample efficiency\nover prior work across a range of slate structures.\n","authors":["Shreyas Chaudhari","David Arbour","Georgios Theocharous","Nikos Vlassis"],"pdf_url":"https://arxiv.org/pdf/2308.14165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14163v1","updated":"2023-08-27T17:47:30Z","published":"2023-08-27T17:47:30Z","title":"Explaining with Attribute-based and Relational Near Misses: An\n Interpretable Approach to Distinguishing Facial Expressions of Pain and\n Disgust","summary":" Explaining concepts by contrasting examples is an efficient and convenient\nway of giving insights into the reasons behind a classification decision. This\nis of particular interest in decision-critical domains, such as medical\ndiagnostics. One particular challenging use case is to distinguish facial\nexpressions of pain and other states, such as disgust, due to high similarity\nof manifestation. In this paper, we present an approach for generating\ncontrastive explanations to explain facial expressions of pain and disgust\nshown in video sequences. We implement and compare two approaches for\ncontrastive explanation generation. The first approach explains a specific pain\ninstance in contrast to the most similar disgust instance(s) based on the\noccurrence of facial expressions (attributes). The second approach takes into\naccount which temporal relations hold between intervals of facial expressions\nwithin a sequence (relations). The input to our explanation generation approach\nis the output of an interpretable rule-based classifier for pain and disgust.We\nutilize two different similarity metrics to determine near misses and far\nmisses as contrasting instances. Our results show that near miss explanations\nare shorter than far miss explanations, independent from the applied similarity\nmetric. The outcome of our evaluation indicates that pain and disgust can be\ndistinguished with the help of temporal relations. We currently plan\nexperiments to evaluate how the explanations help in teaching concepts and how\nthey could be enhanced by further modalities and interaction.\n","authors":["Bettina Finzel","Simon P. Kuhn","David E. Tafler","Ute Schmid"],"pdf_url":"https://arxiv.org/pdf/2308.14163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12856v3","updated":"2023-08-27T16:46:38Z","published":"2022-08-26T20:08:40Z","title":"Local Context-Aware Active Domain Adaptation","summary":" Active Domain Adaptation (ADA) queries the labels of a small number of\nselected target samples to help adapting a model from a source domain to a\ntarget domain. The local context of queried data is important, especially when\nthe domain gap is large. However, this has not been fully explored by existing\nADA works. In this paper, we propose a Local context-aware ADA framework, named\nLADA, to address this issue. To select informative target samples, we devise a\nnovel criterion based on the local inconsistency of model predictions. Since\nthe labeling budget is usually small, fine-tuning model on only queried data\ncan be inefficient. We progressively augment labeled target data with the\nconfident neighbors in a class-balanced manner. Experiments validate that the\nproposed criterion chooses more informative target samples than existing active\nselection strategies. Furthermore, our full method clearly surpasses recent ADA\narts on various benchmarks. Code is available at https://github.com/tsun/LADA.\n","authors":["Tao Sun","Cheng Lu","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2208.12856v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.14144v1","updated":"2023-08-27T15:57:08Z","published":"2023-08-27T15:57:08Z","title":"Learning end-to-end inversion of circular Radon transforms in the\n partial radial setup","summary":" We present a deep learning-based computational algorithm for inversion of\ncircular Radon transforms in the partial radial setup, arising in photoacoustic\ntomography. We first demonstrate that the truncated singular value\ndecomposition-based method, which is the only traditional algorithm available\nto solve this problem, leads to severe artifacts which renders the\nreconstructed field as unusable. With the objective of overcoming this\ncomputational bottleneck, we train a ResBlock based U-Net to recover the\ninferred field that directly operates on the measured data. Numerical results\nwith augmented Shepp-Logan phantoms, in the presence of noisy full and limited\nview data, demonstrate the superiority of the proposed algorithm.\n","authors":["Deep Ray","Souvik Roy"],"pdf_url":"https://arxiv.org/pdf/2308.14144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14142v1","updated":"2023-08-27T15:44:28Z","published":"2023-08-27T15:44:28Z","title":"Integrated Variational Fourier Features for Fast Spatial Modelling with\n Gaussian Processes","summary":" Sparse variational approximations are popular methods for scaling up\ninference and learning in Gaussian processes to larger datasets. For $N$\ntraining points, exact inference has $O(N^3)$ cost; with $M \\ll N$ features,\nstate of the art sparse variational methods have $O(NM^2)$ cost. Recently,\nmethods have been proposed using more sophisticated features; these promise\n$O(M^3)$ cost, with good performance in low dimensional tasks such as spatial\nmodelling, but they only work with a very limited class of kernels, excluding\nsome of the most commonly used. In this work, we propose integrated Fourier\nfeatures, which extends these performance benefits to a very broad class of\nstationary covariance functions. We motivate the method and choice of\nparameters from a convergence analysis and empirical exploration, and show\npractical speedup in synthetic and real world spatial regression tasks.\n","authors":["Talay M Cheema","Carl Edward Rasmussen"],"pdf_url":"https://arxiv.org/pdf/2308.14142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14132v1","updated":"2023-08-27T15:20:06Z","published":"2023-08-27T15:20:06Z","title":"Detecting Language Model Attacks with Perplexity","summary":" A novel hack involving Large Language Models (LLMs) has emerged, leveraging\nadversarial suffixes to trick models into generating perilous responses. This\nmethod has garnered considerable attention from reputable media outlets such as\nthe New York Times and Wired, thereby influencing public perception regarding\nthe security and safety of LLMs. In this study, we advocate the utilization of\nperplexity as one of the means to recognize such potential attacks. The\nunderlying concept behind these hacks revolves around appending an unusually\nconstructed string of text to a harmful query that would otherwise be blocked.\nThis maneuver confuses the protective mechanisms and tricks the model into\ngenerating a forbidden response. Such scenarios could result in providing\ndetailed instructions to a malicious user for constructing explosives or\norchestrating a bank heist. Our investigation demonstrates the feasibility of\nemploying perplexity, a prevalent natural language processing metric, to detect\nthese adversarial tactics before generating a forbidden response. By evaluating\nthe perplexity of queries with and without such adversarial suffixes using an\nopen-source LLM, we discovered that nearly 90 percent were above a perplexity\nof 1000. This contrast underscores the efficacy of perplexity for detecting\nthis type of exploit.\n","authors":["Gabriel Alon","Michael Kamfonas"],"pdf_url":"https://arxiv.org/pdf/2308.14132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14129v1","updated":"2023-08-27T15:11:44Z","published":"2023-08-27T15:11:44Z","title":"SPEED: Streaming Partition and Parallel Acceleration for Temporal\n Interaction Graph Embedding","summary":" Temporal Interaction Graphs (TIGs) are widely employed to model intricate\nreal-world systems such as financial systems and social networks. To capture\nthe dynamism and interdependencies of nodes, existing TIG embedding models need\nto process edges sequentially and chronologically. However, this requirement\nprevents it from being processed in parallel and struggle to accommodate\nburgeoning data volumes to GPU. Consequently, many large-scale temporal\ninteraction graphs are confined to CPU processing. Furthermore, a generalized\nGPU scaling and acceleration approach remains unavailable. To facilitate\nlarge-scale TIGs' implementation on GPUs for acceleration, we introduce a novel\ntraining approach namely Streaming Edge Partitioning and Parallel Acceleration\nfor Temporal Interaction Graph Embedding (SPEED). The SPEED is comprised of a\nStreaming Edge Partitioning Component (SEP) which addresses space overhead\nissue by assigning fewer nodes to each GPU, and a Parallel Acceleration\nComponent (PAC) which enables simultaneous training of different sub-graphs,\naddressing time overhead issue. Our method can achieve a good balance in\ncomputing resources, computing time, and downstream task performance. Empirical\nvalidation across 7 real-world datasets demonstrates the potential to expedite\ntraining speeds by a factor of up to 19.29x. Simultaneously, resource\nconsumption of a single-GPU can be diminished by up to 69%, thus enabling the\nmultiple GPU-based training and acceleration encompassing millions of nodes and\nbillions of edges. Furthermore, our approach also maintains its competitiveness\nin downstream tasks.\n","authors":["Xi Chen","Yongxiang Liao","Yun Xiong","Yao Zhang","Siwei Zhang","Jiawei Zhang","Yiheng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14129v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2206.04678v3","updated":"2023-08-27T14:35:43Z","published":"2022-06-08T17:19:55Z","title":"ReCo: A Dataset for Residential Community Layout Planning","summary":" Layout planning is centrally important in the field of architecture and urban\ndesign. Among the various basic units carrying urban functions, residential\ncommunity plays a vital part for supporting human life. Therefore, the layout\nplanning of residential community has always been of concern, and has attracted\nparticular attention since the advent of deep learning that facilitates the\nautomated layout generation and spatial pattern recognition. However, the\nresearch circles generally suffer from the insufficiency of residential\ncommunity layout benchmark or high-quality datasets, which hampers the future\nexploration of data-driven methods for residential community layout planning.\nThe lack of datasets is largely due to the difficulties of large-scale\nreal-world residential data acquisition and long-term expert screening. In\norder to address the issues and advance a benchmark dataset for various\nintelligent spatial design and analysis applications in the development of\nsmart city, we introduce Residential Community Layout Planning (ReCo) Dataset,\nwhich is the first and largest open-source vector dataset related to real-world\ncommunity to date. ReCo Dataset is presented in multiple data formats with\n37,646 residential community layout plans, covering 598,728 residential\nbuildings with height information. ReCo can be conveniently adapted for\nresidential community layout related urban design tasks, e.g., generative\nlayout design, morphological pattern recognition and spatial evaluation. To\nvalidate the utility of ReCo in automated residential community layout\nplanning, two Generative Adversarial Network (GAN) based generative models are\nfurther applied to the dataset. We expect ReCo Dataset to inspire more creative\nand practical work in intelligent design and beyond. The ReCo Dataset is\npublished at: https://www.kaggle.com/fdudsde/reco-dataset.\n","authors":["Xi Chen","Yun Xiong","Siqi Wang","Haofen Wang","Tao Sheng","Yao Zhang","Yu Ye"],"pdf_url":"https://arxiv.org/pdf/2206.04678v3.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.14120v1","updated":"2023-08-27T14:28:38Z","published":"2023-08-27T14:28:38Z","title":"Empowering Clinicians and Democratizing Data Science: Large Language\n Models Automate Machine Learning for Clinical Studies","summary":" A knowledge gap persists between Machine Learning (ML) developers (e.g., data\nscientists) and practitioners (e.g., clinicians), hampering the full\nutilization of ML for clinical data analysis. We investigated the potential of\nthe chatGPT Code Interpreter (CI), an extension of GPT-4, to bridge this gap\nand perform ML analyses efficiently. Real-world clinical datasets and study\ndetails from large trials across various medical specialties were presented to\nchatGPT CI without specific guidance. ChatGPT CI autonomously developed\nstate-of-the-art ML models based on the original study's training data to\npredict clinical outcomes such as cancer development, cancer progression,\ndisease complications, or biomarkers such as pathogenic gene sequences.\nStrikingly, these ML models matched or outperformed their published\ncounterparts. We conclude that chatGPT CI offers a promising avenue to\ndemocratize ML in medicine, making advanced analytics accessible to non-ML\nexperts and promoting broader applications in medical research and practice.\n","authors":["Soroosh Tayebi Arasteh","Tianyu Han","Mahshad Lotfinia","Christiane Kuhl","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.14120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14119v1","updated":"2023-08-27T14:25:07Z","published":"2023-08-27T14:25:07Z","title":"Semi-Supervised Learning in the Few-Shot Zero-Shot Scenario","summary":" Semi-Supervised Learning (SSL) leverages both labeled and unlabeled data to\nimprove model performance. Traditional SSL methods assume that labeled and\nunlabeled data share the same label space. However, in real-world applications,\nespecially when the labeled training set is small, there may be classes that\nare missing from the labeled set. Existing frameworks aim to either reject all\nunseen classes (open-set SSL) or to discover unseen classes by partitioning an\nunlabeled set during training (open-world SSL). In our work, we construct a\nclassifier for points from both seen and unseen classes. Our approach is based\non extending an existing SSL method, such as FlexMatch, by incorporating an\nadditional entropy loss. This enhancement allows our method to improve the\nperformance of any existing SSL method in the classification of both seen and\nunseen classes. We demonstrate large improvement gains over state-of-the-art\nSSL, open-set SSL, and open-world SSL methods, on two benchmark image\nclassification data sets, CIFAR-100 and STL-10. The gains are most pronounced\nwhen the labeled data is severely limited (1-25 labeled examples per class).\n","authors":["Noam Fluss","Guy Hacohen","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2308.14119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14114v1","updated":"2023-08-27T14:13:29Z","published":"2023-08-27T14:13:29Z","title":"Hybrid Transformer-RNN Architecture for Household Occupancy Detection\n Using Low-Resolution Smart Meter Data","summary":" Residential occupancy detection has become an enabling technology in today's\nurbanized world for various smart home applications, such as building\nautomation, energy management, and improved security and comfort.\nDigitalization of the energy system provides smart meter data that can be used\nfor occupancy detection in a non-intrusive manner without causing concerns\nregarding privacy and data security. In particular, deep learning techniques\nmake it possible to infer occupancy from low-resolution smart meter data, such\nthat the need for accurate occupancy detection with privacy preservation can be\nachieved. Our work is thus motivated to develop a privacy-aware and effective\nmodel for residential occupancy detection in contemporary living environments.\nOur model aims to leverage the advantages of both recurrent neural networks\n(RNNs), which are adept at capturing local temporal dependencies, and\ntransformers, which are effective at handling global temporal dependencies. Our\ndesigned hybrid transformer-RNN model detects residential occupancy using\nhourly smart meter data, achieving an accuracy of nearly 92\\% across households\nwith diverse profiles. We validate the effectiveness of our method using a\npublicly accessible dataset and demonstrate its performance by comparing it\nwith state-of-the-art models, including attention-based occupancy detection\nmethods.\n","authors":["Xinyu Liang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14114v1.pdf","comment":"IEEE IECON 2023 (The 49th Annual Conference of the IEEE Industrial\n Electronics Society)"},{"id":"http://arxiv.org/abs/2308.14108v1","updated":"2023-08-27T13:50:15Z","published":"2023-08-27T13:50:15Z","title":"Depth self-supervision for single image novel view synthesis","summary":" In this paper, we tackle the problem of generating a novel image from an\narbitrary viewpoint given a single frame as input. While existing methods\noperating in this setup aim at predicting the target view depth map to guide\nthe synthesis, without explicit supervision over such a task, we jointly\noptimize our framework for both novel view synthesis and depth estimation to\nunleash the synergy between the two at its best. Specifically, a shared depth\ndecoder is trained in a self-supervised manner to predict depth maps that are\nconsistent across the source and target views. Our results demonstrate the\neffectiveness of our approach in addressing the challenges of both tasks\nallowing for higher-quality generated images, as well as more accurate depth\nfor the target viewpoint.\n","authors":["Giovanni Minelli","Matteo Poggi","Samuele Salti"],"pdf_url":"https://arxiv.org/pdf/2308.14108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14104v1","updated":"2023-08-27T13:22:50Z","published":"2023-08-27T13:22:50Z","title":"Towards Generalizable Neural Solvers for Vehicle Routing Problems via\n Ensemble with Transferrable Local Policy","summary":" Machine learning has been adapted to help solve NP-hard combinatorial\noptimization problems. One prevalent way is learning to construct solutions by\ndeep neural networks, which has been receiving more and more attention due to\nthe high efficiency and less requirement for expert knowledge. However, many\nneural construction methods for Vehicle Routing Problems (VRPs) focus on\nsynthetic problem instances with limited scales and specified node\ndistributions, leading to poor performance on real-world problems which usually\ninvolve large scales together with complex and unknown node distributions. To\nmake neural VRP solvers more practical in real-world scenarios, we design an\nauxiliary policy that learns from the local transferable topological features,\nnamed local policy, and integrate it with a typical constructive policy (which\nlearns from the global information of VRP instances) to form an ensemble\npolicy. With joint training, the aggregated policies perform cooperatively and\ncomplementarily to boost generalization. The experimental results on two\nwell-known benchmarks, TSPLIB and CVRPLIB, of travelling salesman problem and\ncapacitated VRP show that the ensemble policy consistently achieves better\ngeneralization than state-of-the-art construction methods and even works well\non real-world problems with several thousand nodes.\n","authors":["Chengrui Gao","Haopu Shang","Ke Xue","Dong Li","Chao Qian"],"pdf_url":"https://arxiv.org/pdf/2308.14104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11391v2","updated":"2023-08-27T13:12:30Z","published":"2023-05-19T02:41:12Z","title":"A Survey of Safety and Trustworthiness of Large Language Models through\n the Lens of Verification and Validation","summary":" Large Language Models (LLMs) have exploded a new heatwave of AI for their\nability to engage end-users in human-level conversations with detailed and\narticulate answers across many knowledge domains. In response to their fast\nadoption in many industrial applications, this survey concerns their safety and\ntrustworthiness. First, we review known vulnerabilities and limitations of the\nLLMs, categorising them into inherent issues, attacks, and unintended bugs.\nThen, we consider if and how the Verification and Validation (V&V) techniques,\nwhich have been widely developed for traditional software and deep learning\nmodels such as convolutional neural networks as independent processes to check\nthe alignment of their implementations against the specifications, can be\nintegrated and further extended throughout the lifecycle of the LLMs to provide\nrigorous analysis to the safety and trustworthiness of LLMs and their\napplications. Specifically, we consider four complementary techniques:\nfalsification and evaluation, verification, runtime monitoring, and regulations\nand ethical use. In total, 370+ references are considered to support the quick\nunderstanding of the safety and trustworthiness issues from the perspective of\nV&V. While intensive research has been conducted to identify the safety and\ntrustworthiness issues, rigorous yet practical methods are called for to ensure\nthe alignment of LLMs with safety and trustworthiness requirements.\n","authors":["Xiaowei Huang","Wenjie Ruan","Wei Huang","Gaojie Jin","Yi Dong","Changshun Wu","Saddek Bensalem","Ronghui Mu","Yi Qi","Xingyu Zhao","Kaiwen Cai","Yanghao Zhang","Sihao Wu","Peipei Xu","Dengyu Wu","Andre Freitas","Mustafa A. Mustafa"],"pdf_url":"https://arxiv.org/pdf/2305.11391v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14093v1","updated":"2023-08-27T12:35:38Z","published":"2023-08-27T12:35:38Z","title":"The inverse problem for neural networks","summary":" We study the problem of computing the preimage of a set under a neural\nnetwork with piecewise-affine activation functions. We recall an old result\nthat the preimage of a polyhedral set is again a union of polyhedral sets and\ncan be effectively computed. We show several applications of computing the\npreimage for analysis and interpretability of neural networks.\n","authors":["Marcelo Forets","Christian Schilling"],"pdf_url":"https://arxiv.org/pdf/2308.14093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14089v1","updated":"2023-08-27T12:24:39Z","published":"2023-08-27T12:24:39Z","title":"MedAlign: A Clinician-Generated Dataset for Instruction Following with\n Electronic Medical Records","summary":" The ability of large language models (LLMs) to follow natural language\ninstructions with human-level fluency suggests many opportunities in healthcare\nto reduce administrative burden and improve quality of care. However,\nevaluating LLMs on realistic text generation tasks for healthcare remains\nchallenging. Existing question answering datasets for electronic health record\n(EHR) data fail to capture the complexity of information needs and\ndocumentation burdens experienced by clinicians. To address these challenges,\nwe introduce MedAlign, a benchmark dataset of 983 natural language instructions\nfor EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes\nclinician-written reference responses for 303 instructions, and provides 276\nlongitudinal EHRs for grounding instruction-response pairs. We used MedAlign to\nevaluate 6 general domain LLMs, having clinicians rank the accuracy and quality\nof each LLM response. We found high error rates, ranging from 35% (GPT-4) to\n68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k\ncontext lengths for GPT-4. Finally, we report correlations between clinician\nrankings and automated natural language generation metrics as a way to rank\nLLMs without human review. We make MedAlign available under a research data use\nagreement to enable LLM evaluations on tasks aligned with clinician needs and\npreferences.\n","authors":["Scott L. Fleming","Alejandro Lozano","William J. Haberkorn","Jenelle A. Jindal","Eduardo P. Reis","Rahul Thapa","Louis Blankemeier","Julian Z. Genkins","Ethan Steinberg","Ashwin Nayak","Birju S. Patel","Chia-Chun Chiang","Alison Callahan","Zepeng Huo","Sergios Gatidis","Scott J. Adams","Oluseyi Fayanju","Shreya J. Shah","Thomas Savage","Ethan Goh","Akshay S. Chaudhari","Nima Aghaeepour","Christopher Sharp","Michael A. Pfeffer","Percy Liang","Jonathan H. Chen","Keith E. Morse","Emma P. Brunskill","Jason A. Fries","Nigam H. Shah"],"pdf_url":"https://arxiv.org/pdf/2308.14089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14085v1","updated":"2023-08-27T12:16:33Z","published":"2023-08-27T12:16:33Z","title":"Sampling with flows, diffusion and autoregressive neural networks: A\n spin-glass perspective","summary":" Recent years witnessed the development of powerful generative models based on\nflows, diffusion or autoregressive neural networks, achieving remarkable\nsuccess in generating data from examples with applications in a broad range of\nareas. A theoretical analysis of the performance and understanding of the\nlimitations of these methods remain, however, challenging. In this paper, we\nundertake a step in this direction by analysing the efficiency of sampling by\nthese methods on a class of problems with a known probability distribution and\ncomparing it with the sampling performance of more traditional methods such as\nthe Monte Carlo Markov chain and Langevin dynamics. We focus on a class of\nprobability distribution widely studied in the statistical physics of\ndisordered systems that relate to spin glasses, statistical inference and\nconstraint satisfaction problems.\n We leverage the fact that sampling via flow-based, diffusion-based or\nautoregressive networks methods can be equivalently mapped to the analysis of a\nBayes optimal denoising of a modified probability measure. Our findings\ndemonstrate that these methods encounter difficulties in sampling stemming from\nthe presence of a first-order phase transition along the algorithm's denoising\npath. Our conclusions go both ways: we identify regions of parameters where\nthese methods are unable to sample efficiently, while that is possible using\nstandard Monte Carlo or Langevin approaches. We also identify regions where the\nopposite happens: standard approaches are inefficient while the discussed\ngenerative methods work well.\n","authors":["Davide Ghio","Yatin Dandi","Florent Krzakala","Lenka Zdeborová"],"pdf_url":"https://arxiv.org/pdf/2308.14085v1.pdf","comment":"39 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.03953v2","updated":"2023-08-27T11:27:13Z","published":"2023-08-07T23:44:35Z","title":"PMU measurements based short-term voltage stability assessment of power\n systems via deep transfer learning","summary":" Deep learning has emerged as an effective solution for addressing the\nchallenges of short-term voltage stability assessment (STVSA) in power systems.\nHowever, existing deep learning-based STVSA approaches face limitations in\nadapting to topological changes, sample labeling, and handling small datasets.\nTo overcome these challenges, this paper proposes a novel phasor measurement\nunit (PMU) measurements-based STVSA method by using deep transfer learning. The\nmethod leverages the real-time dynamic information captured by PMUs to create\nan initial dataset. It employs temporal ensembling for sample labeling and\nutilizes least squares generative adversarial networks (LSGAN) for data\naugmentation, enabling effective deep learning on small-scale datasets.\nAdditionally, the method enhances adaptability to topological changes by\nexploring connections between different faults. Experimental results on the\nIEEE 39-bus test system demonstrate that the proposed method improves model\nevaluation accuracy by approximately 20% through transfer learning, exhibiting\nstrong adaptability to topological changes. Leveraging the self-attention\nmechanism of the Transformer model, this approach offers significant advantages\nover shallow learning methods and other deep learning-based approaches.\n","authors":["Yang Li","Shitu Zhang","Yuanzheng Li","Jiting Cao","Shuyue Jia"],"pdf_url":"https://arxiv.org/pdf/2308.03953v2.pdf","comment":"Accepted by IEEE Transactions on Instrumentation & Measurement"},{"id":"http://arxiv.org/abs/2308.14058v1","updated":"2023-08-27T09:45:41Z","published":"2023-08-27T09:45:41Z","title":"Pruning the Unlabeled Data to Improve Semi-Supervised Learning","summary":" In the domain of semi-supervised learning (SSL), the conventional approach\ninvolves training a learner with a limited amount of labeled data alongside a\nsubstantial volume of unlabeled data, both drawn from the same underlying\ndistribution. However, for deep learning models, this standard practice may not\nyield optimal results. In this research, we propose an alternative perspective,\nsuggesting that distributions that are more readily separable could offer\nsuperior benefits to the learner as compared to the original distribution. To\nachieve this, we present PruneSSL, a practical technique for selectively\nremoving examples from the original unlabeled dataset to enhance its\nseparability. We present an empirical study, showing that although PruneSSL\nreduces the quantity of available training data for the learner, it\nsignificantly improves the performance of various competitive SSL algorithms,\nthereby achieving state-of-the-art results across several image classification\ntasks.\n","authors":["Guy Hacohen","Daphna Weinshall"],"pdf_url":"https://arxiv.org/pdf/2308.14058v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.15984v2","updated":"2023-08-27T12:45:33Z","published":"2023-07-29T13:12:40Z","title":"VATP360: Viewport Adaptive 360-Degree Video Streaming based on Tile\n Priority","summary":" 360-degree video becomes increasingly popular among users. In the current\nnetwork bandwidth, serving high resolution 360 degree video to users is quite\ndifficult. Most of the work has been devoted to the prediction of user\nviewports or tile-based adaptive algorithms. However, it is difficult to\npredict user viewports more accurately using only information such as user's\nhistorical viewports or video saliency maps. In this paper, we propose a\nviewport adaptive 360-degree video streaming method based on tile priority\n(VATP360), which tries to balance between the performance and the overhead. The\nproposed VATP360 consists of three main modules: viewport prediction, tile\npriority classification and bitrate allocation. In the viewport prediction\nmodule, object motion trajectory and predicted user's region-of-interest (ROI)\nare used to achieve accurate prediction of the user's future viewport. Then,\nthe predicted viewport, along with the object motion trajectory, are fed into\nthe proposed tile priority classification algorithm to assign different\npriorities to tiles, which would reduce the computational complexity of the\nbitrate allocation module. Finally in the bitrate allocation stage, we\nadaptively assign bitrates to tiles of different priority by reinforcement\nlearning. Experimental results on publicly available datasets have demonstrated\nthe effectiveness of the proposed method.\n","authors":["Zhiyu Pang"],"pdf_url":"https://arxiv.org/pdf/2307.15984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13998v1","updated":"2023-08-27T03:55:28Z","published":"2023-08-27T03:55:28Z","title":"Computation-efficient Deep Learning for Computer Vision: A Survey","summary":" Over the past decade, deep learning models have exhibited considerable\nadvancements, reaching or even exceeding human-level performance in a range of\nvisual perception tasks. This remarkable progress has sparked interest in\napplying deep networks to real-world applications, such as autonomous vehicles,\nmobile devices, robotics, and edge computing. However, the challenge remains\nthat state-of-the-art models usually demand significant computational\nresources, leading to impractical power consumption, latency, or carbon\nemissions in real-world scenarios. This trade-off between effectiveness and\nefficiency has catalyzed the emergence of a new research focus: computationally\nefficient deep learning, which strives to achieve satisfactory performance\nwhile minimizing the computational cost during inference. This review offers an\nextensive analysis of this rapidly evolving field by examining four key areas:\n1) the development of static or dynamic light-weighted backbone models for the\nefficient extraction of discriminative deep representations; 2) the specialized\nnetwork architectures or algorithms tailored for specific computer vision\ntasks; 3) the techniques employed for compressing deep learning models; and 4)\nthe strategies for deploying efficient deep networks on hardware platforms.\nAdditionally, we provide a systematic discussion on the critical challenges\nfaced in this domain, such as network architecture design, training schemes,\npractical efficiency, and more realistic model compression approaches, as well\nas potential future research directions.\n","authors":["Yulin Wang","Yizeng Han","Chaofei Wang","Shiji Song","Qi Tian","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.13998v1.pdf","comment":null}]},"2023-08-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.13961v1","updated":"2023-08-26T21:38:31Z","published":"2023-08-26T21:38:31Z","title":"Translate Meanings, Not Just Words: IdiomKB's Role in Optimizing\n Idiomatic Translation with Language Models","summary":" To translate well, machine translation (MT) systems and general-purposed\nlanguage models (LMs) need a deep understanding of both source and target\nlanguages and cultures. Therefore, idioms, with their non-compositional nature,\npose particular challenges for Transformer-based systems, as literal\ntranslations often miss the intended meaning. Traditional methods, which\nreplace idioms using existing knowledge bases (KBs), often lack scale and\ncontext awareness. Addressing these challenges, our approach prioritizes\ncontext awareness and scalability, allowing for offline storage of idioms in a\nmanageable KB size. This ensures efficient serving with smaller models and\nprovides a more comprehensive understanding of idiomatic expressions. We\nintroduce a multilingual idiom KB (IdiomKB) developed using large LMs to\naddress this. This KB facilitates better translation by smaller models, such as\nBLOOMZ (7.1B), Alpaca (7B), and InstructGPT (6.7B), by retrieving idioms'\nfigurative meanings. We present a novel, GPT-4-powered metric for human-aligned\nevaluation, demonstrating that IdiomKB considerably boosts model performance.\nHuman evaluations further validate our KB's quality.\n","authors":["Shuang Li","Jiangjie Chen","Siyu Yuan","Xinyi Wu","Hao Yang","Shimin Tao","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.13961v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.13958v1","updated":"2023-08-26T20:59:21Z","published":"2023-08-26T20:59:21Z","title":"Improving Knowledge Distillation for BERT Models: Loss Functions,\n Mapping Methods, and Weight Tuning","summary":" The use of large transformer-based models such as BERT, GPT, and T5 has led\nto significant advancements in natural language processing. However, these\nmodels are computationally expensive, necessitating model compression\ntechniques that reduce their size and complexity while maintaining accuracy.\nThis project investigates and applies knowledge distillation for BERT model\ncompression, specifically focusing on the TinyBERT student model. We explore\nvarious techniques to improve knowledge distillation, including experimentation\nwith loss functions, transformer layer mapping methods, and tuning the weights\nof attention and representation loss and evaluate our proposed techniques on a\nselection of downstream tasks from the GLUE benchmark. The goal of this work is\nto improve the efficiency and effectiveness of knowledge distillation, enabling\nthe development of more efficient and accurate models for a range of natural\nlanguage processing tasks.\n","authors":["Apoorv Dankar","Adeem Jassani","Kartikaeya Kumar"],"pdf_url":"https://arxiv.org/pdf/2308.13958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12057v2","updated":"2023-08-26T19:29:03Z","published":"2023-07-05T17:05:32Z","title":"External Reasoning: Towards Multi-Large-Language-Models Interchangeable\n Assistance with Human Feedback","summary":" Memory is identified as a crucial human faculty that allows for the retention\nof visual and linguistic information within the hippocampus and neurons in the\nbrain, which can subsequently be retrieved to address real-world challenges\nthat arise through a lifetime of learning. The resolution of complex AI tasks\nthrough the application of acquired knowledge represents a stride toward the\nrealization of artificial general intelligence. However, despite the prevalence\nof Large Language Models (LLMs) like GPT-3.5 and GPT-4 \\cite{brown2020language,\nleiter2023chatgpt, zaitsu2023distinguishing, OpenAI2023GPT4TR} , which have\ndisplayed remarkable capabilities in language comprehension, generation,\ninteraction, and reasoning, they are inhibited by constraints on context length\nthat preclude the processing of extensive, continually evolving knowledge\nbases. This paper proposes that LLMs could be augmented through the selective\nintegration of knowledge from external repositories, and in doing so,\nintroduces a novel methodology for External Reasoning, exemplified by ChatPDF.\nCentral to this approach is the establishment of a tiered policy for\n\\textbf{External Reasoning based on Multiple LLM Interchange Assistance} in\n\\cref{fig:overall}, where the level of support rendered is modulated across\nentry, intermediate, and advanced tiers based on the complexity of the query,\nwith adjustments made in response to human feedback. A comprehensive evaluation\nof this methodology is conducted using multiple LLMs and the results indicate\nstate-of-the-art performance in \\cref{comparison} , surpassing existing\nsolutions including ChatPDF.com. Moreover, the paper emphasizes that this\napproach is more efficient compared to the direct processing of full text by\nLLMs. The source code is publicly available at:\n\\url{https://github.com/AkideLiu/ANLP}.\n","authors":["Akide Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12057v2.pdf","comment":"technical report, add code link. arXiv admin note: text overlap with\n arXiv:2305.11206 by other authors"},{"id":"http://arxiv.org/abs/2307.02758v2","updated":"2023-08-26T16:56:27Z","published":"2023-07-06T03:43:45Z","title":"Exploring Linguistic Style Matching in Online Communities: The Role of\n Social Context and Conversation Dynamics","summary":" Linguistic style matching (LSM) in conversations can be reflective of several\naspects of social influence such as power or persuasion. However, how LSM\nrelates to the outcomes of online communication on platforms such as Reddit is\nan unknown question. In this study, we analyze a large corpus of two-party\nconversation threads in Reddit where we identify all occurrences of LSM using\ntwo types of style: the use of function words and formality. Using this\nframework, we examine how levels of LSM differ in conversations depending on\nseveral social factors within Reddit: post and subreddit features, conversation\ndepth, user tenure, and the controversiality of a comment. Finally, we measure\nthe change of LSM following loss of status after community banning. Our\nfindings reveal the interplay of LSM in Reddit conversations with several\ncommunity metrics, suggesting the importance of understanding conversation\nengagement when understanding community dynamics.\n","authors":["Aparna Ananthasubramaniam","Hong Chen","Jason Yan","Kenan Alkiek","Jiaxin Pei","Agrima Seth","Lavinia Dunagan","Minje Choi","Benjamin Litterer","David Jurgens"],"pdf_url":"https://arxiv.org/pdf/2307.02758v2.pdf","comment":"Equal contributions from authors 1-9 (AA, HC, JY, KA, JP, AS, LD, MC,\n BL)"},{"id":"http://arxiv.org/abs/2308.13916v1","updated":"2023-08-26T16:51:17Z","published":"2023-08-26T16:51:17Z","title":"Exploring Large Language Models for Knowledge Graph Completion","summary":" Knowledge graphs play a vital role in numerous artificial intelligence tasks,\nyet they frequently face the issue of incompleteness. In this study, we explore\nutilizing Large Language Models (LLM) for knowledge graph completion. We\nconsider triples in knowledge graphs as text sequences and introduce an\ninnovative framework called Knowledge Graph LLM (KG-LLM) to model these\ntriples. Our technique employs entity and relation descriptions of a triple as\nprompts and utilizes the response for predictions. Experiments on various\nbenchmark knowledge graphs demonstrate that our method attains state-of-the-art\nperformance in tasks such as triple classification and relation prediction. We\nalso find that fine-tuning relatively smaller models (e.g., LLaMA-7B,\nChatGLM-6B) outperforms recent ChatGPT and GPT-4.\n","authors":["Liang Yao","Jiazhen Peng","Chengsheng Mao","Yuan Luo"],"pdf_url":"https://arxiv.org/pdf/2308.13916v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.13911v1","updated":"2023-08-26T16:10:30Z","published":"2023-08-26T16:10:30Z","title":"A Wide Evaluation of ChatGPT on Affective Computing Tasks","summary":" With the rise of foundation models, a new artificial intelligence paradigm\nhas emerged, by simply using general purpose foundation models with prompting\nto solve problems instead of training a separate machine learning model for\neach problem. Such models have been shown to have emergent properties of\nsolving problems that they were not initially trained on. The studies for the\neffectiveness of such models are still quite limited. In this work, we widely\nstudy the capabilities of the ChatGPT models, namely GPT-4 and GPT-3.5, on 13\naffective computing problems, namely aspect extraction, aspect polarity\nclassification, opinion extraction, sentiment analysis, sentiment intensity\nranking, emotions intensity ranking, suicide tendency detection, toxicity\ndetection, well-being assessment, engagement measurement, personality\nassessment, sarcasm detection, and subjectivity detection. We introduce a\nframework to evaluate the ChatGPT models on regression-based problems, such as\nintensity ranking problems, by modelling them as pairwise ranking\nclassification. We compare ChatGPT against more traditional NLP methods, such\nas end-to-end recurrent neural networks and transformers. The results\ndemonstrate the emergent abilities of the ChatGPT models on a wide range of\naffective computing problems, where GPT-3.5 and especially GPT-4 have shown\nstrong performance on many problems, particularly the ones related to\nsentiment, emotions, or toxicity. The ChatGPT models fell short for problems\nwith implicit signals, such as engagement measurement and subjectivity\ndetection.\n","authors":["Mostafa M. Amin","Rui Mao","Erik Cambria","Björn W. Schuller"],"pdf_url":"https://arxiv.org/pdf/2308.13911v1.pdf","comment":"8 pages with references, 2 tables"},{"id":"http://arxiv.org/abs/2308.13904v1","updated":"2023-08-26T15:21:47Z","published":"2023-08-26T15:21:47Z","title":"LMSanitator: Defending Prompt-Tuning Against Task-Agnostic Backdoors","summary":" Prompt-tuning has emerged as an attractive paradigm for deploying large-scale\nlanguage models due to its strong downstream task performance and efficient\nmultitask serving ability. Despite its wide adoption, we empirically show that\nprompt-tuning is vulnerable to downstream task-agnostic backdoors, which reside\nin the pretrained models and can affect arbitrary downstream tasks. The\nstate-of-the-art backdoor detection approaches cannot defend against\ntask-agnostic backdoors since they hardly converge in reversing the backdoor\ntriggers. To address this issue, we propose LMSanitator, a novel approach for\ndetecting and removing task-agnostic backdoors on Transformer models. Instead\nof directly inversing the triggers, LMSanitator aims to inverse the predefined\nattack vectors (pretrained models' output when the input is embedded with\ntriggers) of the task-agnostic backdoors, which achieves much better\nconvergence performance and backdoor detection accuracy. LMSanitator further\nleverages prompt-tuning's property of freezing the pretrained model to perform\naccurate and fast output monitoring and input purging during the inference\nphase. Extensive experiments on multiple language models and NLP tasks\nillustrate the effectiveness of LMSanitator. For instance, LMSanitator achieves\n92.8% backdoor detection accuracy on 960 models and decreases the attack\nsuccess rate to less than 1% in most scenarios.\n","authors":["Chengkun Wei","Wenlong Meng","Zhikun Zhang","Min Chen","Minghu Zhao","Wenjing Fang","Lei Wang","Zihui Zhang","Wenzhi Chen"],"pdf_url":"https://arxiv.org/pdf/2308.13904v1.pdf","comment":"To Appear in the Network and Distributed System Security (NDSS)\n Symposium 2024, 26 February - 1 March 2024, San Diego, CA, USA"},{"id":"http://arxiv.org/abs/2307.02054v3","updated":"2023-08-26T11:02:16Z","published":"2023-07-05T06:38:52Z","title":"Emoji Prediction in Tweets using BERT","summary":" In recent years, the use of emojis in social media has increased\ndramatically, making them an important element in understanding online\ncommunication. However, predicting the meaning of emojis in a given text is a\nchallenging task due to their ambiguous nature. In this study, we propose a\ntransformer-based approach for emoji prediction using BERT, a widely-used\npre-trained language model. We fine-tuned BERT on a large corpus of text\n(tweets) containing both text and emojis to predict the most appropriate emoji\nfor a given text. Our experimental results demonstrate that our approach\noutperforms several state-of-the-art models in predicting emojis with an\naccuracy of over 75 percent. This work has potential applications in natural\nlanguage processing, sentiment analysis, and social media marketing.\n","authors":["Muhammad Osama Nusrat","Zeeshan Habib","Mehreen Alam","Saad Ahmed Jamal"],"pdf_url":"https://arxiv.org/pdf/2307.02054v3.pdf","comment":"This paper is focused on predicting emojis corresponding to tweets\n using BERT"},{"id":"http://arxiv.org/abs/2308.13844v1","updated":"2023-08-26T10:35:16Z","published":"2023-08-26T10:35:16Z","title":"Solving Math Word Problem with Problem Type Classification","summary":" Math word problems (MWPs) require analyzing text descriptions and generating\nmathematical equations to derive solutions. Existing works focus on solving\nMWPs with two types of solvers: tree-based solver and large language model\n(LLM) solver. However, these approaches always solve MWPs by a single solver,\nwhich will bring the following problems: (1) Single type of solver is hard to\nsolve all types of MWPs well. (2) A single solver will result in poor\nperformance due to over-fitting. To address these challenges, this paper\nutilizes multiple ensemble approaches to improve MWP-solving ability. Firstly,\nWe propose a problem type classifier that combines the strengths of the\ntree-based solver and the LLM solver. This ensemble approach leverages their\nrespective advantages and broadens the range of MWPs that can be solved.\nFurthermore, we also apply ensemble techniques to both tree-based solver and\nLLM solver to improve their performance. For the tree-based solver, we propose\nan ensemble learning framework based on ten-fold cross-validation and voting\nmechanism. In the LLM solver, we adopt self-consistency (SC) method to improve\nanswer selection. Experimental results demonstrate the effectiveness of these\nensemble approaches in enhancing MWP-solving ability. The comprehensive\nevaluation showcases improved performance, validating the advantages of our\nproposed approach. Our code is available at this url:\nhttps://github.com/zhouzihao501/NLPCC2023-Shared-Task3-ChineseMWP.\n","authors":["Jie Yao","Zihao Zhou","Qiufeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.13844v1.pdf","comment":"Accpected by NLPCC2023"},{"id":"http://arxiv.org/abs/2307.07851v3","updated":"2023-08-26T07:24:28Z","published":"2023-07-15T17:01:56Z","title":"AspectCSE: Sentence Embeddings for Aspect-based Semantic Textual\n Similarity Using Contrastive Learning and Structured Knowledge","summary":" Generic sentence embeddings provide a coarse-grained approximation of\nsemantic textual similarity but ignore specific aspects that make texts\nsimilar. Conversely, aspect-based sentence embeddings provide similarities\nbetween texts based on certain predefined aspects. Thus, similarity predictions\nof texts are more targeted to specific requirements and more easily\nexplainable. In this paper, we present AspectCSE, an approach for aspect-based\ncontrastive learning of sentence embeddings. Results indicate that AspectCSE\nachieves an average improvement of 3.97% on information retrieval tasks across\nmultiple aspects compared to the previous best results. We also propose using\nWikidata knowledge graph properties to train models of multi-aspect sentence\nembeddings in which multiple specific aspects are simultaneously considered\nduring similarity predictions. We demonstrate that multi-aspect embeddings\noutperform single-aspect embeddings on aspect-specific information retrieval\ntasks. Finally, we examine the aspect-based sentence embedding space and\ndemonstrate that embeddings of semantically similar aspect labels are often\nclose, even without explicit similarity training between different aspect\nlabels.\n","authors":["Tim Schopf","Emanuel Gerber","Malte Ostendorff","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.07851v3.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2308.13782v1","updated":"2023-08-26T06:28:14Z","published":"2023-08-26T06:28:14Z","title":"Planning with Logical Graph-based Language Model for Instruction\n Generation","summary":" Despite the superior performance of large language models to generate natural\nlanguage texts, it is hard to generate texts with correct logic according to a\ngiven task, due to the difficulties for neural models to capture implied rules\nfrom free-form texts. In this paper, we propose a novel graph-based language\nmodel, Logical-GLM, to infuse logic into language models for more valid text\ngeneration and interpretability. Specifically, we first capture information\nfrom natural language instructions and construct logical bayes graphs that\ngenerally describe domains. Next, we generate logical skeletons to guide\nlanguage model training, infusing domain knowledge into language models.\nFinally, we alternately optimize the searching policy of graphs and language\nmodels until convergence. The experimental results show that Logical-GLM is\nboth effective and efficient compared with traditional language models, despite\nusing smaller-scale training data and fewer parameters. Our approach can\ngenerate instructional texts with more correct logic owing to the internalized\ndomain knowledge. Moreover, the usage of logical graphs reflects the inner\nmechanism of the language models, which improves the interpretability of\nblack-box models.\n","authors":["Fan Zhang","Kebing Jin","Hankz Hankui Zhuo"],"pdf_url":"https://arxiv.org/pdf/2308.13782v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2210.11694v2","updated":"2023-08-26T05:57:06Z","published":"2022-10-21T02:44:55Z","title":"Multi-View Reasoning: Consistent Contrastive Learning for Math Word\n Problem","summary":" Math word problem solver requires both precise relation reasoning about\nquantities in the text and reliable generation for the diverse equation.\nCurrent sequence-to-tree or relation extraction methods regard this only from a\nfixed view, struggling to simultaneously handle complex semantics and diverse\nequations. However, human solving naturally involves two consistent reasoning\nviews: top-down and bottom-up, just as math equations also can be expressed in\nmultiple equivalent forms: pre-order and post-order. We propose a multi-view\nconsistent contrastive learning for a more complete semantics-to-equation\nmapping. The entire process is decoupled into two independent but consistent\nviews: top-down decomposition and bottom-up construction, and the two reasoning\nviews are aligned in multi-granularity for consistency, enhancing global\ngeneration and precise reasoning. Experiments on multiple datasets across two\nlanguages show our approach significantly outperforms the existing baselines,\nespecially on complex problems. We also show after consistent alignment,\nmulti-view can absorb the merits of both views and generate more diverse\nresults consistent with the mathematical laws.\n","authors":["Wenqi Zhang","Yongliang Shen","Yanna Ma","Xiaoxia Cheng","Zeqi Tan","Qingpeng Nong","Weiming Lu"],"pdf_url":"https://arxiv.org/pdf/2210.11694v2.pdf","comment":"14 pages, 5 figures, 3 appendix figures"},{"id":"http://arxiv.org/abs/2304.06634v2","updated":"2023-08-26T05:55:48Z","published":"2023-04-13T16:02:19Z","title":"PGTask: Introducing the Task of Profile Generation from Dialogues","summary":" Recent approaches have attempted to personalize dialogue systems by\nleveraging profile information into models. However, this knowledge is scarce\nand difficult to obtain, which makes the extraction/generation of profile\ninformation from dialogues a fundamental asset. To surpass this limitation, we\nintroduce the Profile Generation Task (PGTask). We contribute with a new\ndataset for this problem, comprising profile sentences aligned with related\nutterances, extracted from a corpus of dialogues. Furthermore, using\nstate-of-the-art methods, we provide a benchmark for profile generation on this\nnovel dataset. Our experiments disclose the challenges of profile generation,\nand we hope that this introduces a new research direction.\n","authors":["Rui Ribeiro","Joao P. Carvalho","Luísa Coheur"],"pdf_url":"https://arxiv.org/pdf/2304.06634v2.pdf","comment":"Accepted at SIGDIAL 2023, 4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.13775v1","updated":"2023-08-26T05:48:57Z","published":"2023-08-26T05:48:57Z","title":"EditSum: A Retrieve-and-Edit Framework for Source Code Summarization","summary":" Existing studies show that code summaries help developers understand and\nmaintain source code. Unfortunately, these summaries are often missing or\noutdated in software projects. Code summarization aims to generate natural\nlanguage descriptions automatically for source code. Code summaries are highly\nstructured and have repetitive patterns. Besides the patternized words, a code\nsummary also contains important keywords, which are the key to reflecting the\nfunctionality of the code. However, the state-of-the-art approaches perform\npoorly on predicting the keywords, which leads to the generated summaries\nsuffering a loss in informativeness. To alleviate this problem, this paper\nproposes a novel retrieve-and-edit approach named EditSum for code\nsummarization. Specifically, EditSum first retrieves a similar code snippet\nfrom a pre-defined corpus and treats its summary as a prototype summary to\nlearn the pattern. Then, EditSum edits the prototype automatically to combine\nthe pattern in the prototype with the semantic information of input code. Our\nmotivation is that the retrieved prototype provides a good start-point for\npost-generation because the summaries of similar code snippets often have the\nsame pattern. The post-editing process further reuses the patternized words in\nthe prototype and generates keywords based on the semantic information of input\ncode. We conduct experiments on a large-scale Java corpus and experimental\nresults demonstrate that EditSum outperforms the state-of-the-art approaches by\na substantial margin. The human evaluation also proves the summaries generated\nby EditSum are more informative and useful. We also verify that EditSum\nperforms well on predicting the patternized words and keywords.\n","authors":["Jia Allen Li","Yongmin Li","Ge Li","Xing Hu","Xin Xia","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2308.13775v1.pdf","comment":"Accepted by the 36th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2021)"},{"id":"http://arxiv.org/abs/2308.13768v1","updated":"2023-08-26T05:20:58Z","published":"2023-08-26T05:20:58Z","title":"Adversarial Fine-Tuning of Language Models: An Iterative Optimisation\n Approach for the Generation and Detection of Problematic Content","summary":" In this paper, we tackle the emerging challenge of unintended harmful content\ngeneration in Large Language Models (LLMs) with a novel dual-stage optimisation\ntechnique using adversarial fine-tuning. Our two-pronged approach employs an\nadversarial model, fine-tuned to generate potentially harmful prompts, and a\njudge model, iteratively optimised to discern these prompts. In this\nadversarial cycle, the two models seek to outperform each other in the\nprompting phase, generating a dataset of rich examples which are then used for\nfine-tuning. This iterative application of prompting and fine-tuning allows\ncontinuous refinement and improved performance. The performance of our approach\nis evaluated through classification accuracy on a dataset consisting of\nproblematic prompts not detected by GPT-4, as well as a selection of\ncontentious but unproblematic prompts. We show considerable increase in\nclassification accuracy of the judge model on this challenging dataset as it\nundergoes the optimisation process. Furthermore, we show that a rudimentary\nmodel \\texttt{ada} can achieve 13\\% higher accuracy on the hold-out test set\nthan GPT-4 after only a few rounds of this process, and that this fine-tuning\nimproves performance in parallel tasks such as toxic comment identification.\n","authors":["Charles O'Neill","Jack Miller","Ioana Ciuca","Yuan-Sen Ting","Thang Bui"],"pdf_url":"https://arxiv.org/pdf/2308.13768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13760v1","updated":"2023-08-26T04:49:46Z","published":"2023-08-26T04:49:46Z","title":"How Can Context Help? Exploring Joint Retrieval of Passage and\n Personalized Context","summary":" The integration of external personalized context information into\ndocument-grounded conversational systems has significant potential business\nvalue, but has not been well-studied. Motivated by the concept of personalized\ncontext-aware document-grounded conversational systems, we introduce the task\nof context-aware passage retrieval. We also construct a dataset specifically\ncurated for this purpose. We describe multiple baseline systems to address this\ntask, and propose a novel approach, Personalized Context-Aware Search (PCAS),\nthat effectively harnesses contextual information during passage retrieval.\nExperimental evaluations conducted on multiple popular dense retrieval systems\ndemonstrate that our proposed approach not only outperforms the baselines in\nretrieving the most relevant passage but also excels at identifying the\npertinent context among all the available contexts. We envision that our\ncontributions will serve as a catalyst for inspiring future research endeavors\nin this promising direction.\n","authors":["Hui Wan","Hongkang Li","Songtao Lu","Xiaodong Cui","Marina Danilevsky"],"pdf_url":"https://arxiv.org/pdf/2308.13760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13754v1","updated":"2023-08-26T03:48:10Z","published":"2023-08-26T03:48:10Z","title":"ZC3: Zero-Shot Cross-Language Code Clone Detection","summary":" Developers introduce code clones to improve programming productivity. Many\nexisting studies have achieved impressive performance in monolingual code clone\ndetection. However, during software development, more and more developers write\nsemantically equivalent programs with different languages to support different\nplatforms and help developers translate projects from one language to another.\nConsidering that collecting cross-language parallel data, especially for\nlow-resource languages, is expensive and time-consuming, how designing an\neffective cross-language model that does not rely on any parallel data is a\nsignificant problem. In this paper, we propose a novel method named ZC3 for\nZero-shot Cross-language Code Clone detection. ZC3 designs the contrastive\nsnippet prediction to form an isomorphic representation space among different\nprogramming languages. Based on this, ZC3 exploits domain-aware learning and\ncycle consistency learning to further constrain the model to generate\nrepresentations that are aligned among different languages meanwhile are\ndiacritical for different types of clones. To evaluate our approach, we conduct\nextensive experiments on four representative cross-language clone detection\ndatasets. Experimental results show that ZC3 outperforms the state-of-the-art\nbaselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP score, respectively.\nWe further investigate the representational distribution of different languages\nand discuss the effectiveness of our method.\n","authors":["Jia Li","Chongyang Tao","Zhi Jin","Fang Liu","Jia Allen Li","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2308.13754v1.pdf","comment":"Accepted by the 38th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2023)"},{"id":"http://arxiv.org/abs/2308.13738v1","updated":"2023-08-26T02:52:42Z","published":"2023-08-26T02:52:42Z","title":"On Philomatics and Psychomatics for Combining Philosophy and Psychology\n with Mathematics","summary":" We propose the concepts of philomatics and psychomatics as hybrid\ncombinations of philosophy and psychology with mathematics. We explain four\nmotivations for this combination which are fulfilling the desire of analytical\nphilosophy, proposing science of philosophy, justifying mathematical algorithms\nby philosophy, and abstraction in both philosophy and mathematics. We enumerate\nvarious examples for philomatics and psychomatics, some of which are explained\nin more depth. The first example is the analysis of relation between the\ncontext principle, semantic holism, and the usage theory of meaning with the\nattention mechanism in mathematics. The other example is on the relations of\nPlato's theory of forms in philosophy with the holographic principle in string\ntheory, object-oriented programming, and machine learning. Finally, the\nrelation between Wittgenstein's family resemblance and clustering in\nmathematics is explained. This paper opens the door of research for combining\nphilosophy and psychology with mathematics.\n","authors":["Benyamin Ghojogh","Morteza Babaie"],"pdf_url":"https://arxiv.org/pdf/2308.13738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18703v5","updated":"2023-08-26T02:42:49Z","published":"2023-05-30T03:00:30Z","title":"Domain Specialization as the Key to Make Large Language Models\n Disruptive: A Comprehensive Survey","summary":" Large language models (LLMs) have significantly advanced the field of natural\nlanguage processing (NLP), providing a highly useful, task-agnostic foundation\nfor a wide range of applications. However, directly applying LLMs to solve\nsophisticated problems in specific domains meets many hurdles, caused by the\nheterogeneity of domain data, the sophistication of domain knowledge, the\nuniqueness of domain objectives, and the diversity of the constraints (e.g.,\nvarious social norms, cultural conformity, religious beliefs, and ethical\nstandards in the domain applications). Domain specification techniques are key\nto make large language models disruptive in many applications. Specifically, to\nsolve these hurdles, there has been a notable increase in research and\npractices conducted in recent years on the domain specialization of LLMs. This\nemerging field of study, with its substantial potential for impact,\nnecessitates a comprehensive and systematic review to better summarize and\nguide ongoing work in this area. In this article, we present a comprehensive\nsurvey on domain specification techniques for large language models, an\nemerging direction critical for large language model applications. First, we\npropose a systematic taxonomy that categorizes the LLM domain-specialization\ntechniques based on the accessibility to LLMs and summarizes the framework for\nall the subcategories as well as their relations and differences to each other.\nSecond, we present an extensive taxonomy of critical application domains that\ncan benefit dramatically from specialized LLMs, discussing their practical\nsignificance and open challenges. Last, we offer our insights into the current\nresearch status and future trends in this area.\n","authors":["Chen Ling","Xujiang Zhao","Jiaying Lu","Chengyuan Deng","Can Zheng","Junxiang Wang","Tanmoy Chowdhury","Yun Li","Hejie Cui","Xuchao Zhang","Tianjiao Zhao","Amit Panalkar","Wei Cheng","Haoyu Wang","Yanchi Liu","Zhengzhang Chen","Haifeng Chen","Chris White","Quanquan Gu","Jian Pei","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2305.18703v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04802v2","updated":"2023-08-26T02:21:05Z","published":"2023-06-07T21:51:56Z","title":"A Survey on Knowledge Graphs for Healthcare: Resources, Applications,\n and Promises","summary":" Healthcare knowledge graphs (HKGs) have emerged as a promising tool for\norganizing medical knowledge in a structured and interpretable way, which\nprovides a comprehensive view of medical concepts and their relationships.\nHowever, challenges such as data heterogeneity and limited coverage remain,\nemphasizing the need for further research in the field of HKGs. This survey\npaper serves as the first comprehensive overview of HKGs. We summarize the\npipeline and key techniques for HKG construction (i.e., from scratch and\nthrough integration), as well as the common utilization approaches (i.e.,\nmodel-free and model-based). To provide researchers with valuable resources, we\norganize existing HKGs (The resource is available at\nhttps://github.com/lujiaying/Awesome-HealthCare-KnowledgeBase) based on the\ndata types they capture and application domains, supplemented with pertinent\nstatistical information. In the application section, we delve into the\ntransformative impact of HKGs across various healthcare domains, spanning from\nfine-grained basic science research to high-level clinical decision support.\nLastly, we shed light on the opportunities for creating comprehensive and\naccurate HKGs in the era of large language models, presenting the potential to\nrevolutionize healthcare delivery and enhance the interpretability and\nreliability of clinical prediction.\n","authors":["Hejie Cui","Jiaying Lu","Shiyu Wang","Ran Xu","Wenjing Ma","Shaojun Yu","Yue Yu","Xuan Kan","Chen Ling","Liang Zhao","Joyce Ho","Fei Wang","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2306.04802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11504v2","updated":"2023-08-26T00:33:23Z","published":"2023-03-20T23:54:26Z","title":"Language Model Behavior: A Comprehensive Survey","summary":" Transformer language models have received widespread public attention, yet\ntheir generated text is often surprising even to NLP researchers. In this\nsurvey, we discuss over 250 recent studies of English language model behavior\nbefore task-specific fine-tuning. Language models possess basic capabilities in\nsyntax, semantics, pragmatics, world knowledge, and reasoning, but these\ncapabilities are sensitive to specific inputs and surface features. Despite\ndramatic increases in generated text quality as models scale to hundreds of\nbillions of parameters, the models are still prone to unfactual responses,\ncommonsense errors, memorized text, and social biases. Many of these weaknesses\ncan be framed as over-generalizations or under-generalizations of learned\npatterns in text. We synthesize recent results to highlight what is currently\nknown about large language model capabilities, thus providing a resource for\napplied work and for research in adjacent fields that use language models.\n","authors":["Tyler A. Chang","Benjamin K. Bergen"],"pdf_url":"https://arxiv.org/pdf/2303.11504v2.pdf","comment":"32 pages, accepted to Computational Linguistics"},{"id":"http://arxiv.org/abs/2308.13715v1","updated":"2023-08-26T00:27:08Z","published":"2023-08-26T00:27:08Z","title":"A Computational Evaluation Framework for Singable Lyric Translation","summary":" Lyric translation plays a pivotal role in amplifying the global resonance of\nmusic, bridging cultural divides, and fostering universal connections.\nTranslating lyrics, unlike conventional translation tasks, requires a delicate\nbalance between singability and semantics. In this paper, we present a\ncomputational framework for the quantitative evaluation of singable lyric\ntranslation, which seamlessly integrates musical, linguistic, and cultural\ndimensions of lyrics. Our comprehensive framework consists of four metrics that\nmeasure syllable count distance, phoneme repetition similarity, musical\nstructure distance, and semantic similarity. To substantiate the efficacy of\nour framework, we collected a singable lyrics dataset, which precisely aligns\nEnglish, Japanese, and Korean lyrics on a line-by-line and section-by-section\nbasis, and conducted a comparative analysis between singable and non-singable\nlyrics. Our multidisciplinary approach provides insights into the key\ncomponents that underlie the art of lyric translation and establishes a solid\ngroundwork for the future of computational lyric translation assessment.\n","authors":["Haven Kim","Kento Watanabe","Masataka Goto","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2308.13715v1.pdf","comment":"ISMIR 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.13820v1","updated":"2023-08-26T09:02:21Z","published":"2023-08-26T09:02:21Z","title":"Video and Audio are Images: A Cross-Modal Mixer for Original Data on\n Video-Audio Retrieval","summary":" Cross-modal retrieval has become popular in recent years, particularly with\nthe rise of multimedia. Generally, the information from each modality exhibits\ndistinct representations and semantic information, which makes feature tends to\nbe in separate latent spaces encoded with dual-tower architecture and makes it\ndifficult to establish semantic relationships between modalities, resulting in\npoor retrieval performance. To address this issue, we propose a novel framework\nfor cross-modal retrieval which consists of a cross-modal mixer, a masked\nautoencoder for pre-training, and a cross-modal retriever for downstream\ntasks.In specific, we first adopt cross-modal mixer and mask modeling to fuse\nthe original modality and eliminate redundancy. Then, an encoder-decoder\narchitecture is applied to achieve a fuse-then-separate task in the\npre-training phase.We feed masked fused representations into the encoder and\nreconstruct them with the decoder, ultimately separating the original data of\ntwo modalities. In downstream tasks, we use the pre-trained encoder to build\nthe cross-modal retrieval method. Extensive experiments on 2 real-world\ndatasets show that our approach outperforms previous state-of-the-art methods\nin video-audio matching tasks, improving retrieval accuracy by up to 2 times.\nFurthermore, we prove our model performance by transferring it to other\ndownstream tasks as a universal model.\n","authors":["Zichen Yuan","Qi Shen","Bingyi Zheng","Yuting Liu","Linying Jiang","Guibing Guo"],"pdf_url":"https://arxiv.org/pdf/2308.13820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09649v2","updated":"2023-08-26T07:24:00Z","published":"2023-08-18T16:10:13Z","title":"MUSE: Music Recommender System with Shuffle Play Recommendation\n Enhancement","summary":" Recommender systems have become indispensable in music streaming services,\nenhancing user experiences by personalizing playlists and facilitating the\nserendipitous discovery of new music. However, the existing recommender systems\noverlook the unique challenges inherent in the music domain, specifically\nshuffle play, which provides subsequent tracks in a random sequence. Based on\nour observation that the shuffle play sessions hinder the overall training\nprocess of music recommender systems mainly due to the high unique transition\nrates of shuffle play sessions, we propose a Music Recommender System with\nShuffle Play Recommendation Enhancement (MUSE). MUSE employs the\nself-supervised learning framework that maximizes the agreement between the\noriginal session and the augmented session, which is augmented by our novel\nsession augmentation method, called transition-based augmentation. To further\nfacilitate the alignment of the representations between the two views, we\ndevise two fine-grained matching strategies, i.e., item- and similarity-based\nmatching strategies. Through rigorous experiments conducted across diverse\nenvironments, we demonstrate MUSE's efficacy over 12 baseline models on a\nlarge-scale Music Streaming Sessions Dataset (MSSD) from Spotify. The source\ncode of MUSE is available at \\url{https://github.com/yunhak0/MUSE}.\n","authors":["Yunhak Oh","Sukwon Yun","Dongmin Hyun","Sein Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2308.09649v2.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2308.13774v1","updated":"2023-08-26T05:43:29Z","published":"2023-08-26T05:43:29Z","title":"Central Similarity Multi-View Hashing for Multimedia Retrieval","summary":" Hash representation learning of multi-view heterogeneous data is the key to\nimproving the accuracy of multimedia retrieval. However, existing methods\nutilize local similarity and fall short of deeply fusing the multi-view\nfeatures, resulting in poor retrieval accuracy. Current methods only use local\nsimilarity to train their model. These methods ignore global similarity.\nFurthermore, most recent works fuse the multi-view features via a weighted sum\nor concatenation. We contend that these fusion methods are insufficient for\ncapturing the interaction between various views. We present a novel Central\nSimilarity Multi-View Hashing (CSMVH) method to address the mentioned problems.\nCentral similarity learning is used for solving the local similarity problem,\nwhich can utilize the global similarity between the hash center and samples. We\npresent copious empirical data demonstrating the superiority of gate-based\nfusion over conventional approaches. On the MS COCO and NUS-WIDE, the proposed\nCSMVH performs better than the state-of-the-art methods by a large margin (up\nto 11.41% mean Average Precision (mAP) improvement).\n","authors":["Jian Zhu","Wen Cheng","Yu Cui","Chang Tang","Yuyang Dai","Yong Li","Lingfang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.13774v1.pdf","comment":"accepted by the Asia Pacific Web (APWeb) and Web-Age Information\n Management (WAIM) Joint International Conference on Web and Big Data\n (APWeb-WAIM2023)"},{"id":"http://arxiv.org/abs/2308.13760v1","updated":"2023-08-26T04:49:46Z","published":"2023-08-26T04:49:46Z","title":"How Can Context Help? Exploring Joint Retrieval of Passage and\n Personalized Context","summary":" The integration of external personalized context information into\ndocument-grounded conversational systems has significant potential business\nvalue, but has not been well-studied. Motivated by the concept of personalized\ncontext-aware document-grounded conversational systems, we introduce the task\nof context-aware passage retrieval. We also construct a dataset specifically\ncurated for this purpose. We describe multiple baseline systems to address this\ntask, and propose a novel approach, Personalized Context-Aware Search (PCAS),\nthat effectively harnesses contextual information during passage retrieval.\nExperimental evaluations conducted on multiple popular dense retrieval systems\ndemonstrate that our proposed approach not only outperforms the baselines in\nretrieving the most relevant passage but also excels at identifying the\npertinent context among all the available contexts. We envision that our\ncontributions will serve as a catalyst for inspiring future research endeavors\nin this promising direction.\n","authors":["Hui Wan","Hongkang Li","Songtao Lu","Xiaodong Cui","Marina Danilevsky"],"pdf_url":"https://arxiv.org/pdf/2308.13760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13754v1","updated":"2023-08-26T03:48:10Z","published":"2023-08-26T03:48:10Z","title":"ZC3: Zero-Shot Cross-Language Code Clone Detection","summary":" Developers introduce code clones to improve programming productivity. Many\nexisting studies have achieved impressive performance in monolingual code clone\ndetection. However, during software development, more and more developers write\nsemantically equivalent programs with different languages to support different\nplatforms and help developers translate projects from one language to another.\nConsidering that collecting cross-language parallel data, especially for\nlow-resource languages, is expensive and time-consuming, how designing an\neffective cross-language model that does not rely on any parallel data is a\nsignificant problem. In this paper, we propose a novel method named ZC3 for\nZero-shot Cross-language Code Clone detection. ZC3 designs the contrastive\nsnippet prediction to form an isomorphic representation space among different\nprogramming languages. Based on this, ZC3 exploits domain-aware learning and\ncycle consistency learning to further constrain the model to generate\nrepresentations that are aligned among different languages meanwhile are\ndiacritical for different types of clones. To evaluate our approach, we conduct\nextensive experiments on four representative cross-language clone detection\ndatasets. Experimental results show that ZC3 outperforms the state-of-the-art\nbaselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP score, respectively.\nWe further investigate the representational distribution of different languages\nand discuss the effectiveness of our method.\n","authors":["Jia Li","Chongyang Tao","Zhi Jin","Fang Liu","Jia Allen Li","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2308.13754v1.pdf","comment":"Accepted by the 38th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2023)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.13879v1","updated":"2023-08-26T13:34:17Z","published":"2023-08-26T13:34:17Z","title":"The DiffuseStyleGesture+ entry to the GENEA Challenge 2023","summary":" In this paper, we introduce the DiffuseStyleGesture+, our solution for the\nGeneration and Evaluation of Non-verbal Behavior for Embodied Agents (GENEA)\nChallenge 2023, which aims to foster the development of realistic, automated\nsystems for generating conversational gestures. Participants are provided with\na pre-processed dataset and their systems are evaluated through crowdsourced\nscoring. Our proposed model, DiffuseStyleGesture+, leverages a diffusion model\nto generate gestures automatically. It incorporates a variety of modalities,\nincluding audio, text, speaker ID, and seed gestures. These diverse modalities\nare mapped to a hidden space and processed by a modified diffusion model to\nproduce the corresponding gesture for a given speech input. Upon evaluation,\nthe DiffuseStyleGesture+ demonstrated performance on par with the top-tier\nmodels in the challenge, showing no significant differences with those models\nin human-likeness, appropriateness for the interlocutor, and achieving\ncompetitive performance with the best model on appropriateness for agent\nspeech. This indicates that our model is competitive and effective in\ngenerating realistic and appropriate gestures for given speech. The code,\npre-trained models, and demos are available at\nhttps://github.com/YoungSeng/DiffuseStyleGesture/tree/DiffuseStyleGesturePlus/BEAT-TWH-main.\n","authors":["Sicheng Yang","Haiwei Xue","Zhensong Zhang","Minglei Li","Zhiyong Wu","Xiaofei Wu","Songcen Xu","Zonghong Dai"],"pdf_url":"https://arxiv.org/pdf/2308.13879v1.pdf","comment":"7 pages, 8 figures, ICMI 2023"},{"id":"http://arxiv.org/abs/2308.04156v2","updated":"2023-08-26T08:40:25Z","published":"2023-08-08T09:37:18Z","title":"Towards Top-Down Stereoscopic Image Quality Assessment via Stereo\n Attention","summary":" Stereoscopic image quality assessment (SIQA) plays a crucial role in\nevaluating and improving the visual experience of 3D content. Existing\nbinocular properties and attention-based methods for SIQA have achieved\npromising performance. However, these bottom-up approaches are inadequate in\nexploiting the inherent characteristics of the human visual system (HVS). This\npaper presents a novel network for SIQA via stereo attention, employing a\ntop-down perspective to guide the quality assessment process. Our proposed\nmethod realizes the guidance from high-level binocular signals down to\nlow-level monocular signals, while the binocular and monocular information can\nbe calibrated progressively throughout the processing pipeline. We design a\ngeneralized Stereo AttenTion (SAT) block to implement the top-down philosophy\nin stereo perception. This block utilizes the fusion-generated attention map as\na high-level binocular modulator, influencing the representation of two\nlow-level monocular features. Additionally, we introduce an Energy Coefficient\n(EC) to account for recent findings indicating that binocular responses in the\nprimate primary visual cortex are less than the sum of monocular responses. The\nadaptive EC can tune the magnitude of binocular response flexibly, thus\nenhancing the formation of robust binocular features within our framework. To\nextract the most discriminative quality information from the summation and\nsubtraction of the two branches of monocular features, we utilize a\ndual-pooling strategy that applies min-pooling and max-pooling operations to\nthe respective branches. Experimental results highlight the superiority of our\ntop-down method in simulating the property of visual perception and advancing\nthe state-of-the-art in the SIQA field. The code of this work is available at\nhttps://github.com/Fanning-Zhang/SATNet.\n","authors":["Huilin Zhang","Sumei Li","Yongli Chang"],"pdf_url":"https://arxiv.org/pdf/2308.04156v2.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.13801v1","updated":"2023-08-26T07:55:32Z","published":"2023-08-26T07:55:32Z","title":"Reinforcement Learning Based Multi-modal Feature Fusion Network for\n Novel Class Discovery","summary":" With the development of deep learning techniques, supervised learning has\nachieved performances surpassing those of humans. Researchers have designed\nnumerous corresponding models for different data modalities, achieving\nexcellent results in supervised tasks. However, with the exponential increase\nof data in multiple fields, the recognition and classification of unlabeled\ndata have gradually become a hot topic. In this paper, we employed a\nReinforcement Learning framework to simulate the cognitive processes of humans\nfor effectively addressing novel class discovery in the Open-set domain. We\ndeployed a Member-to-Leader Multi-Agent framework to extract and fuse features\nfrom multi-modal information, aiming to acquire a more comprehensive\nunderstanding of the feature space. Furthermore, this approach facilitated the\nincorporation of self-supervised learning to enhance model training. We\nemployed a clustering method with varying constraint conditions, ranging from\nstrict to loose, allowing for the generation of dependable labels for a subset\nof unlabeled data during the training phase. This iterative process is similar\nto human exploratory learning of unknown data. These mechanisms collectively\nupdate the network parameters based on rewards received from environmental\nfeedback. This process enables effective control over the extent of exploration\nlearning, ensuring the accuracy of learning in unknown data categories. We\ndemonstrate the performance of our approach in both the 3D and 2D domains by\nemploying the OS-MN40, OS-MN40-Miss, and Cifar10 datasets. Our approach\nachieves competitive competitive results.\n","authors":["Qiang Li","Qiuyang Ma","Weizhi Nie","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13801v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13774v1","updated":"2023-08-26T05:43:29Z","published":"2023-08-26T05:43:29Z","title":"Central Similarity Multi-View Hashing for Multimedia Retrieval","summary":" Hash representation learning of multi-view heterogeneous data is the key to\nimproving the accuracy of multimedia retrieval. However, existing methods\nutilize local similarity and fall short of deeply fusing the multi-view\nfeatures, resulting in poor retrieval accuracy. Current methods only use local\nsimilarity to train their model. These methods ignore global similarity.\nFurthermore, most recent works fuse the multi-view features via a weighted sum\nor concatenation. We contend that these fusion methods are insufficient for\ncapturing the interaction between various views. We present a novel Central\nSimilarity Multi-View Hashing (CSMVH) method to address the mentioned problems.\nCentral similarity learning is used for solving the local similarity problem,\nwhich can utilize the global similarity between the hash center and samples. We\npresent copious empirical data demonstrating the superiority of gate-based\nfusion over conventional approaches. On the MS COCO and NUS-WIDE, the proposed\nCSMVH performs better than the state-of-the-art methods by a large margin (up\nto 11.41% mean Average Precision (mAP) improvement).\n","authors":["Jian Zhu","Wen Cheng","Yu Cui","Chang Tang","Yuyang Dai","Yong Li","Lingfang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.13774v1.pdf","comment":"accepted by the Asia Pacific Web (APWeb) and Web-Age Information\n Management (WAIM) Joint International Conference on Web and Big Data\n (APWeb-WAIM2023)"}]},"2023-08-29T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2201.06313v4","updated":"2023-08-29T17:54:26Z","published":"2022-01-17T09:54:35Z","title":"A Deep Convolutional Neural Networks Based Multi-Task Ensemble Model for\n Aspect and Polarity Classification in Persian Reviews","summary":" Aspect-based sentiment analysis is of great importance and application\nbecause of its ability to identify all aspects discussed in the text. However,\naspect-based sentiment analysis will be most effective when, in addition to\nidentifying all the aspects discussed in the text, it can also identify their\npolarity. Most previous methods use the pipeline approach, that is, they first\nidentify the aspects and then identify the polarities. Such methods are\nunsuitable for practical applications since they can lead to model errors.\nTherefore, in this study, we propose a multi-task learning model based on\nConvolutional Neural Networks (CNNs), which can simultaneously detect aspect\ncategory and detect aspect category polarity. creating a model alone may not\nprovide the best predictions and lead to errors such as bias and high variance.\nTo reduce these errors and improve the efficiency of model predictions,\ncombining several models known as ensemble learning may provide better results.\nTherefore, the main purpose of this article is to create a model based on an\nensemble of multi-task deep convolutional neural networks to enhance sentiment\nanalysis in Persian reviews. We evaluated the proposed method using a Persian\nlanguage dataset in the movie domain. Jacquard index and Hamming loss measures\nwere used to evaluate the performance of the developed models. The results\nindicate that this new approach increases the efficiency of the sentiment\nanalysis model in the Persian language.\n","authors":["Milad Vazan","Fatemeh Sadat Masoumi","Sepideh Saeedi Majd"],"pdf_url":"https://arxiv.org/pdf/2201.06313v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14120v2","updated":"2023-08-29T17:52:02Z","published":"2023-08-27T14:28:38Z","title":"Empowering Clinicians and Democratizing Data Science: Large Language\n Models Automate Machine Learning for Clinical Studies","summary":" A knowledge gap persists between Machine Learning (ML) developers (e.g., data\nscientists) and practitioners (e.g., clinicians), hampering the full\nutilization of ML for clinical data analysis. We investigated the potential of\nthe chatGPT Advanced Data Analysis (ADA), an extension of GPT-4, to bridge this\ngap and perform ML analyses efficiently. Real-world clinical datasets and study\ndetails from large trials across various medical specialties were presented to\nchatGPT ADA without specific guidance. ChatGPT ADA autonomously developed\nstate-of-the-art ML models based on the original study's training data to\npredict clinical outcomes such as cancer development, cancer progression,\ndisease complications, or biomarkers such as pathogenic gene sequences.\nStrikingly, these ML models matched or outperformed their published\ncounterparts. We conclude that chatGPT ADA offers a promising avenue to\ndemocratize ML in medicine, making advanced analytics accessible to non-ML\nexperts and promoting broader applications in medical research and practice.\n","authors":["Soroosh Tayebi Arasteh","Tianyu Han","Mahshad Lotfinia","Christiane Kuhl","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.14120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15459v1","updated":"2023-08-29T17:36:02Z","published":"2023-08-29T17:36:02Z","title":"ParaGuide: Guided Diffusion Paraphrasers for Plug-and-Play Textual Style\n Transfer","summary":" Textual style transfer is the task of transforming stylistic properties of\ntext while preserving meaning. Target \"styles\" can be defined in numerous ways,\nranging from single attributes (e.g, formality) to authorship (e.g,\nShakespeare). Previous unsupervised style-transfer approaches generally rely on\nsignificant amounts of labeled data for only a fixed set of styles or require\nlarge language models. In contrast, we introduce a novel diffusion-based\nframework for general-purpose style transfer that can be flexibly adapted to\narbitrary target styles at inference time. Our parameter-efficient approach,\nParaGuide, leverages paraphrase-conditioned diffusion models alongside\ngradient-based guidance from both off-the-shelf classifiers and strong existing\nstyle embedders to transform the style of text while preserving semantic\ninformation. We validate the method on the Enron Email Corpus, with both human\nand automatic evaluations, and find that it outperforms strong baselines on\nformality, sentiment, and even authorship style transfer.\n","authors":["Zachary Horvitz","Ajay Patel","Chris Callison-Burch","Zhou Yu","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2308.15459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15452v1","updated":"2023-08-29T17:22:39Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" The reasoning capabilities of Large Language Models (LLMs) play a pivotal\nrole in the realm of embodied artificial intelligence. Although there are\neffective methods like program-of-thought prompting for LLMs which uses\nprogramming language to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find not all code data of complexity can be learned or understood\nby LLMs. Optimal level of complexity is critical to the improvement of\nreasoning abilities by program-aided prompting. Then we design an\nauto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrates the effectiveness of our\nproposed approach. Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.15448v1","updated":"2023-08-29T17:19:32Z","published":"2023-08-29T17:19:32Z","title":"Vulgar Remarks Detection in Chittagonian Dialect of Bangla","summary":" The negative effects of online bullying and harassment are increasing with\nInternet popularity, especially in social media. One solution is using natural\nlanguage processing (NLP) and machine learning (ML) methods for the automatic\ndetection of harmful remarks, but these methods are limited in low-resource\nlanguages like the Chittagonian dialect of Bangla.This study focuses on\ndetecting vulgar remarks in social media using supervised ML and deep learning\nalgorithms.Logistic Regression achieved promising accuracy (0.91) while simple\nRNN with Word2vec and fastTex had lower accuracy (0.84-0.90), highlighting the\nissue that NN algorithms require more data.\n","authors":["Tanjim Mahmud","Michal Ptaszynski","Fumito Masui"],"pdf_url":"https://arxiv.org/pdf/2308.15448v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2306.08018v2","updated":"2023-08-29T17:13:05Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a meticulously curated, comprehensive\ninstruction dataset expressly designed for the biomolecular realm.\nMol-Instructions is composed of three pivotal components: molecule-oriented\ninstructions, protein-oriented instructions, and biomolecular text\ninstructions, each curated to enhance the understanding and prediction\ncapabilities of LLMs concerning biomolecular features and behaviors. Through\nextensive instruction tuning experiments on the representative LLM, we\nunderscore the potency of Mol-Instructions to enhance the adaptability and\ncognitive acuity of large models within the complex sphere of biomolecular\nstudies, thereby promoting advancements in the biomolecular research community.\nMol-Instructions is made publicly accessible for future research endeavors and\nwill be subjected to continual updates for enhanced applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v2.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions. Add\n quantitative evaluations"},{"id":"http://arxiv.org/abs/2112.09153v2","updated":"2023-08-29T17:04:19Z","published":"2021-12-16T19:00:55Z","title":"An Empirical Investigation of the Role of Pre-training in Lifelong\n Learning","summary":" The lifelong learning paradigm in machine learning is an attractive\nalternative to the more prominent isolated learning scheme not only due to its\nresemblance to biological learning but also its potential to reduce energy\nwaste by obviating excessive model re-training. A key challenge to this\nparadigm is the phenomenon of catastrophic forgetting. With the increasing\npopularity and success of pre-trained models in machine learning, we pose the\nquestion: What role does pre-training play in lifelong learning, specifically\nwith respect to catastrophic forgetting? We investigate existing methods in the\ncontext of large, pre-trained models and evaluate their performance on a\nvariety of text and image classification tasks, including a large-scale study\nusing a novel data set of 15 diverse NLP tasks. Across all settings, we observe\nthat generic pre-training implicitly alleviates the effects of catastrophic\nforgetting when learning multiple tasks sequentially compared to randomly\ninitialized models. We then further investigate why pre-training alleviates\nforgetting in this setting. We study this phenomenon by analyzing the loss\nlandscape, finding that pre-trained weights appear to ease forgetting by\nleading to wider minima. Based on this insight, we propose jointly optimizing\nfor current task loss and loss basin sharpness to explicitly encourage wider\nbasins during sequential fine-tuning. We show that this optimization approach\noutperforms several state-of-the-art task-sequential continual learning\nalgorithms across multiple settings, occasionally even without retaining a\nmemory that scales in size with the number of tasks.\n","authors":["Sanket Vaibhav Mehta","Darshan Patil","Sarath Chandar","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2112.09153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07740v2","updated":"2023-08-29T16:55:11Z","published":"2023-07-15T08:08:38Z","title":"Political Sentiment Analysis of Persian Tweets Using CNN-LSTM Model","summary":" Sentiment analysis is the process of identifying and categorizing people's\nemotions or opinions regarding various topics. The analysis of Twitter\nsentiment has become an increasingly popular topic in recent years. In this\npaper, we present several machine learning and a deep learning model to\nanalysis sentiment of Persian political tweets. Our analysis was conducted\nusing Bag of Words and ParsBERT for word representation. We applied Gaussian\nNaive Bayes, Gradient Boosting, Logistic Regression, Decision Trees, Random\nForests, as well as a combination of CNN and LSTM to classify the polarities of\ntweets. The results of this study indicate that deep learning with ParsBERT\nembedding performs better than machine learning. The CNN-LSTM model had the\nhighest classification accuracy with 89 percent on the first dataset and 71\npercent on the second dataset. Due to the complexity of Persian, it was a\ndifficult task to achieve this level of efficiency. The main objective of our\nresearch was to reduce the training time while maintaining the model's\nperformance. As a result, several adjustments were made to the model\narchitecture and parameters. In addition to achieving the objective, the\nperformance was slightly improved as well.\n","authors":["Mohammad Dehghani","Zahra Yazdanparast"],"pdf_url":"https://arxiv.org/pdf/2307.07740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15419v1","updated":"2023-08-29T16:24:09Z","published":"2023-08-29T16:24:09Z","title":"Characterizing Learning Curves During Language Model Pre-Training:\n Learning, Forgetting, and Stability","summary":" How do language models learn to make predictions during pre-training? To\nstudy this question, we extract learning curves from five autoregressive\nEnglish language model pre-training runs, for 1M tokens in context. We observe\nthat the language models generate short repetitive phrases before learning to\ngenerate longer and more coherent text. We quantify the final surprisal,\nwithin-run variability, age of acquisition, forgettability, and cross-run\nvariability of learning curves for individual tokens in context. More frequent\ntokens reach lower final surprisals, exhibit less variability within and across\npre-training runs, are learned earlier, and are less likely to be \"forgotten\"\nduring pre-training. Higher n-gram probabilities further accentuate these\neffects. Independent of the target token, shorter and more frequent contexts\ncorrelate with marginally more stable and quickly acquired predictions. Effects\nof part-of-speech are also small, although nouns tend to be acquired later and\nless stably than verbs, adverbs, and adjectives. Our work contributes to a\nbetter understanding of language model pre-training dynamics and informs the\ndeployment of stable language models in practice.\n","authors":["Tyler A. Chang","Zhuowen Tu","Benjamin K. Bergen"],"pdf_url":"https://arxiv.org/pdf/2308.15419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15399v1","updated":"2023-08-29T15:57:32Z","published":"2023-08-29T15:57:32Z","title":"Rethinking Machine Ethics -- Can LLMs Perform Moral Reasoning through\n the Lens of Moral Theories?","summary":" Making moral judgments is an essential step toward developing ethical AI\nsystems. Prevalent approaches are mostly implemented in a bottom-up manner,\nwhich uses a large set of annotated data to train models based on crowd-sourced\nopinions about morality. These approaches have been criticized for potentially\novergeneralizing a limited group of annotators' moral stances and lacking\nexplainability. In contrast, top-down approaches make moral judgments grounded\nin a set of principles. However, it remains conceptual due to the incapability\nof previous language models and the unsolved debate among moral principles. In\nthis study, we propose a flexible framework to steer Large Language Models\n(LLMs) to perform moral reasoning with well-established moral theories from\ninterdisciplinary research. The theory-guided top-down framework can\nincorporate various moral theories. Our experiments demonstrate the\neffectiveness of the proposed framework on datasets derived from moral\ntheories. Furthermore, we show the alignment between different moral theories\nand existing morality datasets. Our analysis exhibits the potentials and flaws\nin existing resources (models and datasets) in developing explainable moral\njudgment-making systems.\n","authors":["Jingyan Zhou","Minda Hu","Junan Li","Xiaoying Zhang","Xixin Wu","Irwin King","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.15399v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2308.12896v2","updated":"2023-08-29T15:57:02Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.}\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v2.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2308.14641v2","updated":"2023-08-29T15:48:23Z","published":"2023-08-28T15:12:34Z","title":"Challenges of GPT-3-based Conversational Agents for Healthcare","summary":" The potential to provide patients with faster information access while\nallowing medical specialists to concentrate on critical tasks makes medical\ndomain dialog agents appealing. However, the integration of large-language\nmodels (LLMs) into these agents presents certain limitations that may result in\nserious consequences. This paper investigates the challenges and risks of using\nGPT-3-based models for medical question-answering (MedQA). We perform several\nevaluations contextualized in terms of standard medical principles. We provide\na procedure for manually designing patient queries to stress-test high-risk\nlimitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to\nrespond adequately to these queries, generating erroneous medical information,\nunsafe recommendations, and content that may be considered offensive.\n","authors":["Fabian Lechner","Allison Lahnala","Charles Welch","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2308.14641v2.pdf","comment":"12 pages, 9 Tables, accepted to RANLP 2023"},{"id":"http://arxiv.org/abs/2304.11073v2","updated":"2023-08-29T15:02:08Z","published":"2023-04-20T09:30:50Z","title":"OLISIA: a Cascade System for Spoken Dialogue State Tracking","summary":" Though Dialogue State Tracking (DST) is a core component of spoken dialogue\nsystems, recent work on this task mostly deals with chat corpora, disregarding\nthe discrepancies between spoken and written language.In this paper, we propose\nOLISIA, a cascade system which integrates an Automatic Speech Recognition (ASR)\nmodel and a DST model. We introduce several adaptations in the ASR and DST\nmodules to improve integration and robustness to spoken conversations.With\nthese adaptations, our system ranked first in DSTC11 Track 3, a benchmark to\nevaluate spoken DST. We conduct an in-depth analysis of the results and find\nthat normalizing the ASR outputs and adapting the DST inputs through data\naugmentation, along with increasing the pre-trained models size all play an\nimportant role in reducing the performance discrepancy between written and\nspoken conversations.\n","authors":["Léo Jacqmin","Lucas Druart","Yannick Estève","Benoît Favre","Lina Maria Rojas-Barahona","Valentin Vielzeuf"],"pdf_url":"https://arxiv.org/pdf/2304.11073v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15363v1","updated":"2023-08-29T14:59:54Z","published":"2023-08-29T14:59:54Z","title":"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation","summary":" Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL\ntask. However, the absence of a systematical benchmark inhibits the development\nof designing effective, efficient and economic LLM-based Text-to-SQL solutions.\nTo address this challenge, in this paper, we first conduct a systematical and\nextensive comparison over existing prompt engineering methods, including\nquestion representation, example selection and example organization, and with\nthese experimental results, we elaborates their pros and cons. Based on these\nfindings, we propose a new integrated solution, named DAIL-SQL, which refreshes\nthe Spider leaderboard with 86.6% execution accuracy and sets a new bar.\nTowards an efficient and economic LLM-based Text-to-SQL solution, we emphasize\nthe token efficiency in prompt engineering and compare the prior studies under\nthis metric. Additionally, we investigate open-source LLMs in in-context\nlearning, and further enhance their performance with task-specific supervised\nfine-tuning. Our explorations highlight open-source LLMs' potential in\nText-to-SQL, as well as the advantages and disadvantages of the task-specific\nsupervised fine-tuning. We hope that our work provides a deeper understanding\nof Text-to-SQL with LLMs, and inspire further investigations and broad\napplications.\n","authors":["Dawei Gao","Haibin Wang","Yaliang Li","Xiuyu Sun","Yichen Qian","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15363v1.pdf","comment":"We have released code on https://github.com/BeachWang/DAIL-SQL"},{"id":"http://arxiv.org/abs/2302.02083v4","updated":"2023-08-29T14:55:37Z","published":"2023-02-04T03:50:01Z","title":"Theory of Mind Might Have Spontaneously Emerged in Large Language Models","summary":" We explore the intriguing possibility that theory of mind (ToM), or the\nuniquely human ability to impute unobservable mental states to others, might\nhave spontaneously emerged in large language models (LLMs). We designed 40\nfalse-belief tasks, considered a gold standard in testing ToM in humans, and\nadministered them to several LLMs. Each task included a false-belief scenario,\nthree closely matched true-belief controls, and the reversed versions of all\nfour. Smaller and older models solved no tasks; GPT-3-davinci-001 (from May\n2020) and GPT-3-davinci-002 (from January 2022) solved 10%; and\nGPT-3-davinci-003 (from November 2022) and ChatGPT-3.5-turbo (from March 2023)\nsolved 35% of the tasks, mirroring the performance of three-year-old children.\nChatGPT-4 (from June 2023) solved 90% of the tasks, matching the performance of\nseven-year-old children. These findings suggest the intriguing possibility that\nToM, previously considered exclusive to humans, may have spontaneously emerged\nas a byproduct of LLMs' improving language skills.\n","authors":["Michal Kosinski"],"pdf_url":"https://arxiv.org/pdf/2302.02083v4.pdf","comment":"TRY RUNNING ToM EXPERIMENTS ON YOUR OWN: The code and tasks used in\n this study are available at Colab\n (https://colab.research.google.com/drive/1ZRtmw87CdA4xp24DNS_Ik_uA2ypaRnoU).\n Don't worry if you are not an expert coder, you should be able to run this\n code with no-to-minimum Python skills. Or copy-paste the tasks to ChatGPT's\n web interface"},{"id":"http://arxiv.org/abs/2308.15352v1","updated":"2023-08-29T14:47:08Z","published":"2023-08-29T14:47:08Z","title":"Historical patterns of rice farming explain modern-day language use in\n China and Japan more than modernization and urbanization","summary":" We used natural language processing to analyze a billion words to study\ncultural differences on Weibo, one of China's largest social media platforms.\nWe compared predictions from two common explanations about cultural differences\nin China (economic development and urban-rural differences) against the\nless-obvious legacy of rice versus wheat farming. Rice farmers had to\ncoordinate shared irrigation networks and exchange labor to cope with higher\nlabor requirements. In contrast, wheat relied on rainfall and required half as\nmuch labor. We test whether this legacy made southern China more\ninterdependent. Across all word categories, rice explained twice as much\nvariance as economic development and urbanization. Rice areas used more words\nreflecting tight social ties, holistic thought, and a cautious, prevention\norientation. We then used Twitter data comparing prefectures in Japan, which\nlargely replicated the results from China. This provides crucial evidence of\nthe rice theory in a different nation, language, and platform.\n","authors":["Sharath Chandra Guntuku","Thomas Talhelm","Garrick Sherman","Angel Fan","Salvatore Giorgi","Liuqing Wei","Lyle H. Ungar"],"pdf_url":"https://arxiv.org/pdf/2308.15352v1.pdf","comment":"Includes Supplemental Materials"},{"id":"http://arxiv.org/abs/2308.15334v1","updated":"2023-08-29T14:29:57Z","published":"2023-08-29T14:29:57Z","title":"A Framework for Responsible Development of Automated Student Feedback\n with Generative AI","summary":" Providing rich feedback to students is essential for supporting student\nlearning. Recent advances in generative AI, particularly within large language\nmodelling (LLM), provide the opportunity to deliver repeatable, scalable and\ninstant automatically generated feedback to students, making abundant a\npreviously scarce and expensive learning resource. Such an approach is feasible\nfrom a technical perspective due to these recent advances in Artificial\nIntelligence (AI) and Natural Language Processing (NLP); while the potential\nupside is a strong motivator, doing so introduces a range of potential ethical\nissues that must be considered as we apply these technologies. The\nattractiveness of AI systems is that they can effectively automate the most\nmundane tasks; but this risks introducing a \"tyranny of the majority\", where\nthe needs of minorities in the long tail are overlooked because they are\ndifficult to automate.\n Developing machine learning models that can generate valuable and authentic\nfeedback requires the input of human domain experts. The choices we make in\ncapturing this expertise -- whose, which, when, and how -- will have\nsignificant consequences for the nature of the resulting feedback. How we\nmaintain our models will affect how that feedback remains relevant given\ntemporal changes in context, theory, and prior learning profiles of student\ncohorts. These questions are important from an ethical perspective; but they\nare also important from an operational perspective. Unless they can be\nanswered, our AI generated systems will lack the trust necessary for them to be\nuseful features in the contemporary learning environment.\n This article will outline the frontiers of automated feedback, identify the\nethical issues involved in the provision of automated feedback and present a\nframework to assist academics to develop such systems responsibly.\n","authors":["Euan D Lindsay","Aditya Johri","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2308.15334v1.pdf","comment":"10 pages, under review at IEEE TLT"},{"id":"http://arxiv.org/abs/2308.04645v2","updated":"2023-08-29T14:09:49Z","published":"2023-08-09T01:02:06Z","title":"Cross-Lingual Constituency Parsing for Middle High German: A\n Delexicalized Approach","summary":" Constituency parsing plays a fundamental role in advancing natural language\nprocessing (NLP) tasks. However, training an automatic syntactic analysis\nsystem for ancient languages solely relying on annotated parse data is a\nformidable task due to the inherent challenges in building treebanks for such\nlanguages. It demands extensive linguistic expertise, leading to a scarcity of\navailable resources. To overcome this hurdle, cross-lingual transfer techniques\nwhich require minimal or even no annotated data for low-resource target\nlanguages offer a promising solution. In this study, we focus on building a\nconstituency parser for $\\mathbf{M}$iddle $\\mathbf{H}$igh $\\mathbf{G}$erman\n($\\mathbf{MHG}$) under realistic conditions, where no annotated MHG treebank is\navailable for training. In our approach, we leverage the linguistic continuity\nand structural similarity between MHG and $\\mathbf{M}$odern $\\mathbf{G}$erman\n($\\mathbf{MG}$), along with the abundance of MG treebank resources.\nSpecifically, by employing the $\\mathit{delexicalization}$ method, we train a\nconstituency parser on MG parse datasets and perform cross-lingual transfer to\nMHG parsing. Our delexicalized constituency parser demonstrates remarkable\nperformance on the MHG test set, achieving an F1-score of 67.3%. It outperforms\nthe best zero-shot cross-lingual baseline by a margin of 28.6% points. These\nencouraging results underscore the practicality and potential for automatic\nsyntactic analysis in other ancient languages that face similar challenges as\nMHG.\n","authors":["Ercong Nie","Helmut Schmid","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2308.04645v2.pdf","comment":"Accepted to ALP 2023"},{"id":"http://arxiv.org/abs/2305.13862v2","updated":"2023-08-29T13:55:13Z","published":"2023-05-23T09:35:37Z","title":"A Trip Towards Fairness: Bias and De-Biasing in Large Language Models","summary":" Cheap-to-Build Very Large-Language Models (CtB-LLMs) with affordable training\nare emerging as the next big revolution in natural language processing and\nunderstanding. These CtB-LLMs are democratizing access to trainable Very\nLarge-Language Models (VLLMs) and, thus, may represent the building blocks of\nmany NLP systems solving downstream tasks. Hence, a little or a large bias in\nCtB-LLMs may cause huge harm. In this paper, we performed a large investigation\nof the bias of three families of CtB-LLMs, and we showed that debiasing\ntechniques are effective and usable. Indeed, according to current tests, the\nLLaMA and the OPT families have an important bias in gender, race, religion,\nand profession. In contrast to the analysis for other LLMs, we discovered that\nbias depends not on the number of parameters but on the perplexity. Finally,\nthe debiasing of OPT using LoRA reduces bias up to 4.12 points in the\nnormalized stereotype score.\n","authors":["Leonardo Ranaldi","Elena Sofia Ruzzetti","Davide Venditti","Dario Onorati","Fabio Massimo Zanzotto"],"pdf_url":"https://arxiv.org/pdf/2305.13862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15299v1","updated":"2023-08-29T13:36:45Z","published":"2023-08-29T13:36:45Z","title":"TaskLAMA: Probing the Complex Task Understanding of Language Models","summary":" Structured Complex Task Decomposition (SCTD) is the problem of breaking down\na complex real-world task (such as planning a wedding) into a directed acyclic\ngraph over individual steps that contribute to achieving the task, with edges\nspecifying temporal dependencies between them. SCTD is an important component\nof assistive planning tools, and a challenge for commonsense reasoning systems.\nWe probe how accurately SCTD can be done with the knowledge extracted from\nLarge Language Models (LLMs). We introduce a high-quality human-annotated\ndataset for this problem and novel metrics to fairly assess performance of LLMs\nagainst several baselines. Our experiments reveal that LLMs are able to\ndecompose complex tasks into individual steps effectively, with a relative\nimprovement of 15% to 280% over the best baseline. We also propose a number of\napproaches to further improve their performance, with a relative improvement of\n7% to 37% over the base model. However, we find that LLMs still struggle to\npredict pairwise temporal dependencies, which reveals a gap in their\nunderstanding of complex tasks.\n","authors":["Quan Yuan","Mehran Kazemi","Xin Xu","Isaac Noble","Vaiva Imbrasaite","Deepak Ramachandran"],"pdf_url":"https://arxiv.org/pdf/2308.15299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15298v1","updated":"2023-08-29T13:35:51Z","published":"2023-08-29T13:35:51Z","title":"KGConv, a Conversational Corpus grounded in Wikidata","summary":" We present KGConv, a large, conversational corpus of 71k conversations where\neach question-answer pair is grounded in a Wikidata fact. Conversations contain\non average 8.6 questions and for each Wikidata fact, we provide multiple\nvariants (12 on average) of the corresponding question using templates, human\nannotations, hand-crafted rules and a question rewriting neural model. We\nprovide baselines for the task of Knowledge-Based, Conversational Question\nGeneration. KGConv can further be used for other generation and analysis tasks\nsuch as single-turn question generation from Wikidata triples, question\nrewriting, question answering from conversation or from knowledge graphs and\nquiz generation.\n","authors":["Quentin Brabant","Gwenole Lecorve","Lina M. Rojas-Barahona","Claire Gardent"],"pdf_url":"https://arxiv.org/pdf/2308.15298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11167v3","updated":"2023-08-29T13:33:52Z","published":"2023-06-19T21:14:57Z","title":"Large Language Models are Fixated by Red Herrings: Exploring Creative\n Problem Solving and Einstellung Effect using the Only Connect Wall Dataset","summary":" The quest for human imitative AI has been an enduring topic in AI research\nsince its inception. The technical evolution and emerging capabilities of the\nlatest cohort of large language models (LLMs) have reinvigorated the subject\nbeyond academia to the cultural zeitgeist. While recent NLP evaluation\nbenchmark tasks test some aspects of human-imitative behaviour (e.g.,\nBIG-bench's 'human-like behavior' tasks), few, if not none, examine creative\nproblem solving abilities. Creative problem solving in humans is a well-studied\ntopic in cognitive neuroscience with standardized tests that predominantly use\nthe ability to associate (heterogeneous) connections among clue words as a\nmetric for creativity. Exposure to misleading stimuli - distractors dubbed red\nherrings - impede human performance in such tasks via the fixation effect and\nEinstellung paradigm. In cognitive neuroscience studies, such fixations are\nexperimentally induced by pre-exposing participants to orthographically similar\nincorrect words to subsequent word-fragments or clues. The popular British quiz\nshow Only Connect's Connecting Wall segment essentially mimics Mednick's Remote\nAssociates Test (RAT) formulation with built-in, deliberate red herrings, which\nmakes it an ideal proxy dataset to explore and study fixation effect and\nEinstellung paradigm from cognitive neuroscience in LLMs. In this paper we\npresent the novel Only Connect Wall (OCW) dataset and report results from our\nevaluation of selected pre-trained language models and LLMs on creative problem\nsolving tasks like grouping clue words by heterogeneous connections, and\nidentifying correct open knowledge domain connections in respective groups. We\nsynthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to\nfurther analyze our red-herrings hypothesis in language models. The code and\nlink to the dataset are available at https://github.com/TaatiTeam/OCW.\n","authors":["Saeid Naeini","Raeid Saqur","Mozhgan Saeidi","John Giorgi","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2306.11167v3.pdf","comment":"V3: Minor cosmetic adjustment from V2. Fixed Fig. 2 caption\n overlapping with text in S2.2. V2: with added OCW-Randomized and OCW-WordNet\n results in Section 4.3 (added). 22 pages with Appendix"},{"id":"http://arxiv.org/abs/2307.09162v2","updated":"2023-08-29T13:15:24Z","published":"2023-07-18T11:38:45Z","title":"Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and\n Addressing Sociological Implications","summary":" Gender bias in artificial intelligence (AI) and natural language processing\nhas garnered significant attention due to its potential impact on societal\nperceptions and biases. This research paper aims to analyze gender bias in\nLarge Language Models (LLMs) with a focus on multiple comparisons between GPT-2\nand GPT-3.5, some prominent language models, to better understand its\nimplications. Through a comprehensive literature review, the study examines\nexisting research on gender bias in AI language models and identifies gaps in\nthe current knowledge. The methodology involves collecting and preprocessing\ndata from GPT-2 and GPT-3.5, and employing in-depth quantitative analysis\ntechniques to evaluate gender bias in the generated text. The findings shed\nlight on gendered word associations, language usage, and biased narratives\npresent in the outputs of these Large Language Models. The discussion explores\nthe ethical implications of gender bias and its potential consequences on\nsocial perceptions and marginalized communities. Additionally, the paper\npresents strategies for reducing gender bias in LLMs, including algorithmic\napproaches and data augmentation techniques. The research highlights the\nimportance of interdisciplinary collaborations and the role of sociological\nstudies in mitigating gender bias in AI models. By addressing these issues, we\ncan pave the way for more inclusive and unbiased AI systems that have a\npositive impact on society.\n","authors":["Vishesh Thakur"],"pdf_url":"https://arxiv.org/pdf/2307.09162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15262v1","updated":"2023-08-29T12:41:50Z","published":"2023-08-29T12:41:50Z","title":"Enhancing OCR Performance through Post-OCR Models: Adopting Glyph\n Embedding for Improved Correction","summary":" The study investigates the potential of post-OCR models to overcome\nlimitations in OCR models and explores the impact of incorporating glyph\nembedding on post-OCR correction performance. In this study, we have developed\nour own post-OCR correction model. The novelty of our approach lies in\nembedding the OCR output using CharBERT and our unique embedding technique,\ncapturing the visual characteristics of characters. Our findings show that\npost-OCR correction effectively addresses deficiencies in inferior OCR models,\nand glyph embedding enables the model to achieve superior results, including\nthe ability to correct individual words.\n","authors":["Yung-Hsin Chen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01681v3","updated":"2023-08-29T12:20:15Z","published":"2023-08-03T10:48:30Z","title":"NBIAS: A Natural Language Processing Framework for Bias Identification\n in Text","summary":" Bias in textual data can lead to skewed interpretations and outcomes when the\ndata is used. These biases could perpetuate stereotypes, discrimination, or\nother forms of unfair treatment. An algorithm trained on biased data may end up\nmaking decisions that disproportionately impact a certain group of people.\nTherefore, it is crucial to detect and remove these biases to ensure the fair\nand ethical use of data. To this end, we develop a comprehensive and robust\nframework NBIAS that consists of four main layers: data, corpus construction,\nmodel development and an evaluation layer. The dataset is constructed by\ncollecting diverse data from various domains, including social media,\nhealthcare, and job hiring portals. As such, we applied a transformer-based\ntoken classification model that is able to identify bias words/ phrases through\na unique named entity BIAS. In the evaluation procedure, we incorporate a blend\nof quantitative and qualitative measures to gauge the effectiveness of our\nmodels. We achieve accuracy improvements ranging from 1% to 8% compared to\nbaselines. We are also able to generate a robust understanding of the model\nfunctioning. The proposed approach is applicable to a variety of biases and\ncontributes to the fair and ethical use of textual data.\n","authors":["Shaina Raza","Muskan Garg","Deepak John Reji","Syed Raza Bashir","Chen Ding"],"pdf_url":"https://arxiv.org/pdf/2308.01681v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.15246v1","updated":"2023-08-29T12:12:53Z","published":"2023-08-29T12:12:53Z","title":"A Classification-Guided Approach for Adversarial Attacks against Neural\n Machine Translation","summary":" Neural Machine Translation (NMT) models have been shown to be vulnerable to\nadversarial attacks, wherein carefully crafted perturbations of the input can\nmislead the target model. In this paper, we introduce ACT, a novel adversarial\nattack framework against NMT systems guided by a classifier. In our attack, the\nadversary aims to craft meaning-preserving adversarial examples whose\ntranslations by the NMT model belong to a different class than the original\ntranslations in the target language. Unlike previous attacks, our new approach\nhas a more substantial effect on the translation by altering the overall\nmeaning, which leads to a different class determined by a classifier. To\nevaluate the robustness of NMT models to this attack, we propose enhancements\nto existing black-box word-replacement-based attacks by incorporating output\ntranslations of the target NMT model and the output logits of a classifier\nwithin the attack process. Extensive experiments in various settings, including\na comparison with existing untargeted attacks, demonstrate that the proposed\nattack is considerably more successful in altering the class of the output\ntranslation and has more effect on the translation. This new paradigm can show\nthe vulnerabilities of NMT systems by focusing on the class of translation\nrather than the mere translation quality as studied traditionally.\n","authors":["Sahar Sadrizadeh","Ljiljana Dolamic","Pascal Frossard"],"pdf_url":"https://arxiv.org/pdf/2308.15246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15235v1","updated":"2023-08-29T11:46:27Z","published":"2023-08-29T11:46:27Z","title":"PronounFlow: A Hybrid Approach for Calibrating Pronouns in Sentences","summary":" Flip through any book or listen to any song lyrics, and you will come across\npronouns that, in certain cases, can hinder meaning comprehension, especially\nfor machines. As the role of having cognitive machines becomes pervasive in our\nlives, numerous systems have been developed to resolve pronouns under various\nchallenges. Commensurate with this, it is believed that having systems able to\ndisambiguate pronouns in sentences will help towards the endowment of machines\nwith commonsense and reasoning abilities like those found in humans. However,\none problem these systems face with modern English is the lack of gender\npronouns, where people try to alternate by using masculine, feminine, or plural\nto avoid the whole issue. Since humanity aims to the building of systems in the\nfull-bodied sense we usually reserve for people, what happens when pronouns in\nwritten text, like plural or epicene ones, refer to unspecified entities whose\ngender is not necessarily known? Wouldn't that put extra barriers to existing\ncoreference resolution systems? Towards answering those questions, through the\nimplementation of a neural-symbolic system that utilizes the best of both\nworlds, we are employing PronounFlow, a system that reads any English sentence\nwith pronouns and entities, identifies which of them are not tied to each\nother, and makes suggestions on which to use to avoid biases. Undertaken\nexperiments show that PronounFlow not only alternates pronouns in sentences\nbased on the collective human knowledge around us but also considerably helps\ncoreference resolution systems with the pronoun disambiguation process.\n","authors":["Nicos Isaak"],"pdf_url":"https://arxiv.org/pdf/2308.15235v1.pdf","comment":"13 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.15232v1","updated":"2023-08-29T11:40:24Z","published":"2023-08-29T11:40:24Z","title":"Classification-Aware Neural Topic Model Combined With Interpretable\n Analysis -- For Conflict Classification","summary":" A large number of conflict events are affecting the world all the time. In\norder to analyse such conflict events effectively, this paper presents a\nClassification-Aware Neural Topic Model (CANTM-IA) for Conflict Information\nClassification and Topic Discovery. The model provides a reliable\ninterpretation of classification results and discovered topics by introducing\ninterpretability analysis. At the same time, interpretation is introduced into\nthe model architecture to improve the classification performance of the model\nand to allow interpretation to focus further on the details of the data.\nFinally, the model architecture is optimised to reduce the complexity of the\nmodel.\n","authors":["Tianyu Liang","Yida Mu","Soonho Kim","Darline Larissa Kengne Kuate","Julie Lang","Rob Vos","Xingyi Song"],"pdf_url":"https://arxiv.org/pdf/2308.15232v1.pdf","comment":"Accepted by RANLP 2023"},{"id":"http://arxiv.org/abs/2308.15231v1","updated":"2023-08-29T11:40:03Z","published":"2023-08-29T11:40:03Z","title":"Multi-party Goal Tracking with LLMs: Comparing Pre-training,\n Fine-tuning, and Prompt Engineering","summary":" This paper evaluates the extent to which current Large Language Models (LLMs)\ncan capture task-oriented multi-party conversations (MPCs). We have recorded\nand transcribed 29 MPCs between patients, their companions, and a social robot\nin a hospital. We then annotated this corpus for multi-party goal-tracking and\nintent-slot recognition. People share goals, answer each other's goals, and\nprovide other people's goals in MPCs - none of which occur in dyadic\ninteractions. To understand user goals in MPCs, we compared three methods in\nzero-shot and few-shot settings: we fine-tuned T5, created pre-training tasks\nto train DialogLM using LED, and employed prompt engineering techniques with\nGPT-3.5-turbo, to determine which approach can complete this novel task with\nlimited data. GPT-3.5-turbo significantly outperformed the others in a few-shot\nsetting. The `reasoning' style prompt, when given 7% of the corpus as example\nannotated conversations, was the best performing method. It correctly annotated\n62.32% of the goal tracking MPCs, and 69.57% of the intent-slot recognition\nMPCs. A `story' style prompt increased model hallucination, which could be\ndetrimental if deployed in safety-critical settings. We conclude that\nmulti-party conversations still challenge state-of-the-art LLMs.\n","authors":["Angus Addlesee","Weronika Sieińska","Nancie Gunson","Daniel Hernández Garcia","Christian Dondrup","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2308.15231v1.pdf","comment":"Accepted and will appear in the Proceedings of SIGdial 2023"},{"id":"http://arxiv.org/abs/2308.15226v1","updated":"2023-08-29T11:29:43Z","published":"2023-08-29T11:29:43Z","title":"CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for\n Multimodal Machine Translation","summary":" There has been a growing interest in developing multimodal machine\ntranslation (MMT) systems that enhance neural machine translation (NMT) with\nvisual knowledge. This problem setup involves using images as auxiliary\ninformation during training, and more recently, eliminating their use during\ninference. Towards this end, previous works face a challenge in training\npowerful MMT models from scratch due to the scarcity of annotated multilingual\nvision-language data, especially for low-resource languages. Simultaneously,\nthere has been an influx of multilingual pre-trained models for NMT and\nmultimodal pre-trained models for vision-language tasks, primarily in English,\nwhich have shown exceptional generalisation ability. However, these are not\ndirectly applicable to MMT since they do not provide aligned multimodal\nmultilingual features for generative tasks. To alleviate this issue, instead of\ndesigning complex modules for MMT, we propose CLIPTrans, which simply adapts\nthe independently pre-trained multimodal M-CLIP and the multilingual mBART. In\norder to align their embedding spaces, mBART is conditioned on the M-CLIP\nfeatures by a prefix sequence generated through a lightweight mapping network.\nWe train this in a two-stage pipeline which warms up the model with image\ncaptioning before the actual translation task. Through experiments, we\ndemonstrate the merits of this framework and consequently push forward the\nstate-of-the-art across standard benchmarks by an average of +2.67 BLEU. The\ncode can be found at www.github.com/devaansh100/CLIPTrans.\n","authors":["Devaansh Gupta","Siddhant Kharbanda","Jiawei Zhou","Wanhua Li","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2308.15226v1.pdf","comment":"15 pages, 9 figures, to be published In Proceedings of International\n Conference of Computer Vision(ICCV), 2023"},{"id":"http://arxiv.org/abs/2308.15214v1","updated":"2023-08-29T11:08:40Z","published":"2023-08-29T11:08:40Z","title":"FurChat: An Embodied Conversational Agent using LLMs, Combining Open and\n Closed-Domain Dialogue with Facial Expressions","summary":" We demonstrate an embodied conversational agent that can function as a\nreceptionist and generate a mixture of open and closed-domain dialogue along\nwith facial expressions, by using a large language model (LLM) to develop an\nengaging conversation. We deployed the system onto a Furhat robot, which is\nhighly expressive and capable of using both verbal and nonverbal cues during\ninteraction. The system was designed specifically for the National Robotarium\nto interact with visitors through natural conversations, providing them with\ninformation about the facilities, research, news, upcoming events, etc. The\nsystem utilises the state-of-the-art GPT-3.5 model to generate such information\nalong with domain-general conversations and facial expressions based on prompt\nengineering.\n","authors":["Neeraj Cherakara","Finny Varghese","Sheena Shabana","Nivan Nelson","Abhiram Karukayil","Rohith Kulothungan","Mohammed Afil Farhan","Birthe Nesset","Meriam Moujahid","Tanvi Dinkar","Verena Rieser","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2308.15214v1.pdf","comment":"5 pages, 2 figures, Accepted at SIGDIAL 2023 (24th Meeting of the\n Special Interest Group on Discourse and Dialogue), for the demo video, see\n https://youtu.be/fwtUl1kl22s"},{"id":"http://arxiv.org/abs/2308.15209v1","updated":"2023-08-29T10:55:44Z","published":"2023-08-29T10:55:44Z","title":"Shared Lexical Items as Triggers of Code Switching","summary":" Why do bilingual speakers code-switch (mix their two languages)? Among the\nseveral theories that attempt to explain this natural and ubiquitous\nphenomenon, the Triggering Hypothesis relates code-switching to the presence of\nlexical triggers, specifically cognates and proper names, adjacent to the\nswitch point. We provide a fuller, more nuanced and refined exploration of the\ntriggering hypothesis, based on five large datasets in three language pairs,\nreflecting both spoken and written bilingual interactions. Our results show\nthat words that are assumed to reside in a mental lexicon shared by both\nlanguages indeed trigger code-switching; that the tendency to switch depends on\nthe distance of the trigger from the switch point; and on whether the trigger\nprecedes or succeeds the switch; but not on the etymology of the trigger words.\nWe thus provide strong, robust, evidence-based confirmation to several\nhypotheses on the relationships between lexical triggers and code-switching.\n","authors":["Shuly Wintner","Safaa Shehadi","Yuli Zeira","Doreen Osmelak","Yuval Nov"],"pdf_url":"https://arxiv.org/pdf/2308.15209v1.pdf","comment":"This is the author's final version; the article has been accepted for\n publication in the Transactions of the Association for Computational\n Linguistics (TACL)"},{"id":"http://arxiv.org/abs/2308.15202v1","updated":"2023-08-29T10:40:46Z","published":"2023-08-29T10:40:46Z","title":"Benchmarking the Generation of Fact Checking Explanations","summary":" Fighting misinformation is a challenging, yet crucial, task. Despite the\ngrowing number of experts being involved in manual fact-checking, this activity\nis time-consuming and cannot keep up with the ever-increasing amount of Fake\nNews produced daily. Hence, automating this process is necessary to help curb\nmisinformation. Thus far, researchers have mainly focused on claim veracity\nclassification. In this paper, instead, we address the generation of\njustifications (textual explanation of why a claim is classified as either true\nor false) and benchmark it with novel datasets and advanced baselines. In\nparticular, we focus on summarization approaches over unstructured knowledge\n(i.e. news articles) and we experiment with several extractive and abstractive\nstrategies. We employed two datasets with different styles and structures, in\norder to assess the generalizability of our findings. Results show that in\njustification production summarization benefits from the claim information,\nand, in particular, that a claim-driven extractive step improves abstractive\nsummarization performances. Finally, we show that although cross-dataset\nexperiments suffer from performance degradation, a unique model trained on a\ncombination of the two datasets is able to retain style information in an\nefficient manner.\n","authors":["Daniel Russo","Serra Sinem Tekiroglu","Marco Guerini"],"pdf_url":"https://arxiv.org/pdf/2308.15202v1.pdf","comment":"Accepted to TACL. This arXiv version is a pre-MIT Press publication\n version"},{"id":"http://arxiv.org/abs/2308.15192v1","updated":"2023-08-29T10:20:53Z","published":"2023-08-29T10:20:53Z","title":"Enhancing Psychological Counseling with Large Language Model: A\n Multifaceted Decision-Support System for Non-Professionals","summary":" In the contemporary landscape of social media, an alarming number of users\nexpress negative emotions, some of which manifest as strong suicidal\nintentions. This situation underscores a profound need for trained\npsychological counselors who can enact effective mental interventions. However,\nthe development of these professionals is often an imperative but\ntime-consuming task. Consequently, the mobilization of non-professionals or\nvolunteers in this capacity emerges as a pressing concern. Leveraging the\ncapabilities of artificial intelligence, and in particular, the recent advances\nin large language models, offers a viable solution to this challenge. This\npaper introduces a novel model constructed on the foundation of large language\nmodels to fully assist non-professionals in providing psychological\ninterventions on online user discourses. This framework makes it plausible to\nharness the power of non-professional counselors in a meaningful way. A\ncomprehensive study was conducted involving ten professional psychological\ncounselors of varying expertise, evaluating the system across five critical\ndimensions. The findings affirm that our system is capable of analyzing\npatients' issues with relative accuracy and proffering professional-level\nstrategies recommendations, thereby enhancing support for non-professionals.\nThis research serves as a compelling validation of the application of large\nlanguage models in the field of psychology and lays the groundwork for a new\nparadigm of community-based mental health support.\n","authors":["Guanghui Fu","Qing Zhao","Jianqiang Li","Dan Luo","Changwei Song","Wei Zhai","Shuo Liu","Fan Wang","Yan Wang","Lijuan Cheng","Juan Zhang","Bing Xiang Yang"],"pdf_url":"https://arxiv.org/pdf/2308.15192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15154v1","updated":"2023-08-29T09:35:23Z","published":"2023-08-29T09:35:23Z","title":"The Anatomy of Conspirators: Unveiling Traits using a Comprehensive\n Twitter Dataset","summary":" The discourse around conspiracy theories is currently thriving amidst the\nrampant misinformation prevalent in online environments. Research in this field\nhas been focused on detecting conspiracy theories on social media, often\nrelying on limited datasets. In this study, we present a novel methodology for\nconstructing a Twitter dataset that encompasses accounts engaged in\nconspiracy-related activities throughout the year 2022. Our approach centers on\ndata collection that is independent of specific conspiracy theories and\ninformation operations. Additionally, our dataset includes a control group\ncomprising randomly selected users who can be fairly compared to the\nindividuals involved in conspiracy activities. This comprehensive collection\neffort yielded a total of 15K accounts and 37M tweets extracted from their\ntimelines. We conduct a comparative analysis of the two groups across three\ndimensions: topics, profiles, and behavioral characteristics. The results\nindicate that conspiracy and control users exhibit similarity in terms of their\nprofile metadata characteristics. However, they diverge significantly in terms\nof behavior and activity, particularly regarding the discussed topics, the\nterminology used, and their stance on trending subjects. Interestingly, there\nis no significant disparity in the presence of bot users between the two\ngroups, suggesting that conspiracy and automation are orthogonal concepts.\nFinally, we develop a classifier to identify conspiracy users using 93\nfeatures, some of which are commonly employed in literature for troll\nidentification. The results demonstrate a high accuracy level (with an average\nF1 score of 0.98%), enabling us to uncover the most discriminative features\nassociated with conspiracy-related accounts.\n","authors":["Margherita Gambini","Serena Tardelli","Maurizio Tesconi"],"pdf_url":"https://arxiv.org/pdf/2308.15154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15126v1","updated":"2023-08-29T08:51:24Z","published":"2023-08-29T08:51:24Z","title":"Evaluation and Analysis of Hallucination in Large Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) have recently achieved remarkable\nsuccess. However, LVLMs are still plagued by the hallucination problem, which\nlimits the practicality in many scenarios. Hallucination refers to the\ninformation of LVLMs' responses that does not exist in the visual input, which\nposes potential risks of substantial consequences. There has been limited work\nstudying hallucination evaluation in LVLMs. In this paper, we propose\nHallucination Evaluation based on Large Language Models (HaELM), an LLM-based\nhallucination evaluation framework. HaELM achieves an approximate 95%\nperformance comparable to ChatGPT and has additional advantages including low\ncost, reproducibility, privacy preservation and local deployment. Leveraging\nthe HaELM, we evaluate the hallucination in current LVLMs. Furthermore, we\nanalyze the factors contributing to hallucination in LVLMs and offer helpful\nsuggestions to mitigate the hallucination problem. Our training data and human\nannotation hallucination data will be made public soon.\n","authors":["Junyang Wang","Yiyang Zhou","Guohai Xu","Pengcheng Shi","Chenlin Zhao","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Jihua Zhu","Jitao Sang","Haoyu Tang"],"pdf_url":"https://arxiv.org/pdf/2308.15126v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.15122v1","updated":"2023-08-29T08:41:16Z","published":"2023-08-29T08:41:16Z","title":"SpikeBERT: A Language Spikformer Trained with Two-Stage Knowledge\n Distillation from BERT","summary":" Spiking neural networks (SNNs) offer a promising avenue to implement deep\nneural networks in a more energy-efficient way. However, the network\narchitectures of existing SNNs for language tasks are too simplistic, and deep\narchitectures have not been fully explored, resulting in a significant\nperformance gap compared to mainstream transformer-based networks such as BERT.\nTo this end, we improve a recently-proposed spiking transformer (i.e.,\nSpikformer) to make it possible to process language tasks and propose a\ntwo-stage knowledge distillation method for training it, which combines\npre-training by distilling knowledge from BERT with a large collection of\nunlabelled texts and fine-tuning with task-specific instances via knowledge\ndistillation again from the BERT fine-tuned on the same training examples.\nThrough extensive experimentation, we show that the models trained with our\nmethod, named SpikeBERT, outperform state-of-the-art SNNs and even achieve\ncomparable results to BERTs on text classification tasks for both English and\nChinese with much less energy consumption.\n","authors":["Changze Lv","Tianlong Li","Jianhan Xu","Chenxi Gu","Zixuan Ling","Cenyuan Zhang","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15118v1","updated":"2023-08-29T08:36:30Z","published":"2023-08-29T08:36:30Z","title":"Large Language Models on the Chessboard: A Study on ChatGPT's Formal\n Language Comprehension and Complex Reasoning Skills","summary":" While large language models have made strides in natural language processing,\ntheir proficiency in complex reasoning tasks requiring formal language\ncomprehension, such as chess, remains less investigated. This paper probes the\nperformance of ChatGPT, a sophisticated language model by OpenAI in tackling\nsuch complex reasoning tasks, using chess as a case study. Through robust\nmetrics examining both the legality and quality of moves, we assess ChatGPT's\nunderstanding of the chessboard, adherence to chess rules, and strategic\ndecision-making abilities. Our evaluation identifies limitations within\nChatGPT's attention mechanism that affect its formal language comprehension and\nuncovers the model's underdeveloped self-regulation abilities. Our study also\nreveals ChatGPT's propensity for a coherent strategy in its gameplay and a\nnoticeable uptick in decision-making assertiveness when the model is presented\nwith a greater volume of natural language or possesses a more lucid\nunderstanding of the state of the chessboard. These findings contribute to the\ngrowing exploration of language models' abilities beyond natural language\nprocessing, providing valuable information for future research towards models\ndemonstrating human-like cognitive abilities.\n","authors":["Mu-Tien Kuo","Chih-Chung Hsueh","Richard Tzong-Han Tsai"],"pdf_url":"https://arxiv.org/pdf/2308.15118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15097v1","updated":"2023-08-29T08:07:26Z","published":"2023-08-29T08:07:26Z","title":"Sequential annotations for naturally-occurring HRI: first insights","summary":" We explain the methodology we developed for improving the interactions\naccomplished by an embedded conversational agent, drawing from Conversation\nAnalytic sequential and multimodal analysis. The use case is a Pepper robot\nthat is expected to inform and orient users in a library. In order to propose\nand learn better interactive schema, we are creating a corpus of\nnaturally-occurring interactions that will be made available to the community.\nTo do so, we propose an annotation practice based on some theoretical\nunderpinnings about the use of language and multimodal resources in human-robot\ninteraction. CCS CONCEPTS $\\bullet$ Computing methodologies $\\rightarrow$\nDiscourse, dialogue and pragmatics; $\\bullet$ Human-centered computing\n$\\rightarrow$ Text input; HCI theory, concepts and models; Field studies.\n","authors":["Lucien Tisserand","Frédéric Armetta","Heike Baldauf-Quilliatre","Antoine Bouquin","Salima Hassas","Mathieu Lefort"],"pdf_url":"https://arxiv.org/pdf/2308.15097v1.pdf","comment":"Peer-reviewed workshop paper accepted for the ''Human-Robot\n Conversational Interaction'' workshop that took place at the ''ACM/IEEE\n International Conference on Human-Robot Interaction'' 2023 Conference in\n Stockholm, Sweden"},{"id":"http://arxiv.org/abs/2308.15090v1","updated":"2023-08-29T07:53:17Z","published":"2023-08-29T07:53:17Z","title":"Killing two birds with one stone: Can an audio captioning system also be\n used for audio-text retrieval?","summary":" Automated Audio Captioning (AAC) aims to develop systems capable of\ndescribing an audio recording using a textual sentence. In contrast, Audio-Text\nRetrieval (ATR) systems seek to find the best matching audio recording(s) for a\ngiven textual query (Text-to-Audio) or vice versa (Audio-to-Text). These tasks\nrequire different types of systems: AAC employs a sequence-to-sequence model,\nwhile ATR utilizes a ranking model that compares audio and text representations\nwithin a shared projection subspace. However, this work investigates the\nrelationship between AAC and ATR by exploring the ATR capabilities of an\nunmodified AAC system, without fine-tuning for the new task. Our AAC system\nconsists of an audio encoder (ConvNeXt-Tiny) trained on AudioSet for audio\ntagging, and a transformer decoder responsible for generating sentences. For\nAAC, it achieves a high SPIDEr-FL score of 0.298 on Clotho and 0.472 on\nAudioCaps on average. For ATR, we propose using the standard Cross-Entropy loss\nvalues obtained for any audio/caption pair. Experimental results on the Clotho\nand AudioCaps datasets demonstrate decent recall values using this simple\napproach. For instance, we obtained a Text-to-Audio R@1 value of 0.382 for\nAu-dioCaps, which is above the current state-of-the-art method without external\ndata. Interestingly, we observe that normalizing the loss values was necessary\nfor Audio-to-Text retrieval.\n","authors":["Etienne Labbé","Thomas Pellegrini","Julien Pinquier"],"pdf_url":"https://arxiv.org/pdf/2308.15090v1.pdf","comment":"cam ready version (14/08/23)"},{"id":"http://arxiv.org/abs/2305.10666v2","updated":"2023-08-29T07:16:52Z","published":"2023-05-18T02:57:54Z","title":"a unified front-end framework for english text-to-speech synthesis","summary":" The front-end is a critical component of English text-to-speech (TTS)\nsystems, responsible for extracting linguistic features that are essential for\na text-to-speech model to synthesize speech, such as prosodies and phonemes.\nThe English TTS front-end typically consists of a text normalization (TN)\nmodule, a prosody word prosody phrase (PWPP) module, and a grapheme-to-phoneme\n(G2P) module. However, current research on the English TTS front-end focuses\nsolely on individual modules, neglecting the interdependence between them and\nresulting in sub-optimal performance for each module. Therefore, this paper\nproposes a unified front-end framework that captures the dependencies among the\nEnglish TTS front-end modules. Extensive experiments have demonstrated that the\nproposed method achieves state-of-the-art (SOTA) performance in all modules.\n","authors":["Zelin Ying","Chen Li","Yu Dong","Qiuqiang Kong","Qiao Tian","Yuanyuan Huo","Yuxuan Wang"],"pdf_url":"https://arxiv.org/pdf/2305.10666v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.15055v1","updated":"2023-08-29T06:31:21Z","published":"2023-08-29T06:31:21Z","title":"Taxonomic Loss for Morphological Glossing of Low-Resource Languages","summary":" Morpheme glossing is a critical task in automated language documentation and\ncan benefit other downstream applications greatly. While state-of-the-art\nglossing systems perform very well for languages with large amounts of existing\ndata, it is more difficult to create useful models for low-resource languages.\nIn this paper, we propose the use of a taxonomic loss function that exploits\nmorphological information to make morphological glossing more performant when\ndata is scarce. We find that while the use of this loss function does not\noutperform a standard loss function with regards to single-label prediction\naccuracy, it produces better predictions when considering the top-n predicted\nlabels. We suggest this property makes the taxonomic loss function useful in a\nhuman-in-the-loop annotation setting.\n","authors":["Michael Ginn","Alexis Palmer"],"pdf_url":"https://arxiv.org/pdf/2308.15055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15053v1","updated":"2023-08-29T06:27:58Z","published":"2023-08-29T06:27:58Z","title":"Adapting text-based dialogue state tracker for spoken dialogues","summary":" Although there have been remarkable advances in dialogue systems through the\ndialogue systems technology competition (DSTC), it remains one of the key\nchallenges to building a robust task-oriented dialogue system with a speech\ninterface. Most of the progress has been made for text-based dialogue systems\nsince there are abundant datasets with written corpora while those with spoken\ndialogues are very scarce. However, as can be seen from voice assistant systems\nsuch as Siri and Alexa, it is of practical importance to transfer the success\nto spoken dialogues. In this paper, we describe our engineering effort in\nbuilding a highly successful model that participated in the speech-aware\ndialogue systems technology challenge track in DSTC11. Our model consists of\nthree major modules: (1) automatic speech recognition error correction to\nbridge the gap between the spoken and the text utterances, (2) text-based\ndialogue system (D3ST) for estimating the slots and values using slot\ndescriptions, and (3) post-processing for recovering the error of the estimated\nslot value. Our experiments show that it is important to use an explicit\nautomatic speech recognition error correction module, post-processing, and data\naugmentation to adapt a text-based dialogue state tracker for spoken dialogue\ncorpora.\n","authors":["Jaeseok Yoon","Seunghyun Hwang","Ran Han","Jeonguk Bang","Kee-Eung Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15053v1.pdf","comment":"8 pages, 5 figures, Accepted at the DSTC 11 Workshop to be located at\n SIGDIAL 2023"},{"id":"http://arxiv.org/abs/2308.15047v1","updated":"2023-08-29T06:09:47Z","published":"2023-08-29T06:09:47Z","title":"Large language models converge toward human-like concept organization","summary":" Large language models show human-like performance in knowledge extraction,\nreasoning and dialogue, but it remains controversial whether this performance\nis best explained by memorization and pattern matching, or whether it reflects\nhuman-like inferential semantics and world knowledge. Knowledge bases such as\nWikiData provide large-scale, high-quality representations of inferential\nsemantics and world knowledge. We show that large language models learn to\norganize concepts in ways that are strikingly similar to how concepts are\norganized in such knowledge bases. Knowledge bases model collective,\ninstitutional knowledge, and large language models seem to induce such\nknowledge from raw text. We show that bigger and better models exhibit more\nhuman-like concept organization, across four families of language models and\nthree knowledge graph embeddings.\n","authors":["Mathias Lykke Gammelgaard","Jonathan Gabel Christiansen","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2308.15047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12095v5","updated":"2023-08-29T05:34:25Z","published":"2023-02-22T11:01:20Z","title":"On the Robustness of ChatGPT: An Adversarial and Out-of-distribution\n Perspective","summary":" ChatGPT is a recent chatbot service released by OpenAI and is receiving\nincreasing attention over the past few months. While evaluations of various\naspects of ChatGPT have been done, its robustness, i.e., the performance to\nunexpected inputs, is still unclear to the public. Robustness is of particular\nconcern in responsible AI, especially for safety-critical applications. In this\npaper, we conduct a thorough evaluation of the robustness of ChatGPT from the\nadversarial and out-of-distribution (OOD) perspective. To do so, we employ the\nAdvGLUE and ANLI benchmarks to assess adversarial robustness and the Flipkart\nreview and DDXPlus medical diagnosis datasets for OOD evaluation. We select\nseveral popular foundation models as baselines. Results show that ChatGPT shows\nconsistent advantages on most adversarial and OOD classification and\ntranslation tasks. However, the absolute performance is far from perfection,\nwhich suggests that adversarial and OOD robustness remains a significant threat\nto foundation models. Moreover, ChatGPT shows astounding performance in\nunderstanding dialogue-related texts and we find that it tends to provide\ninformal suggestions for medical tasks instead of definitive answers. Finally,\nwe present in-depth discussions of possible research directions.\n","authors":["Jindong Wang","Xixu Hu","Wenxin Hou","Hao Chen","Runkai Zheng","Yidong Wang","Linyi Yang","Haojun Huang","Wei Ye","Xiubo Geng","Binxin Jiao","Yue Zhang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2302.12095v5.pdf","comment":"Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable\n Large-Scale Machine Learning Models; code is at:\n https://github.com/microsoft/robustlearn; more works:\n https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2308.15027v1","updated":"2023-08-29T05:18:47Z","published":"2023-08-29T05:18:47Z","title":"Improving Neural Ranking Models with Traditional IR Methods","summary":" Neural ranking methods based on large transformer models have recently gained\nsignificant attention in the information retrieval community, and have been\nadopted by major commercial solutions. Nevertheless, they are computationally\nexpensive to create, and require a great deal of labeled data for specialized\ncorpora. In this paper, we explore a low resource alternative which is a\nbag-of-embedding model for document retrieval and find that it is competitive\nwith large transformer models fine tuned on information retrieval tasks. Our\nresults show that a simple combination of TF-IDF, a traditional keyword\nmatching method, with a shallow embedding model provides a low cost path to\ncompete well with the performance of complex neural ranking models on 3\ndatasets. Furthermore, adding TF-IDF measures improves the performance of\nlarge-scale fine tuned models on these tasks.\n","authors":["Anik Saha","Oktie Hassanzadeh","Alex Gittens","Jian Ni","Kavitha Srinivas","Bulent Yener"],"pdf_url":"https://arxiv.org/pdf/2308.15027v1.pdf","comment":"Short paper, 4 pages"},{"id":"http://arxiv.org/abs/2308.15022v1","updated":"2023-08-29T04:59:53Z","published":"2023-08-29T04:59:53Z","title":"Recursively Summarizing Enables Long-Term Dialogue Memory in Large\n Language Models","summary":" Most open-domain dialogue systems suffer from forgetting important\ninformation, especially in a long-term conversation. Existing works usually\ntrain the specific retriever or summarizer to obtain key information from the\npast, which is time-consuming and highly depends on the quality of labeled\ndata. To alleviate this problem, we propose to recursively generate summaries/\nmemory using large language models (LLMs) to enhance long-term memory ability.\nSpecifically, our method first stimulates LLMs to memorize small dialogue\ncontexts and then recursively produce new memory using previous memory and\nfollowing contexts. Finally, the LLM can easily generate a highly consistent\nresponse with the help of the latest memory. We evaluate our method using\nChatGPT and text-davinci-003, and the experiments on the widely-used public\ndataset show that our method can generate more consistent responses in a\nlong-context conversation. Notably, our method is a potential solution to\nenable the LLM to model the extremely long context. Code and scripts will be\nreleased later.\n","authors":["Qingyue Wang","Liang Ding","Yanan Cao","Zhiliang Tian","Shi Wang","Dacheng Tao","Li Guo"],"pdf_url":"https://arxiv.org/pdf/2308.15022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15010v1","updated":"2023-08-29T04:16:57Z","published":"2023-08-29T04:16:57Z","title":"TransPrompt v2: A Transferable Prompting Framework for Cross-task Text\n Classification","summary":" Text classification is one of the most imperative tasks in natural language\nprocessing (NLP). Recent advances with pre-trained language models (PLMs) have\nshown remarkable success on this task. However, the satisfying results obtained\nby PLMs heavily depend on the large amounts of task-specific labeled data,\nwhich may not be feasible in many application scenarios due to data access and\nprivacy constraints. The recently-proposed prompt-based fine-tuning paradigm\nimproves the performance of PLMs for few-shot text classification with\ntask-specific templates. Yet, it is unclear how the prompting knowledge can be\ntransferred across tasks, for the purpose of mutual reinforcement. We propose\nTransPrompt v2, a novel transferable prompting framework for few-shot learning\nacross similar or distant text classification tasks. For learning across\nsimilar tasks, we employ a multi-task meta-knowledge acquisition (MMA)\nprocedure to train a meta-learner that captures the cross-task transferable\nknowledge. For learning across distant tasks, we further inject the task type\ndescriptions into the prompt, and capture the intra-type and inter-type prompt\nembeddings among multiple distant tasks. Additionally, two de-biasing\ntechniques are further designed to make the trained meta-learner more\ntask-agnostic and unbiased towards any tasks. After that, the meta-learner can\nbe adapted to each specific task with better parameters initialization.\nExtensive experiments show that TransPrompt v2 outperforms single-task and\ncross-task strong baselines over multiple NLP tasks and datasets. We further\nshow that the meta-learner can effectively improve the performance of PLMs on\npreviously unseen tasks. In addition, TransPrompt v2 also outperforms strong\nfine-tuning baselines when learning with full training sets.\n","authors":["Jianing Wang","Chengyu Wang","Cen Chen","Ming Gao","Jun Huang","Aoying Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09539v2","updated":"2023-08-29T01:08:30Z","published":"2023-06-15T22:48:08Z","title":"Block-State Transformer","summary":" State space models (SSMs) have shown impressive results on tasks that require\nmodeling long-range dependencies and efficiently scale to long sequences owing\nto their subquadratic runtime complexity. Originally designed for continuous\nsignals, SSMs have shown superior performance on a plethora of tasks, in vision\nand audio; however, SSMs still lag Transformer performance in Language Modeling\ntasks. In this work, we propose a hybrid layer named Block-State Transformer\n(BST), that internally combines an SSM sublayer for long-range\ncontextualization, and a Block Transformer sublayer for short-term\nrepresentation of sequences. We study three different, and completely\nparallelizable, variants that integrate SSMs and block-wise attention. We show\nthat our model outperforms similar Transformer-based architectures on language\nmodeling perplexity and generalizes to longer sequences. In addition, the\nBlock-State Transformer demonstrates more than tenfold increase in speed at the\nlayer level compared to the Block-Recurrent Transformer when model\nparallelization is employed.\n","authors":["Mahan Fathi","Jonathan Pilault","Pierre-Luc Bacon","Christopher Pal","Orhan Firat","Ross Goroshin"],"pdf_url":"https://arxiv.org/pdf/2306.09539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07224v3","updated":"2023-08-29T00:56:16Z","published":"2023-05-12T03:31:24Z","title":"Asymmetric feature interaction for interpreting model predictions","summary":" In natural language processing (NLP), deep neural networks (DNNs) could model\ncomplex interactions between context and have achieved impressive results on a\nrange of NLP tasks. Prior works on feature interaction attribution mainly focus\non studying symmetric interaction that only explains the additional influence\nof a set of words in combination, which fails to capture asymmetric influence\nthat contributes to model prediction. In this work, we propose an asymmetric\nfeature interaction attribution explanation model that aims to explore\nasymmetric higher-order feature interactions in the inference of deep neural\nNLP models. By representing our explanation with an directed interaction graph,\nwe experimentally demonstrate interpretability of the graph to discover\nasymmetric feature interactions. Experimental results on two sentiment\nclassification datasets show the superiority of our model against the\nstate-of-the-art feature interaction attribution methods in identifying\ninfluential features for model predictions. Our code is available at\nhttps://github.com/StillLu/ASIV.\n","authors":["Xiaolei Lu","Jianghong Ma","Haode Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.07224v3.pdf","comment":"Accepted by Findings of the Association for Computational\n Linguistics: ACL 2023 (long paper)"},{"id":"http://arxiv.org/abs/2308.14951v1","updated":"2023-08-29T00:44:27Z","published":"2023-08-29T00:44:27Z","title":"Robust Open-Set Spoken Language Identification and the CU MultiLang\n Dataset","summary":" Most state-of-the-art spoken language identification models are closed-set;\nin other words, they can only output a language label from the set of classes\nthey were trained on. Open-set spoken language identification systems, however,\ngain the ability to detect when an input exhibits none of the original\nlanguages. In this paper, we implement a novel approach to open-set spoken\nlanguage identification that uses MFCC and pitch features, a TDNN model to\nextract meaningful feature embeddings, confidence thresholding on softmax\noutputs, and LDA and pLDA for learning to classify new unknown languages. We\npresent a spoken language identification system that achieves 91.76% accuracy\non trained languages and has the capability to adapt to unknown languages on\nthe fly. To that end, we also built the CU MultiLang Dataset, a large and\ndiverse multilingual speech corpus which was used to train and evaluate our\nsystem.\n","authors":["Mustafa Eyceoz","Justin Lee","Siddharth Pittie","Homayoon Beigi"],"pdf_url":"https://arxiv.org/pdf/2308.14951v1.pdf","comment":"6pages, 1 table, 6 figures"},{"id":"http://arxiv.org/abs/2307.08303v3","updated":"2023-08-29T21:52:58Z","published":"2023-07-17T07:55:47Z","title":"Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language\n Models","summary":" Dense retrieval (DR) converts queries and documents into dense embeddings and\nmeasures the similarity between queries and documents in vector space. One of\nthe challenges in DR is the lack of domain-specific training data. While DR\nmodels can learn from large-scale public datasets like MS MARCO through\ntransfer learning, evidence shows that not all DR models and domains can\nbenefit from transfer learning equally. Recently, some researchers have\nresorted to large language models (LLMs) to improve the zero-shot and few-shot\nDR models. However, the hard prompts or human-written prompts utilized in these\nworks cannot guarantee the good quality of generated weak queries. To tackle\nthis, we propose soft prompt tuning for augmenting DR (SPTAR): For each task,\nwe leverage soft prompt-tuning to optimize a task-specific soft prompt on\nlimited ground truth data and then prompt the LLMs to tag unlabeled documents\nwith weak queries, yielding enough weak document-query pairs to train\ntask-specific dense retrievers. We design a filter to select high-quality\nexample document-query pairs in the prompt to further improve the quality of\nweak tagged queries. To the best of our knowledge, there is no prior work\nutilizing soft prompt tuning to augment DR models. The experiments demonstrate\nthat SPTAR outperforms the unsupervised baselines BM25 and the recently\nproposed LLMs-based augmentation method for DR.\n","authors":["Zhiyuan Peng","Xuyang Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08303v3.pdf","comment":"fix typos"},{"id":"http://arxiv.org/abs/2308.14306v2","updated":"2023-08-29T20:10:50Z","published":"2023-08-28T04:57:07Z","title":"Evaluating the Robustness to Instructions of Large Language Models","summary":" Recently, Instruction fine-tuning has risen to prominence as a potential\nmethod for enhancing the zero-shot capabilities of Large Language Models (LLMs)\non novel tasks. This technique has shown an exceptional ability to boost the\nperformance of moderately sized LLMs, sometimes even reaching performance\nlevels comparable to those of much larger model variants. The focus is on the\nrobustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an\nexploration of six models including Alpaca, Vicuna, WizardLM, and Traditional\nTask-oriented Models(Flan-T5-XL/XXL, T0++) using real-world relation extraction\ndatasets as case studies. We carried out a comprehensive evaluation of these\ninstruction-following LLMs which have been tuned based on open-domain\ninstructions and task-oriented instructions. The main discussion is their\nperformance and robustness towards instructions. We have observed that in most\ncases, the model's performance in dealing with unfamiliar instructions tends to\nworsen significantly, and the robustness of the model for RE instructions\ndeteriorates compared to QA. Further, we discovered that up until a certain\nparameter size threshold (3B), the performance of the FLAN-T5 model improves as\nthe parameter count increases. The robustness of different scales of FLAN-T5\nmodels to RE instruction is worse than the robustness to QA instruction.\n","authors":["Yuansheng Ni","Sichao Jiang","Xinyu wu","Hui Shen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.14306v2.pdf","comment":"In our study, erroneous data analysis inadvertently led to misleading\n outcomes. Incorrect variables were included, distorting results. This\n emphasizes the significance of robust data processing and analysis techniques\n in research"},{"id":"http://arxiv.org/abs/2212.10003v2","updated":"2023-08-29T19:36:32Z","published":"2022-12-20T05:25:12Z","title":"(QA)$^2$: Question Answering with Questionable Assumptions","summary":" Naturally occurring information-seeking questions often contain questionable\nassumptions -- assumptions that are false or unverifiable. Questions containing\nquestionable assumptions are challenging because they require a distinct answer\nstrategy that deviates from typical answers for information-seeking questions.\nFor instance, the question \"When did Marie Curie discover Uranium?\" cannot be\nanswered as a typical \"when\" question without addressing the false assumption\n\"Marie Curie discovered Uranium\". In this work, we propose (QA)$^2$ (Question\nAnswering with Questionable Assumptions), an open-domain evaluation dataset\nconsisting of naturally occurring search engine queries that may or may not\ncontain questionable assumptions. To be successful on (QA)$^2$, systems must be\nable to detect questionable assumptions and also be able to produce adequate\nresponses for both typical information-seeking questions and ones with\nquestionable assumptions. Through human rater acceptability on end-to-end QA\nwith (QA)$^2$, we find that current models do struggle with handling\nquestionable assumptions, leaving substantial headroom for progress.\n","authors":["Najoung Kim","Phu Mon Htut","Samuel R. Bowman","Jackson Petty"],"pdf_url":"https://arxiv.org/pdf/2212.10003v2.pdf","comment":"ACL 2023 camera-ready"},{"id":"http://arxiv.org/abs/2308.13399v2","updated":"2023-08-29T18:28:13Z","published":"2023-08-25T14:23:40Z","title":"EntropyRank: Unsupervised Keyphrase Extraction via Side-Information\n Optimization for Language Model-based Text Compression","summary":" We propose an unsupervised method to extract keywords and keyphrases from\ntexts based on a pre-trained language model (LM) and Shannon's information\nmaximization. Specifically, our method extracts phrases having the highest\nconditional entropy under the LM. The resulting set of keyphrases turns out to\nsolve a relevant information-theoretic problem: if provided as side\ninformation, it leads to the expected minimal binary code length in compressing\nthe text using the LM and an entropy encoder. Alternately, the resulting set is\nan approximation via a causal LM to the set of phrases that minimize the\nentropy of the text when conditioned upon it. Empirically, the method provides\nresults comparable to the most commonly used methods in various keyphrase\nextraction benchmark challenges.\n","authors":["Alexander Tsvetkov","Alon Kipnis"],"pdf_url":"https://arxiv.org/pdf/2308.13399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15517v1","updated":"2023-08-29T16:58:03Z","published":"2023-08-29T16:58:03Z","title":"Document AI: A Comparative Study of Transformer-Based, Graph-Based\n Models, and Convolutional Neural Networks For Document Layout Analysis","summary":" Document AI aims to automatically analyze documents by leveraging natural\nlanguage processing and computer vision techniques. One of the major tasks of\nDocument AI is document layout analysis, which structures document pages by\ninterpreting the content and spatial relationships of layout, image, and text.\nThis task can be image-centric, wherein the aim is to identify and label\nvarious regions such as authors and paragraphs, or text-centric, where the\nfocus is on classifying individual words in a document. Although there are\nincreasingly sophisticated methods for improving layout analysis, doubts remain\nabout the extent to which their findings can be generalized to a broader\ncontext. Specifically, prior work developed systems based on very different\narchitectures, such as transformer-based, graph-based, and CNNs. However, no\nwork has mentioned the effectiveness of these models in a comparative analysis.\nMoreover, while language-independent Document AI models capable of knowledge\ntransfer have been developed, it remains to be investigated to what degree they\ncan effectively transfer knowledge. In this study, we aim to fill these gaps by\nconducting a comparative evaluation of state-of-the-art models in document\nlayout analysis and investigating the potential of cross-lingual layout\nanalysis by utilizing machine translation techniques.\n","authors":["Sotirios Kastanas","Shaomu Tan","Yi He"],"pdf_url":"https://arxiv.org/pdf/2308.15517v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.15479v1","updated":"2023-08-29T17:58:55Z","published":"2023-08-29T17:58:55Z","title":"3D Adversarial Augmentations for Robust Out-of-Domain Predictions","summary":" Since real-world training datasets cannot properly sample the long tail of\nthe underlying data distribution, corner cases and rare out-of-domain samples\ncan severely hinder the performance of state-of-the-art models. This problem\nbecomes even more severe for dense tasks, such as 3D semantic segmentation,\nwhere points of non-standard objects can be confidently associated to the wrong\nclass. In this work, we focus on improving the generalization to out-of-domain\ndata. We achieve this by augmenting the training set with adversarial examples.\nFirst, we learn a set of vectors that deform the objects in an adversarial\nfashion. To prevent the adversarial examples from being too far from the\nexisting data distribution, we preserve their plausibility through a series of\nconstraints, ensuring sensor-awareness and shapes smoothness. Then, we perform\nadversarial augmentation by applying the learned sample-independent vectors to\nthe available objects when training a model. We conduct extensive experiments\nacross a variety of scenarios on data from KITTI, Waymo, and CrashD for 3D\nobject detection, and on data from SemanticKITTI, Waymo, and nuScenes for 3D\nsemantic segmentation. Despite training on a standard single dataset, our\napproach substantially improves the robustness and generalization of both 3D\nobject detection and 3D semantic segmentation methods to out-of-domain data.\n","authors":["Alexander Lehner","Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2308.15479v1.pdf","comment":"37 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.15478v1","updated":"2023-08-29T17:57:20Z","published":"2023-08-29T17:57:20Z","title":"An Adaptive Tangent Feature Perspective of Neural Networks","summary":" In order to better understand feature learning in neural networks, we propose\na framework for understanding linear models in tangent feature space where the\nfeatures are allowed to be transformed during training. We consider linear\ntransformations of features, resulting in a joint optimization over parameters\nand transformations with a bilinear interpolation constraint. We show that this\noptimization problem has an equivalent linearly constrained optimization with\nstructured regularization that encourages approximately low rank solutions.\nSpecializing to neural network structure, we gain insights into how the\nfeatures and thus the kernel function change, providing additional nuance to\nthe phenomenon of kernel alignment when the target function is poorly\nrepresented using tangent features. In addition to verifying our theoretical\nobservations in real neural networks on a simple regression problem, we\nempirically show that an adaptive feature implementation of tangent feature\nclassification has an order of magnitude lower sample complexity than the fixed\ntangent feature model on MNIST and CIFAR-10.\n","authors":["Daniel LeJeune","Sina Alemohammad"],"pdf_url":"https://arxiv.org/pdf/2308.15478v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.15474v1","updated":"2023-08-29T17:52:10Z","published":"2023-08-29T17:52:10Z","title":"A General-Purpose Self-Supervised Model for Computational Pathology","summary":" Tissue phenotyping is a fundamental computational pathology (CPath) task in\nlearning objective characterizations of histopathologic biomarkers in anatomic\npathology. However, whole-slide imaging (WSI) poses a complex computer vision\nproblem in which the large-scale image resolutions of WSIs and the enormous\ndiversity of morphological phenotypes preclude large-scale data annotation.\nCurrent efforts have proposed using pretrained image encoders with either\ntransfer learning from natural image datasets or self-supervised pretraining on\npublicly-available histopathology datasets, but have not been extensively\ndeveloped and evaluated across diverse tissue types at scale. We introduce UNI,\na general-purpose self-supervised model for pathology, pretrained using over\n100 million tissue patches from over 100,000 diagnostic haematoxylin and\neosin-stained WSIs across 20 major tissue types, and evaluated on 33\nrepresentative CPath clinical tasks in CPath of varying diagnostic\ndifficulties. In addition to outperforming previous state-of-the-art models, we\ndemonstrate new modeling capabilities in CPath such as resolution-agnostic\ntissue classification, slide classification using few-shot class prototypes,\nand disease subtyping generalization in classifying up to 108 cancer types in\nthe OncoTree code classification system. UNI advances unsupervised\nrepresentation learning at scale in CPath in terms of both pretraining data and\ndownstream evaluation, enabling data-efficient AI models that can generalize\nand transfer to a gamut of diagnostically-challenging tasks and clinical\nworkflows in anatomic pathology.\n","authors":["Richard J. Chen","Tong Ding","Ming Y. Lu","Drew F. K. Williamson","Guillaume Jaume","Bowen Chen","Andrew Zhang","Daniel Shao","Andrew H. Song","Muhammad Shaban","Mane Williams","Anurag Vaidya","Sharifa Sahai","Lukas Oldenburg","Luca L. Weishaupt","Judy J. Wang","Walt Williams","Long Phi Le","Georg Gerber","Faisal Mahmood"],"pdf_url":"https://arxiv.org/pdf/2308.15474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15472v1","updated":"2023-08-29T17:51:22Z","published":"2023-08-29T17:51:22Z","title":"Learning Modulated Transformation in GANs","summary":" The success of style-based generators largely benefits from style modulation,\nwhich helps take care of the cross-instance variation within data. However, the\ninstance-wise stochasticity is typically introduced via regular convolution,\nwhere kernels interact with features at some fixed locations, limiting its\ncapacity for modeling geometric variation. To alleviate this problem, we equip\nthe generator in generative adversarial networks (GANs) with a plug-and-play\nmodule, termed as modulated transformation module (MTM). This module predicts\nspatial offsets under the control of latent codes, based on which the\nconvolution operation can be applied at variable locations for different\ninstances, and hence offers the model an additional degree of freedom to handle\ngeometry deformation. Extensive experiments suggest that our approach can be\nfaithfully generalized to various generative tasks, including image generation,\n3D-aware image synthesis, and video generation, and get compatible with\nstate-of-the-art frameworks without any hyper-parameter tuning. It is\nnoteworthy that, towards human generation on the challenging TaiChi dataset, we\nimprove the FID of StyleGAN3 from 21.36 to 13.60, demonstrating the efficacy of\nlearning modulated geometry transformation.\n","authors":["Ceyuan Yang","Qihang Zhang","Yinghao Xu","Jiapeng Zhu","Yujun Shen","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2308.15472v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2308.15469v1","updated":"2023-08-29T17:48:33Z","published":"2023-08-29T17:48:33Z","title":"Multimodal Contrastive Learning and Tabular Attention for Automated\n Alzheimer's Disease Prediction","summary":" Alongside neuroimaging such as MRI scans and PET, Alzheimer's disease (AD)\ndatasets contain valuable tabular data including AD biomarkers and clinical\nassessments. Existing computer vision approaches struggle to utilize this\nadditional information. To address these needs, we propose a generalizable\nframework for multimodal contrastive learning of image data and tabular data, a\nnovel tabular attention module for amplifying and ranking salient features in\ntables, and the application of these techniques onto Alzheimer's disease\nprediction. Experimental evaulations demonstrate the strength of our framework\nby detecting Alzheimer's disease (AD) from over 882 MR image slices from the\nADNI database. We take advantage of the high interpretability of tabular data\nand our novel tabular attention approach and through attribution of the\nattention scores for each row of the table, we note and rank the most\npredominant features. Results show that the model is capable of an accuracy of\nover 83.8%, almost a 10% increase from previous state of the art.\n","authors":["Weichen Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15466v1","updated":"2023-08-29T17:47:42Z","published":"2023-08-29T17:47:42Z","title":"Input margins can predict generalization too","summary":" Understanding generalization in deep neural networks is an active area of\nresearch. A promising avenue of exploration has been that of margin\nmeasurements: the shortest distance to the decision boundary for a given sample\nor its representation internal to the network. While margins have been shown to\nbe correlated with the generalization ability of a model when measured at its\nhidden representations (hidden margins), no such link between large margins and\ngeneralization has been established for input margins. We show that while input\nmargins are not generally predictive of generalization, they can be if the\nsearch space is appropriately constrained. We develop such a measure based on\ninput margins, which we refer to as `constrained margins'. The predictive power\nof this new measure is demonstrated on the 'Predicting Generalization in Deep\nLearning' (PGDL) dataset and contrasted with hidden representation margins. We\nfind that constrained margins achieve highly competitive scores and outperform\nother margin measurements in general. This provides a novel insight on the\nrelationship between generalization and classification margins, and highlights\nthe importance of considering the data manifold for investigations of\ngeneralization in DNNs.\n","authors":["Coenraad Mouton","Marthinus W. Theunissen","Marelie H. Davel"],"pdf_url":"https://arxiv.org/pdf/2308.15466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15462v1","updated":"2023-08-29T17:40:57Z","published":"2023-08-29T17:40:57Z","title":"Online Overexposed Pixels Hallucination in Videos with Adaptive\n Reference Frame Selection","summary":" Low dynamic range (LDR) cameras cannot deal with wide dynamic range inputs,\nfrequently leading to local overexposure issues. We present a learning-based\nsystem to reduce these artifacts without resorting to complex acquisition\nmechanisms like alternating exposures or costly processing that are typical of\nhigh dynamic range (HDR) imaging. We propose a transformer-based deep neural\nnetwork (DNN) to infer the missing HDR details. In an ablation study, we show\nthe importance of using a multiscale DNN and train it with the proper cost\nfunction to achieve state-of-the-art quality. To aid the reconstruction of the\noverexposed areas, our DNN takes a reference frame from the past as an\nadditional input. This leverages the commonly occurring temporal instabilities\nof autoexposure to our advantage: since well-exposed details in the current\nframe may be overexposed in the future, we use reinforcement learning to train\na reference frame selection DNN that decides whether to adopt the current frame\nas a future reference. Without resorting to alternating exposures, we obtain\ntherefore a causal, HDR hallucination algorithm with potential application in\ncommon video acquisition settings. Our demo video can be found at\nhttps://drive.google.com/file/d/1-r12BKImLOYCLUoPzdebnMyNjJ4Rk360/view\n","authors":["Yazhou Xing","Amrita Mazumdar","Anjul Patney","Chao Liu","Hongxu Yin","Qifeng Chen","Jan Kautz","Iuri Frosio"],"pdf_url":"https://arxiv.org/pdf/2308.15462v1.pdf","comment":"The demo video can be found at\n https://drive.google.com/file/d/1-r12BKImLOYCLUoPzdebnMyNjJ4Rk360/view"},{"id":"http://arxiv.org/abs/2301.13803v2","updated":"2023-08-29T17:38:45Z","published":"2023-01-31T17:44:59Z","title":"Fairness-aware Vision Transformer via Debiased Self-Attention","summary":" Vision Transformer (ViT) has recently gained significant interest in solving\ncomputer vision (CV) problems due to its capability of extracting informative\nfeatures and modeling long-range dependencies through the self-attention\nmechanism. To fully realize the advantages of ViT in real-world applications,\nrecent works have explored the trustworthiness of ViT, including its robustness\nand explainability. However, another desiderata, fairness has not yet been\nadequately addressed in the literature. We establish that the existing\nfairness-aware algorithms (primarily designed for CNNs) do not perform well on\nViT. This necessitates the need for developing our novel framework via Debiased\nSelf-Attention (DSA). DSA is a fairness-through-blindness approach that\nenforces ViT to eliminate spurious features correlated with the sensitive\nattributes for bias mitigation. Notably, adversarial examples are leveraged to\nlocate and mask the spurious features in the input image patches. In addition,\nDSA utilizes an attention weights alignment regularizer in the training\nobjective to encourage learning informative features for target prediction.\nImportantly, our DSA framework leads to improved fairness guarantees over prior\nworks on multiple prediction tasks without compromising target prediction\nperformance.\n","authors":["Yao Qiang","Chengyin Li","Prashant Khanduri","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2301.13803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15461v1","updated":"2023-08-29T17:38:33Z","published":"2023-08-29T17:38:33Z","title":"Canonical Factors for Hybrid Neural Fields","summary":" Factored feature volumes offer a simple way to build more compact, efficient,\nand intepretable neural fields, but also introduce biases that are not\nnecessarily beneficial for real-world data. In this work, we (1) characterize\nthe undesirable biases that these architectures have for axis-aligned signals\n-- they can lead to radiance field reconstruction differences of as high as 2\nPSNR -- and (2) explore how learning a set of canonicalizing transformations\ncan improve representations by removing these biases. We prove in a\ntwo-dimensional model problem that simultaneously learning these\ntransformations together with scene appearance succeeds with drastically\nimproved efficiency. We validate the resulting architectures, which we call\nTILTED, using image, signed distance, and radiance field reconstruction tasks,\nwhere we observe improvements across quality, robustness, compactness, and\nruntime. Results demonstrate that TILTED can enable capabilities comparable to\nbaselines that are 2x larger, while highlighting weaknesses of neural field\nevaluation procedures.\n","authors":["Brent Yi","Weijia Zeng","Sam Buchanan","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2308.15461v1.pdf","comment":"ICCV 2023. Project webpage: https://brentyi.github.io/tilted/"},{"id":"http://arxiv.org/abs/2308.15453v1","updated":"2023-08-29T17:23:33Z","published":"2023-08-29T17:23:33Z","title":"Pseudo-Boolean Polynomials Approach To Edge Detection And Image\n Segmentation","summary":" We introduce a deterministic approach to edge detection and image\nsegmentation by formulating pseudo-Boolean polynomials on image patches. The\napproach works by applying a binary classification of blob and edge regions in\nan image based on the degrees of pseudo-Boolean polynomials calculated on\npatches extracted from the provided image. We test our method on simple images\ncontaining primitive shapes of constant and contrasting colour and establish\nthe feasibility before applying it to complex instances like aerial landscape\nimages. The proposed method is based on the exploitation of the reduction,\npolynomial degree, and equivalence properties of penalty-based pseudo-Boolean\npolynomials.\n","authors":["Tendai Mapungwana Chikake","Boris Goldengorin","Alexey Samosyuk"],"pdf_url":"https://arxiv.org/pdf/2308.15453v1.pdf","comment":"14 pages, 8 figures, submitted to the International Conference Data\n Analysis, Optimization and Their Applications on the Occasion of Boris\n Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region,\n Moscow Institute of Physics and Technology\n https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php"},{"id":"http://arxiv.org/abs/2112.09153v2","updated":"2023-08-29T17:04:19Z","published":"2021-12-16T19:00:55Z","title":"An Empirical Investigation of the Role of Pre-training in Lifelong\n Learning","summary":" The lifelong learning paradigm in machine learning is an attractive\nalternative to the more prominent isolated learning scheme not only due to its\nresemblance to biological learning but also its potential to reduce energy\nwaste by obviating excessive model re-training. A key challenge to this\nparadigm is the phenomenon of catastrophic forgetting. With the increasing\npopularity and success of pre-trained models in machine learning, we pose the\nquestion: What role does pre-training play in lifelong learning, specifically\nwith respect to catastrophic forgetting? We investigate existing methods in the\ncontext of large, pre-trained models and evaluate their performance on a\nvariety of text and image classification tasks, including a large-scale study\nusing a novel data set of 15 diverse NLP tasks. Across all settings, we observe\nthat generic pre-training implicitly alleviates the effects of catastrophic\nforgetting when learning multiple tasks sequentially compared to randomly\ninitialized models. We then further investigate why pre-training alleviates\nforgetting in this setting. We study this phenomenon by analyzing the loss\nlandscape, finding that pre-trained weights appear to ease forgetting by\nleading to wider minima. Based on this insight, we propose jointly optimizing\nfor current task loss and loss basin sharpness to explicitly encourage wider\nbasins during sequential fine-tuning. We show that this optimization approach\noutperforms several state-of-the-art task-sequential continual learning\nalgorithms across multiple settings, occasionally even without retaining a\nmemory that scales in size with the number of tasks.\n","authors":["Sanket Vaibhav Mehta","Darshan Patil","Sarath Chandar","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2112.09153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15427v1","updated":"2023-08-29T16:33:16Z","published":"2023-08-29T16:33:16Z","title":"Complementing Onboard Sensors with Satellite Map: A New Perspective for\n HD Map Construction","summary":" High-Definition (HD) maps play a crucial role in autonomous driving systems.\nRecent methods have attempted to construct HD maps in real-time based on\ninformation obtained from vehicle onboard sensors. However, the performance of\nthese methods is significantly susceptible to the environment surrounding the\nvehicle due to the inherent limitation of onboard sensors, such as weak\ncapacity for long-range detection. In this study, we demonstrate that\nsupplementing onboard sensors with satellite maps can enhance the performance\nof HD map construction methods, leveraging the broad coverage capability of\nsatellite maps. For the purpose of further research, we release the satellite\nmap tiles as a complementary dataset of nuScenes dataset. Meanwhile, we propose\na hierarchical fusion module that enables better fusion of satellite maps\ninformation with existing methods. Specifically, we design an attention mask\nbased on segmentation and distance, applying the cross-attention mechanism to\nfuse onboard Bird's Eye View (BEV) features and satellite features in\nfeature-level fusion. An alignment module is introduced before concatenation in\nBEV-level fusion to mitigate the impact of misalignment between the two\nfeatures. The experimental results on the augmented nuScenes dataset showcase\nthe seamless integration of our module into three existing HD map construction\nmethods. It notably enhances their performance in both HD map semantic\nsegmentation and instance detection tasks.\n","authors":["Wenjie Gao","Jiawei Fu","Haodong Jing","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.15427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15413v1","updated":"2023-08-29T16:13:04Z","published":"2023-08-29T16:13:04Z","title":"WrappingNet: Mesh Autoencoder via Deep Sphere Deformation","summary":" There have been recent efforts to learn more meaningful representations via\nfixed length codewords from mesh data, since a mesh serves as a complete model\nof underlying 3D shape compared to a point cloud. However, the mesh\nconnectivity presents new difficulties when constructing a deep learning\npipeline for meshes. Previous mesh unsupervised learning approaches typically\nassume category-specific templates, e.g., human face/body templates. It\nrestricts the learned latent codes to only be meaningful for objects in a\nspecific category, so the learned latent spaces are unable to be used across\ndifferent types of objects. In this work, we present WrappingNet, the first\nmesh autoencoder enabling general mesh unsupervised learning over heterogeneous\nobjects. It introduces a novel base graph in the bottleneck dedicated to\nrepresenting mesh connectivity, which is shown to facilitate learning a shared\nlatent space representing object shape. The superiority of WrappingNet mesh\nlearning is further demonstrated via improved reconstruction quality and\ncompetitive classification compared to point cloud learning, as well as latent\ninterpolation between meshes of different categories.\n","authors":["Eric Lei","Muhammad Asad Lodhi","Jiahao Pang","Junghyun Ahn","Dong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.15413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15405v1","updated":"2023-08-29T16:07:18Z","published":"2023-08-29T16:07:18Z","title":"Robust Long-Tailed Learning via Label-Aware Bounded CVaR","summary":" Data in the real-world classification problems are always imbalanced or\nlong-tailed, wherein the majority classes have the most of the samples that\ndominate the model training. In such setting, the naive model tends to have\npoor performance on the minority classes. Previously, a variety of loss\nmodifications have been proposed to address the long-tailed leaning problem,\nwhile these methods either treat the samples in the same class\nindiscriminatingly or lack a theoretical guarantee. In this paper, we propose\ntwo novel approaches based on CVaR (Conditional Value at Risk) to improve the\nperformance of long-tailed learning with a solid theoretical ground.\nSpecifically, we firstly introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss\nto overcome the pessimistic result of the original CVaR, and further design the\noptimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we\nadditionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to\nstabilize the optimization process, where we also offer the theoretical\nsupport. Extensive experiments on real-world datasets with long-tailed label\ndistributions verify the superiority of our proposed methods.\n","authors":["Hong Zhu","Runpeng Yu","Xing Tang","Yifei Wang","Yuan Fang","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12896v2","updated":"2023-08-29T15:57:02Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.}\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v2.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2308.15397v1","updated":"2023-08-29T15:56:38Z","published":"2023-08-29T15:56:38Z","title":"Color Aesthetics: Fuzzy based User-driven Method for Harmony and\n Preference Prediction","summary":" Color is the most important intrinsic sensory feature that has a powerful\nimpact on product sales. Color is even responsible for raising the aesthetic\nsenses in our brains. Account for individual differences is crucial in color\naesthetics. It requires user-driven mechanisms for various e-commerce\napplications. We propose a method for quantitative evaluation of all types of\nperceptual responses to color(s): distinct color preference, color harmony, and\ncolor combination preference. Preference for color schemes can be predicted by\ncombining preferences for the basic colors and ratings of color harmony.\nHarmonious pallets are extracted from big data set using comparison algorithms\nbased on fuzzy similarity and grouping. The proposed model results in useful\npredictions of harmony and preference of multicolored images. For example, in\nthe context of apparel coordination, it allows predicting a preference for a\nlook based on clothing colors. Our approach differs from standard aesthetic\nmodels, since in accounts for a personal variation. In addition, it can process\nnot only lower-order color pairs, but also groups of several colors.\n","authors":["Pakizar Shamoi","Atsushi Inoue","Hiroharu Kawanaka"],"pdf_url":"https://arxiv.org/pdf/2308.15397v1.pdf","comment":"It was accepted as a short paper. IFSA-SCIS 2017 Conference held in\n Otsu, Japan"},{"id":"http://arxiv.org/abs/2308.15386v1","updated":"2023-08-29T15:29:06Z","published":"2023-08-29T15:29:06Z","title":"Shape-Margin Knowledge Augmented Network for Thyroid Nodule Segmentation\n and Diagnosis","summary":" Thyroid nodule segmentation is a crucial step in the diagnostic procedure of\nphysicians and computer-aided diagnosis systems. Mostly, current studies treat\nsegmentation and diagnosis as independent tasks without considering the\ncorrelation between these tasks. The sequence steps of these independent tasks\nin computer-aided diagnosis systems may lead to the accumulation of errors.\nTherefore, it is worth combining them as a whole through exploring the\nrelationship between thyroid nodule segmentation and diagnosis. According to\nthe thyroid imaging reporting and data system (TI-RADS), the assessment of\nshape and margin characteristics is the prerequisite for the discrimination of\nbenign and malignant thyroid nodules. These characteristics can be observed in\nthe thyroid nodule segmentation masks. Inspired by the diagnostic procedure of\nTI-RADS, this paper proposes a shape-margin knowledge augmented network\n(SkaNet) for simultaneously thyroid nodule segmentation and diagnosis. Due to\nthe similarity in visual features between segmentation and diagnosis, SkaNet\nshares visual features in the feature extraction stage and then utilizes a\ndual-branch architecture to perform thyroid nodule segmentation and diagnosis\ntasks simultaneously. To enhance effective discriminative features, an\nexponential mixture module is devised, which incorporates convolutional feature\nmaps and self-attention maps by exponential weighting. Then, SkaNet is jointly\noptimized by a knowledge augmented multi-task loss function with a constraint\npenalty term. It embeds shape and margin characteristics through numerical\ncomputation and models the relationship between the thyroid nodule diagnosis\nresults and segmentation masks.\n","authors":["Weihua Liu","Chaochao Lin"],"pdf_url":"https://arxiv.org/pdf/2308.15386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00371v2","updated":"2023-08-29T15:25:30Z","published":"2023-07-01T15:48:33Z","title":"Learning Content-enhanced Mask Transformer for Domain Generalized\n Urban-Scene Segmentation","summary":" Domain-generalized urban-scene semantic segmentation (USSS) aims to learn\ngeneralized semantic predictions across diverse urban-scene styles. Unlike\ndomain gap challenges, USSS is unique in that the semantic categories are often\nsimilar in different urban scenes, while the styles can vary significantly due\nto changes in urban landscapes, weather conditions, lighting, and other\nfactors. Existing approaches typically rely on convolutional neural networks\n(CNNs) to learn the content of urban scenes.\n In this paper, we propose a Content-enhanced Mask TransFormer (CMFormer) for\ndomain-generalized USSS. The main idea is to enhance the focus of the\nfundamental component, the mask attention mechanism, in Transformer\nsegmentation models on content information. To achieve this, we introduce a\nnovel content-enhanced mask attention mechanism. It learns mask queries from\nboth the image feature and its down-sampled counterpart, as lower-resolution\nimage features usually contain more robust content information and are less\nsensitive to style variations. These features are fused into a Transformer\ndecoder and integrated into a multi-resolution content-enhanced mask attention\nlearning scheme.\n Extensive experiments conducted on various domain-generalized urban-scene\nsegmentation datasets demonstrate that the proposed CMFormer significantly\noutperforms existing CNN-based methods for domain-generalized semantic\nsegmentation, achieving improvements of up to 14.00\\% in terms of mIoU (mean\nintersection over union). The source code for CMFormer will be made available\nat this\n\\href{https://github.com/BiQiWHU/domain-generalized-urban-scene-segmentation}{repository}.\n","authors":["Qi Bi","Shaodi You","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2307.00371v2.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.15378v1","updated":"2023-08-29T15:16:51Z","published":"2023-08-29T15:16:51Z","title":"On the Robustness of Object Detection Models in Aerial Images","summary":" The robustness of object detection models is a major concern when applied to\nreal-world scenarios. However, the performance of most object detection models\ndegrades when applied to images subjected to corruptions, since they are\nusually trained and evaluated on clean datasets. Enhancing the robustness of\nobject detection models is of utmost importance, especially for those designed\nfor aerial images, which feature complex backgrounds, substantial variations in\nscales and orientations of objects. This paper addresses the challenge of\nassessing the robustness of object detection models in aerial images, with a\nspecific emphasis on scenarios where images are affected by clouds. In this\nstudy, we introduce two novel benchmarks based on DOTA-v1.0. The first\nbenchmark encompasses 19 prevalent corruptions, while the second focuses on\ncloud-corrupted images-a phenomenon uncommon in natural pictures yet frequent\nin aerial photography. We systematically evaluate the robustness of mainstream\nobject detection models and perform numerous ablation experiments. Through our\ninvestigations, we find that enhanced model architectures, larger networks,\nwell-crafted modules, and judicious data augmentation strategies collectively\nenhance the robustness of aerial object detection models. The benchmarks we\npropose and our comprehensive experimental analyses can facilitate research on\nrobust object detection in aerial images. Codes and datasets are available at:\n(https://github.com/hehaodong530/DOTA-C)\n","authors":["Haodong He","Jian Ding","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2308.15378v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.15367v1","updated":"2023-08-29T15:03:05Z","published":"2023-08-29T15:03:05Z","title":"Efficient Model Personalization in Federated Learning via\n Client-Specific Prompt Generation","summary":" Federated learning (FL) emerges as a decentralized learning framework which\ntrains models from multiple distributed clients without sharing their data to\npreserve privacy. Recently, large-scale pre-trained models (e.g., Vision\nTransformer) have shown a strong capability of deriving robust representations.\nHowever, the data heterogeneity among clients, the limited computation\nresources, and the communication bandwidth restrict the deployment of\nlarge-scale models in FL frameworks. To leverage robust representations from\nlarge-scale models while enabling efficient model personalization for\nheterogeneous clients, we propose a novel personalized FL framework of\nclient-specific Prompt Generation (pFedPG), which learns to deploy a\npersonalized prompt generator at the server for producing client-specific\nvisual prompts that efficiently adapts frozen backbones to local data\ndistributions. Our proposed framework jointly optimizes the stages of\npersonalized prompt adaptation locally and personalized prompt generation\nglobally. The former aims to train visual prompts that adapt foundation models\nto each client, while the latter observes local optimization directions to\ngenerate personalized prompts for all clients. Through extensive experiments on\nbenchmark datasets, we show that our pFedPG is favorable against\nstate-of-the-art personalized FL methods under various types of data\nheterogeneity, allowing computation and communication efficient model\npersonalization.\n","authors":["Fu-En Yang","Chien-Yi Wang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15367v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15366v1","updated":"2023-08-29T15:02:53Z","published":"2023-08-29T15:02:53Z","title":"AnomalyGPT: Detecting Industrial Anomalies using Large Vision-Language\n Models","summary":" Large Vision-Language Models (LVLMs) such as MiniGPT-4 and LLaVA have\ndemonstrated the capability of understanding images and achieved remarkable\nperformance in various visual tasks. Despite their strong abilities in\nrecognizing common objects due to extensive training datasets, they lack\nspecific domain knowledge and have a weaker understanding of localized details\nwithin objects, which hinders their effectiveness in the Industrial Anomaly\nDetection (IAD) task. On the other hand, most existing IAD methods only provide\nanomaly scores and necessitate the manual setting of thresholds to distinguish\nbetween normal and abnormal samples, which restricts their practical\nimplementation. In this paper, we explore the utilization of LVLM to address\nthe IAD problem and propose AnomalyGPT, a novel IAD approach based on LVLM. We\ngenerate training data by simulating anomalous images and producing\ncorresponding textual descriptions for each image. We also employ an image\ndecoder to provide fine-grained semantic and design a prompt learner to\nfine-tune the LVLM using prompt embeddings. Our AnomalyGPT eliminates the need\nfor manual threshold adjustments, thus directly assesses the presence and\nlocations of anomalies. Additionally, AnomalyGPT supports multi-turn dialogues\nand exhibits impressive few-shot in-context learning capabilities. With only\none normal shot, AnomalyGPT achieves the state-of-the-art performance with an\naccuracy of 86.1%, an image-level AUC of 94.1%, and a pixel-level AUC of 95.3%\non the MVTec-AD dataset. Code is available at\nhttps://github.com/CASIA-IVA-Lab/AnomalyGPT.\n","authors":["Zhaopeng Gu","Bingke Zhu","Guibo Zhu","Yingying Chen","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10720v2","updated":"2023-08-29T14:59:28Z","published":"2023-06-19T06:41:19Z","title":"Exploring the Relationship between Samples and Masks for Robust Defect\n Localization","summary":" Defect detection aims to detect and localize regions out of the normal\ndistribution.Previous approaches model normality and compare it with the input\nto identify defective regions, potentially limiting their generalizability.This\npaper proposes a one-stage framework that detects defective patterns directly\nwithout the modeling process.This ability is adopted through the joint efforts\nof three parties: a generative adversarial network (GAN), a newly proposed\nscaled pattern loss, and a dynamic masked cycle-consistent auxiliary network.\nExplicit information that could indicate the position of defects is\nintentionally excluded to avoid learning any direct mapping.Experimental\nresults on the texture class of the challenging MVTec AD dataset show that the\nproposed method is 2.9\\% higher than the SOTA methods in F1-Score, while\nsubstantially outperforming SOTA methods in generalizability.\n","authors":["Jiang Lin","Yaping yan"],"pdf_url":"https://arxiv.org/pdf/2306.10720v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15357v1","updated":"2023-08-29T14:53:16Z","published":"2023-08-29T14:53:16Z","title":"Ego-Motion Estimation and Dynamic Motion Separation from 3D Point Clouds\n for Accumulating Data and Improving 3D Object Detection","summary":" New 3+1D high-resolution radar sensors are gaining importance for 3D object\ndetection in the automotive domain due to their relative affordability and\nimproved detection compared to classic low-resolution radar sensors. One\nlimitation of high-resolution radar sensors, compared to lidar sensors, is the\nsparsity of the generated point cloud. This sparsity could be partially\novercome by accumulating radar point clouds of subsequent time steps. This\ncontribution analyzes limitations of accumulating radar point clouds on the\nView-of-Delft dataset. By employing different ego-motion estimation approaches,\nthe dataset's inherent constraints, and possible solutions are analyzed.\nAdditionally, a learning-based instance motion estimation approach is deployed\nto investigate the influence of dynamic motion on the accumulated point cloud\nfor object detection. Experiments document an improved object detection\nperformance by applying an ego-motion estimation and dynamic motion correction\napproach.\n","authors":["Patrick Palmer","Martin Krueger","Richard Altendorfer","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2308.15357v1.pdf","comment":"Published at: AmE 2023 - Automotive meets Electronics; 14. GMM\n Symposium (https://ieeexplore.ieee.org/document/10227711)"},{"id":"http://arxiv.org/abs/2307.12676v5","updated":"2023-08-29T14:48:37Z","published":"2023-07-24T10:30:54Z","title":"Few-shot $\\mathbf{1/a}$ Anomalies Feedback : Damage Vision Mining\n Opportunity and Embedding Feature Imbalance","summary":" Over the past decade, previous balanced datasets have been used to advance\ndeep learning algorithms for industrial applications. In urban infrastructures\nand living environments, damage data mining cannot avoid imbalanced data issues\nbecause of rare unseen events and the high-quality status of improved\noperations. For visual inspection, the deteriorated class acquired from the\nsurface of concrete and steel components are occasionally imbalanced. From\nnumerous related surveys, we conclude that imbalanced data problems can be\ncategorised into four types: 1) missing range of target and label valuables, 2)\nmajority-minority class imbalance, 3) foreground background of spatial\nimbalance, and 4) long-tailed class of pixel-wise imbalance. Since 2015, many\nimbalanced studies have been conducted using deep-learning approaches,\nincluding regression, image classification, object detection, and semantic\nsegmentation. However, anomaly detection for imbalanced data is not well known.\nIn this study, we highlight a one-class anomaly detection application, whether\nanomalous class or not, and demonstrate clear examples of imbalanced vision\ndatasets: medical disease, hazardous behaviour, material deterioration, plant\ndisease, river sludge, and disaster damage. We provide key results on the\nadvantage of damage-vision mining, hypothesising that the more effective the\nrange of the positive ratio, the higher the accuracy gain of the anomalies\nfeedback. In our imbalanced studies, compared with the balanced case with a\npositive ratio of $1/1$, we find that there is an applicable positive ratio\n$1/a$ where the accuracy is consistently high. However, the extremely\nimbalanced range is from one shot to $1/2a$, the accuracy of which is inferior\nto that of the applicable ratio. In contrast, with a positive ratio ranging\nover $2/a$, it shifts in the over-mining phase without an effective gain in\naccuracy.\n","authors":["Takato Yasuno"],"pdf_url":"https://arxiv.org/pdf/2307.12676v5.pdf","comment":"34 pages, 53 figures, 28 tables"},{"id":"http://arxiv.org/abs/2308.15353v1","updated":"2023-08-29T14:48:29Z","published":"2023-08-29T14:48:29Z","title":"Detect, Augment, Compose, and Adapt: Four Steps for Unsupervised Domain\n Adaptation in Object Detection","summary":" Unsupervised domain adaptation (UDA) plays a crucial role in object detection\nwhen adapting a source-trained detector to a target domain without annotated\ndata. In this paper, we propose a novel and effective four-step UDA approach\nthat leverages self-supervision and trains source and target data concurrently.\nWe harness self-supervised learning to mitigate the lack of ground truth in the\ntarget domain. Our method consists of the following steps: (1) identify the\nregion with the highest-confidence set of detections in each target image,\nwhich serve as our pseudo-labels; (2) crop the identified region and generate a\ncollection of its augmented versions; (3) combine these latter into a composite\nimage; (4) adapt the network to the target domain using the composed image.\nThrough extensive experiments under cross-camera, cross-weather, and\nsynthetic-to-real scenarios, our approach achieves state-of-the-art\nperformance, improving upon the nearest competitor by more than 2% in terms of\nmean Average Precision (mAP). The code is available at\nhttps://github.com/MohamedTEV/DACA.\n","authors":["Mohamed L. Mekhalfi","Davide Boscaini","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2308.15353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15346v1","updated":"2023-08-29T14:41:40Z","published":"2023-08-29T14:41:40Z","title":"Enhancing Mobile Face Anti-Spoofing: A Robust Framework for Diverse\n Attack Types under Screen Flash","summary":" Face anti-spoofing (FAS) is crucial for securing face recognition systems.\nHowever, existing FAS methods with handcrafted binary or pixel-wise labels have\nlimitations due to diverse presentation attacks (PAs). In this paper, we\npropose an attack type robust face anti-spoofing framework under light flash,\ncalled ATR-FAS. Due to imaging differences caused by various attack types,\ntraditional FAS methods based on single binary classification network may\nresult in excessive intra-class distance of spoof faces, leading to a challenge\nof decision boundary learning. Therefore, we employed multiple networks to\nreconstruct multi-frame depth maps as auxiliary supervision, and each network\nexperts in one type of attack. A dual gate module (DGM) consisting of a type\ngate and a frame-attention gate is introduced, which perform attack type\nrecognition and multi-frame attention generation, respectively. The outputs of\nDGM are utilized as weight to mix the result of multiple expert networks. The\nmulti-experts mixture enables ATR-FAS to generate spoof-differentiated depth\nmaps, and stably detects spoof faces without being affected by different types\nof PAs. Moreover, we design a differential normalization procedure to convert\noriginal flash frames into differential frames. This simple but effective\nprocessing enhances the details in flash frames, aiding in the generation of\ndepth maps. To verify the effectiveness of our framework, we collected a\nlarge-scale dataset containing 12,660 live and spoof videos with diverse PAs\nunder dynamic flash from the smartphone screen. Extensive experiments\nillustrate that the proposed ATR-FAS significantly outperforms existing\nstate-of-the-art methods. The code and dataset will be available at\nhttps://github.com/Chaochao-Lin/ATR-FAS.\n","authors":["Weihua Liu","Chaochao Lin","Yu Yan"],"pdf_url":"https://arxiv.org/pdf/2308.15346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15345v1","updated":"2023-08-29T14:41:10Z","published":"2023-08-29T14:41:10Z","title":"IndGIC: Supervised Action Recognition under Low Illumination","summary":" Technologies of human action recognition in the dark are gaining more and\nmore attention as huge demand in surveillance, motion control and\nhuman-computer interaction. However, because of limitation in image enhancement\nmethod and low-lighting video datasets, e.g. labeling cost, existing methods\nmeet some problems. Some video-based approached are effect and efficient in\nspecific datasets but cannot generalize to most cases while others methods\nusing multiple sensors rely heavily to prior knowledge to deal with noisy\nnature from video stream. In this paper, we proposes action recognition method\nusing deep multi-input network. Furthermore, we proposed a Independent Gamma\nIntensity Corretion (Ind-GIC) to enhance poor-illumination video, generating\none gamma for one frame to increase enhancement performance. To prove our\nmethod is effective, there is some evaluation and comparison between our method\nand existing methods. Experimental results show that our model achieves high\naccuracy in on ARID dataset.\n","authors":["Jingbo Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.15345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15344v1","updated":"2023-08-29T14:41:05Z","published":"2023-08-29T14:41:05Z","title":"Imperceptible Adversarial Attack on Deep Neural Networks from Image\n Boundary","summary":" Although Deep Neural Networks (DNNs), such as the convolutional neural\nnetworks (CNN) and Vision Transformers (ViTs), have been successfully applied\nin the field of computer vision, they are demonstrated to be vulnerable to\nwell-sought Adversarial Examples (AEs) that can easily fool the DNNs. The\nresearch in AEs has been active, and many adversarial attacks and explanations\nhave been proposed since they were discovered in 2014. The mystery of the AE's\nexistence is still an open question, and many studies suggest that DNN training\nalgorithms have blind spots. The salient objects usually do not overlap with\nboundaries; hence, the boundaries are not the DNN model's attention.\nNevertheless, recent studies show that the boundaries can dominate the behavior\nof the DNN models. Hence, this study aims to look at the AEs from a different\nperspective and proposes an imperceptible adversarial attack that systemically\nattacks the input image boundary for finding the AEs. The experimental results\nhave shown that the proposed boundary attacking method effectively attacks six\nCNN models and the ViT using only 32% of the input image content (from the\nboundaries) with an average success rate (SR) of 95.2% and an average peak\nsignal-to-noise ratio of 41.37 dB. Correlation analyses are conducted,\nincluding the relation between the adversarial boundary's width and the SR and\nhow the adversarial boundary changes the DNN model's attention. This paper's\ndiscoveries can potentially advance the understanding of AEs and provide a\ndifferent perspective on how AEs can be constructed.\n","authors":["Fahad Alrasheedi","Xin Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.15344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15327v1","updated":"2023-08-29T14:23:44Z","published":"2023-08-29T14:23:44Z","title":"Enhancing Robot Learning through Learned Human-Attention Feature Maps","summary":" Robust and efficient learning remains a challenging problem in robotics, in\nparticular with complex visual inputs. Inspired by human attention mechanism,\nwith which we quickly process complex visual scenes and react to changes in the\nenvironment, we think that embedding auxiliary information about focus point\ninto robot learning would enhance efficiency and robustness of the learning\nprocess. In this paper, we propose a novel approach to model and emulate the\nhuman attention with an approximate prediction model. We then leverage this\noutput and feed it as a structured auxiliary feature map into downstream\nlearning tasks. We validate this idea by learning a prediction model from\nhuman-gaze recordings of manual driving in the real world. We test our approach\non two learning tasks - object detection and imitation learning. Our\nexperiments demonstrate that the inclusion of predicted human attention leads\nto improved robustness of the trained models to out-of-distribution samples and\nfaster learning in low-data regime settings. Our work highlights the potential\nof incorporating structured auxiliary information in representation learning\nfor robotics and opens up new avenues for research in this direction. All code\nand data are available online.\n","authors":["Daniel Scheuchenstuhl","Stefan Ulmer","Felix Resch","Luigi Berducci","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2308.15327v1.pdf","comment":"This work has been accepted for the RAP4Robots workshop at ICRA 2023\n in London"},{"id":"http://arxiv.org/abs/2308.15323v1","updated":"2023-08-29T14:20:13Z","published":"2023-08-29T14:20:13Z","title":"Occlusion-Aware Deep Convolutional Neural Network via Homogeneous\n Tanh-transforms for Face Parsing","summary":" Face parsing infers a pixel-wise label map for each semantic facial\ncomponent. Previous methods generally work well for uncovered faces, however\noverlook the facial occlusion and ignore some contextual area outside a single\nface, especially when facial occlusion has become a common situation during the\nCOVID-19 epidemic. Inspired by the illumination theory of image, we propose a\nnovel homogeneous tanh-transforms for image preprocessing, which made up of\nfour tanh-transforms, that fuse the central vision and the peripheral vision\ntogether. Our proposed method addresses the dilemma of face parsing under\nocclusion and compresses more information of surrounding context. Based on\nhomogeneous tanh-transforms, we propose an occlusion-aware convolutional neural\nnetwork for occluded face parsing. It combines the information both in\nTanh-polar space and Tanh-Cartesian space, capable of enhancing receptive\nfields. Furthermore, we introduce an occlusion-aware loss to focus on the\nboundaries of occluded regions. The network is simple and flexible, and can be\ntrained end-to-end. To facilitate future research of occluded face parsing, we\nalso contribute a new cleaned face parsing dataset, which is manually purified\nfrom several academic or industrial datasets, including CelebAMask-HQ,\nShort-video Face Parsing as well as Helen dataset and will make it public.\nExperiments demonstrate that our method surpasses state-of-art methods of face\nparsing under occlusion.\n","authors":["Weihua Liu","Chaochao Lin","Haoping Yu","Said Boumaraf","Zhaoqiong Pi"],"pdf_url":"https://arxiv.org/pdf/2308.15323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15321v1","updated":"2023-08-29T14:16:09Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then attribute the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v1.pdf","comment":"7 pages, code available soon"},{"id":"http://arxiv.org/abs/2303.09790v4","updated":"2023-08-29T14:16:04Z","published":"2023-03-17T06:18:16Z","title":"Reliable Multimodality Eye Disease Screening via Mixture of Student's t\n Distributions","summary":" Multimodality eye disease screening is crucial in ophthalmology as it\nintegrates information from diverse sources to complement their respective\nperformances. However, the existing methods are weak in assessing the\nreliability of each unimodality, and directly fusing an unreliable modality may\ncause screening errors. To address this issue, we introduce a novel\nmultimodality evidential fusion pipeline for eye disease screening, EyeMoSt,\nwhich provides a measure of confidence for unimodality and elegantly integrates\nthe multimodality information from a multi-distribution fusion perspective.\nSpecifically, our model estimates both local uncertainty for unimodality and\nglobal uncertainty for the fusion modality to produce reliable classification\nresults. More importantly, the proposed mixture of Student's $t$ distributions\nadaptively integrates different modalities to endow the model with heavy-tailed\nproperties, increasing robustness and reliability. Our experimental findings on\nboth public and in-house datasets show that our model is more reliable than\ncurrent methods. Additionally, EyeMost has the potential ability to serve as a\ndata quality discriminator, enabling reliable decision-making for multimodality\neye disease screening.\n","authors":["Ke Zou","Tian Lin","Xuedong Yuan","Haoyu Chen","Xiaojing Shen","Meng Wang","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2303.09790v4.pdf","comment":"MICCAI 2023 (Early accept):11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.15316v1","updated":"2023-08-29T14:02:27Z","published":"2023-08-29T14:02:27Z","title":"3D-MuPPET: 3D Multi-Pigeon Pose Estimation and Tracking","summary":" Markerless methods for animal posture tracking have been developing recently,\nbut frameworks and benchmarks for tracking large animal groups in 3D are still\nlacking. To overcome this gap in the literature, we present 3D-MuPPET, a\nframework to estimate and track 3D poses of up to 10 pigeons at interactive\nspeed using multiple-views. We train a pose estimator to infer 2D keypoints and\nbounding boxes of multiple pigeons, then triangulate the keypoints to 3D. For\ncorrespondence matching, we first dynamically match 2D detections to global\nidentities in the first frame, then use a 2D tracker to maintain\ncorrespondences accross views in subsequent frames. We achieve comparable\naccuracy to a state of the art 3D pose estimator for Root Mean Square Error\n(RMSE) and Percentage of Correct Keypoints (PCK). We also showcase a novel use\ncase where our model trained with data of single pigeons provides comparable\nresults on data containing multiple pigeons. This can simplify the domain shift\nto new species because annotating single animal data is less labour intensive\nthan multi-animal data. Additionally, we benchmark the inference speed of\n3D-MuPPET, with up to 10 fps in 2D and 1.5 fps in 3D, and perform quantitative\ntracking evaluation, which yields encouraging results. Finally, we show that\n3D-MuPPET also works in natural environments without model fine-tuning on\nadditional annotations. To the best of our knowledge we are the first to\npresent a framework for 2D/3D posture and trajectory tracking that works in\nboth indoor and outdoor environments.\n","authors":["Urs Waldmann","Alex Hoi Hang Chan","Hemal Naik","Máté Nagy","Iain D. Couzin","Oliver Deussen","Bastian Goldluecke","Fumihiro Kano"],"pdf_url":"https://arxiv.org/pdf/2308.15316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15313v1","updated":"2023-08-29T14:00:55Z","published":"2023-08-29T14:00:55Z","title":"Spatio-temporal MLP-graph network for 3D human pose estimation","summary":" Graph convolutional networks and their variants have shown significant\npromise in 3D human pose estimation. Despite their success, most of these\nmethods only consider spatial correlations between body joints and do not take\ninto account temporal correlations, thereby limiting their ability to capture\nrelationships in the presence of occlusions and inherent ambiguity. To address\nthis potential weakness, we propose a spatio-temporal network architecture\ncomposed of a joint-mixing multi-layer perceptron block that facilitates\ncommunication among different joints and a graph weighted Jacobi network block\nthat enables communication among various feature channels. The major novelty of\nour approach lies in a new weighted Jacobi feature propagation rule obtained\nthrough graph filtering with implicit fairing. We leverage temporal information\nfrom the 2D pose sequences, and integrate weight modulation into the model to\nenable untangling of the feature transformations of distinct nodes. We also\nemploy adjacency modulation with the aim of learning meaningful correlations\nbeyond defined linkages between body joints by altering the graph topology\nthrough a learnable modulation matrix. Extensive experiments on two benchmark\ndatasets demonstrate the effectiveness of our model, outperforming recent\nstate-of-the-art methods for 3D human pose estimation.\n","authors":["Tanvir Hassan","A. Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2308.15313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03981v3","updated":"2023-08-29T13:50:43Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nfor recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check in the datasets of non-target categories retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07494v2","updated":"2023-08-29T13:43:37Z","published":"2023-07-14T17:27:22Z","title":"TALL: Thumbnail Layout for Deepfake Video Detection","summary":" The growing threats of deepfakes to society and cybersecurity have raised\nenormous public concerns, and increasing efforts have been devoted to this\ncritical topic of deepfake video detection. Existing video methods achieve good\nperformance but are computationally intensive. This paper introduces a simple\nyet effective strategy named Thumbnail Layout (TALL), which transforms a video\nclip into a pre-defined layout to realize the preservation of spatial and\ntemporal dependencies. Specifically, consecutive frames are masked in a fixed\nposition in each frame to improve generalization, then resized to sub-images\nand rearranged into a pre-defined layout as the thumbnail. TALL is\nmodel-agnostic and extremely simple by only modifying a few lines of code.\nInspired by the success of vision transformers, we incorporate TALL into Swin\nTransformer, forming an efficient and effective method TALL-Swin. Extensive\nexperiments on intra-dataset and cross-dataset validate the validity and\nsuperiority of TALL and SOTA TALL-Swin. TALL-Swin achieves 90.79$\\%$ AUC on the\nchallenging cross-dataset task, FaceForensics++ $\\to$ Celeb-DF. The code is\navailable at https://github.com/rainy-xu/TALL4Deepfake.\n","authors":["Yuting Xu","Jian Liang","Gengyun Jia","Ziming Yang","Yanhao Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2307.07494v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2304.04902v2","updated":"2023-08-29T13:42:27Z","published":"2023-04-11T00:17:34Z","title":"Weakly Supervised Intracranial Hemorrhage Segmentation using Head-Wise\n Gradient-Infused Self-Attention Maps from a Swin Transformer in Categorical\n Learning","summary":" Intracranial hemorrhage (ICH) is a life-threatening medical emergency that\nrequires timely and accurate diagnosis for effective treatment and improved\npatient survival rates. While deep learning techniques have emerged as the\nleading approach for medical image analysis and processing, the most commonly\nemployed supervised learning often requires large, high-quality annotated\ndatasets that can be costly to obtain, particularly for pixel/voxel-wise image\nsegmentation. To address this challenge and facilitate ICH treatment decisions,\nwe introduce a novel weakly supervised method for ICH segmentation, utilizing a\nSwin transformer trained on an ICH classification task with categorical labels.\nOur approach leverages a hierarchical combination of head-wise gradient-infused\nself-attention maps to generate accurate image segmentation. Additionally, we\nconducted an exploratory study on different learning strategies and showed that\nbinary ICH classification has a more positive impact on self-attention maps\ncompared to full ICH subtyping. With a mean Dice score of 0.44, our technique\nachieved similar ICH segmentation performance as the popular U-Net and\nSwin-UNETR models with full supervision and outperformed a similar weakly\nsupervised approach using GradCAM, demonstrating the excellent potential of the\nproposed framework in challenging medical image segmentation tasks. Our code is\navailable at https://github.com/HealthX-Lab/HGI-SAM.\n","authors":["Amirhossein Rasoulian","Soorena Salari","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2304.04902v2.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2023:012"},{"id":"http://arxiv.org/abs/2308.15300v1","updated":"2023-08-29T13:38:35Z","published":"2023-08-29T13:38:35Z","title":"MSFlow: Multi-Scale Flow-based Framework for Unsupervised Anomaly\n Detection","summary":" Unsupervised anomaly detection (UAD) attracts a lot of research interest and\ndrives widespread applications, where only anomaly-free samples are available\nfor training. Some UAD applications intend to further locate the anomalous\nregions without any anomaly information.\n Although the absence of anomalous samples and annotations deteriorates the\nUAD performance, an inconspicuous yet powerful statistics model, the\nnormalizing flows, is appropriate for anomaly detection and localization in an\nunsupervised fashion. The flow-based probabilistic models, only trained on\nanomaly-free data, can efficiently distinguish unpredictable anomalies by\nassigning them much lower likelihoods than normal data.\n Nevertheless, the size variation of unpredictable anomalies introduces\nanother inconvenience to the flow-based methods for high-precision anomaly\ndetection and localization. To generalize the anomaly size variation, we\npropose a novel Multi-Scale Flow-based framework dubbed MSFlow composed of\nasymmetrical parallel flows followed by a fusion flow to exchange multi-scale\nperceptions. Moreover, different multi-scale aggregation strategies are adopted\nfor image-wise anomaly detection and pixel-wise anomaly localization according\nto the discrepancy between them. The proposed MSFlow is evaluated on three\nanomaly detection datasets, significantly outperforming existing methods.\nNotably, on the challenging MVTec AD benchmark, our MSFlow achieves a new\nstate-of-the-art with a detection AUORC score of up to 99.7%, localization\nAUCROC score of 98.8%, and PRO score of 97.1%. The reproducible code is\navailable at https://github.com/cool-xuan/msflow.\n","authors":["Yixuan Zhou","Xing Xu","Jingkuan Song","Fumin Shen","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.15300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15284v1","updated":"2023-08-29T13:15:13Z","published":"2023-08-29T13:15:13Z","title":"ARTxAI: Explainable Artificial Intelligence Curates Deep Representation\n Learning for Artistic Images using Fuzzy Techniques","summary":" Automatic art analysis employs different image processing techniques to\nclassify and categorize works of art. When working with artistic images, we\nneed to take into account further considerations compared to classical image\nprocessing. This is because such artistic paintings change drastically\ndepending on the author, the scene depicted, and their artistic style. This can\nresult in features that perform very well in a given task but do not grasp the\nwhole of the visual and symbolic information contained in a painting. In this\npaper, we show how the features obtained from different tasks in artistic image\nclassification are suitable to solve other ones of similar nature. We present\ndifferent methods to improve the generalization capabilities and performance of\nartistic classification systems. Furthermore, we propose an explainable\nartificial intelligence method to map known visual traits of an image with the\nfeatures used by the deep learning model considering fuzzy rules. These rules\nshow the patterns and variables that are relevant to solve each task and how\neffective is each of the patterns found. Our results show that our proposed\ncontext-aware features can achieve up to $6\\%$ and $26\\%$ more accurate results\nthan other context- and non-context-aware solutions, respectively, depending on\nthe specific task. We also show that some of the features used by these models\ncan be more clearly correlated to visual traits in the original image than\nothers.\n","authors":["Javier Fumanal-Idocin","Javier Andreu-Perez","Oscar Cordón","Hani Hagras","Humberto Bustince"],"pdf_url":"https://arxiv.org/pdf/2308.15284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00135v4","updated":"2023-08-29T13:10:56Z","published":"2022-12-31T06:32:36Z","title":"TeViS:Translating Text Synopses to Video Storyboards","summary":" A video storyboard is a roadmap for video creation which consists of\nshot-by-shot images to visualize key plots in a text synopsis. Creating video\nstoryboards, however, remains challenging which not only requires cross-modal\nassociation between high-level texts and images but also demands long-term\nreasoning to make transitions smooth across shots. In this paper, we propose a\nnew task called Text synopsis to Video Storyboard (TeViS) which aims to\nretrieve an ordered sequence of images as the video storyboard to visualize the\ntext synopsis. We construct a MovieNet-TeViS dataset based on the public\nMovieNet dataset. It contains 10K text synopses each paired with keyframes\nmanually selected from corresponding movies by considering both relevance and\ncinematic coherence. To benchmark the task, we present strong CLIP-based\nbaselines and a novel VQ-Trans. VQ-Trans first encodes text synopsis and images\ninto a joint embedding space and uses vector quantization (VQ) to improve the\nvisual representation. Then, it auto-regressively generates a sequence of\nvisual features for retrieval and ordering. Experimental results demonstrate\nthat VQ-Trans significantly outperforms prior methods and the CLIP-based\nbaselines. Nevertheless, there is still a large gap compared to human\nperformance suggesting room for promising future work. The code and data are\navailable at: \\url{https://ruc-aimind.github.io/projects/TeViS/}\n","authors":["Xu Gu","Yuchong Sun","Feiyue Ni","Shizhe Chen","Xihua Wang","Ruihua Song","Boyuan Li","Xiang Cao"],"pdf_url":"https://arxiv.org/pdf/2301.00135v4.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.15280v1","updated":"2023-08-29T13:10:53Z","published":"2023-08-29T13:10:53Z","title":"ADFA: Attention-augmented Differentiable top-k Feature Adaptation for\n Unsupervised Medical Anomaly Detection","summary":" The scarcity of annotated data, particularly for rare diseases, limits the\nvariability of training data and the range of detectable lesions, presenting a\nsignificant challenge for supervised anomaly detection in medical imaging. To\nsolve this problem, we propose a novel unsupervised method for medical image\nanomaly detection: Attention-Augmented Differentiable top-k Feature Adaptation\n(ADFA). The method utilizes Wide-ResNet50-2 (WR50) network pre-trained on\nImageNet to extract initial feature representations. To reduce the channel\ndimensionality while preserving relevant channel information, we employ an\nattention-augmented patch descriptor on the extracted features. We then apply\ndifferentiable top-k feature adaptation to train the patch descriptor, mapping\nthe extracted feature representations to a new vector space, enabling effective\ndetection of anomalies. Experiments show that ADFA outperforms state-of-the-art\n(SOTA) methods on multiple challenging medical image datasets, confirming its\neffectiveness in medical anomaly detection.\n","authors":["Yiming Huang","Guole Liu","Yaoru Luo","Ge Yang"],"pdf_url":"https://arxiv.org/pdf/2308.15280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15273v1","updated":"2023-08-29T13:02:35Z","published":"2023-08-29T13:02:35Z","title":"Cross-Modal Retrieval Meets Inference:Improving Zero-Shot Classification\n with Cross-Modal Retrieval","summary":" Contrastive language-image pre-training (CLIP) has demonstrated remarkable\nzero-shot classification ability, namely image classification using novel text\nlabels. Existing works have attempted to enhance CLIP by fine-tuning on\ndownstream tasks, but these have inadvertently led to performance degradation\non unseen classes, thus harming zero-shot generalization. This paper aims to\naddress this challenge by leveraging readily available image-text pairs from an\nexternal dataset for cross-modal guidance during inference. To this end, we\npropose X-MoRe, a novel inference method comprising two key steps: (1)\ncross-modal retrieval and (2) modal-confidence-based ensemble. Given a query\nimage, we harness the power of CLIP's cross-modal representations to retrieve\nrelevant textual information from an external image-text pair dataset. Then, we\nassign higher weights to the more reliable modality between the original query\nimage and retrieved text, contributing to the final prediction. X-MoRe\ndemonstrates robust performance across a diverse set of tasks without the need\nfor additional training, showcasing the effectiveness of utilizing cross-modal\nfeatures to maximize CLIP's zero-shot ability.\n","authors":["Seongha Eom","Namgyu Ho","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.15273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07439v2","updated":"2023-08-29T12:57:17Z","published":"2023-07-14T16:04:03Z","title":"Atlas-Based Interpretable Age Prediction In Whole-Body MR Images","summary":" Age prediction is an important part of medical assessments and research. It\ncan aid in detecting diseases as well as abnormal ageing by highlighting the\ndiscrepancy between chronological and biological age. To gain a comprehensive\nunderstanding of age-related changes observed in various body parts, we\ninvestigate them on a larger scale by using whole-body images. We utilise the\nGrad-CAM interpretability method to determine the body areas most predictive of\na person's age. We expand our analysis beyond individual subjects by employing\nregistration techniques to generate population-wide interpretability maps.\nFurthermore, we set state-of-the-art whole-body age prediction with a model\nthat achieves a mean absolute error of 2.76 years. Our findings reveal three\nprimary areas of interest: the spine, the autochthonous back muscles, and the\ncardiac region, which exhibits the highest importance.\n","authors":["Sophie Starck","Yadunandan Vivekanand Kini","Jessica Johanna Maria Ritter","Rickmer Braren","Daniel Rueckert","Tamara Mueller"],"pdf_url":"https://arxiv.org/pdf/2307.07439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15266v1","updated":"2023-08-29T12:51:04Z","published":"2023-08-29T12:51:04Z","title":"NOVIS: A Case for End-to-End Near-Online Video Instance Segmentation","summary":" Until recently, the Video Instance Segmentation (VIS) community operated\nunder the common belief that offline methods are generally superior to a frame\nby frame online processing. However, the recent success of online methods\nquestions this belief, in particular, for challenging and long video sequences.\nWe understand this work as a rebuttal of those recent observations and an\nappeal to the community to focus on dedicated near-online VIS approaches. To\nsupport our argument, we present a detailed analysis on different processing\nparadigms and the new end-to-end trainable NOVIS (Near-Online Video Instance\nSegmentation) method. Our transformer-based model directly predicts\nspatio-temporal mask volumes for clips of frames and performs instance tracking\nbetween clips via overlap embeddings. NOVIS represents the first near-online\nVIS approach which avoids any handcrafted tracking heuristics. We outperform\nall existing VIS methods by large margins and provide new state-of-the-art\nresults on both YouTube-VIS (2019/2021) and the OVIS benchmarks.\n","authors":["Tim Meinhardt","Matt Feiszli","Yuchen Fan","Laura Leal-Taixe","Rakesh Ranjan"],"pdf_url":"https://arxiv.org/pdf/2308.15266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14105v2","updated":"2023-08-29T12:43:53Z","published":"2023-08-27T13:22:55Z","title":"Unified and Dynamic Graph for Temporal Character Grouping in Long Videos","summary":" Video temporal character grouping locates appearing moments of major\ncharacters within a video according to their identities. To this end, recent\nworks have evolved from unsupervised clustering to graph-based supervised\nclustering. However, graph methods are built upon the premise of fixed affinity\ngraphs, bringing many inexact connections. Besides, they extract multi-modal\nfeatures with kinds of models, which are unfriendly to deployment. In this\npaper, we present a unified and dynamic graph (UniDG) framework for temporal\ncharacter grouping. This is accomplished firstly by a unified representation\nnetwork that learns representations of multiple modalities within the same\nspace and still preserves the modality's uniqueness simultaneously. Secondly,\nwe present a dynamic graph clustering where the neighbors of different\nquantities are dynamically constructed for each node via a cyclic matching\nstrategy, leading to a more reliable affinity graph. Thirdly, a progressive\nassociation method is introduced to exploit spatial and temporal contexts among\ndifferent modalities, allowing multi-modal clustering results to be well fused.\nAs current datasets only provide pre-extracted features, we evaluate our UniDG\nmethod on a collected dataset named MTCG, which contains each character's\nappearing clips of face and body and speaking voice tracks. We also evaluate\nour key components on existing clustering and retrieval datasets to verify the\ngeneralization ability. Experimental results manifest that our method can\nachieve promising results and outperform several state-of-the-art approaches.\n","authors":["Xiujun Shu","Wei Wen","Liangsheng Xu","Mingbao Lin","Ruizhi Qiao","Taian Guo","Hanjun Li","Bei Gan","Xiao Wang","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15262v1","updated":"2023-08-29T12:41:50Z","published":"2023-08-29T12:41:50Z","title":"Enhancing OCR Performance through Post-OCR Models: Adopting Glyph\n Embedding for Improved Correction","summary":" The study investigates the potential of post-OCR models to overcome\nlimitations in OCR models and explores the impact of incorporating glyph\nembedding on post-OCR correction performance. In this study, we have developed\nour own post-OCR correction model. The novelty of our approach lies in\nembedding the OCR output using CharBERT and our unique embedding technique,\ncapturing the visual characteristics of characters. Our findings show that\npost-OCR correction effectively addresses deficiencies in inferior OCR models,\nand glyph embedding enables the model to achieve superior results, including\nthe ability to correct individual words.\n","authors":["Yung-Hsin Chen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07392v2","updated":"2023-08-29T12:37:04Z","published":"2023-08-14T18:23:18Z","title":"A Unified Query-based Paradigm for Camouflaged Instance Segmentation","summary":" Due to the high similarity between camouflaged instances and the background,\nthe recently proposed camouflaged instance segmentation (CIS) faces challenges\nin accurate localization and instance segmentation. To this end, inspired by\nquery-based transformers, we propose a unified query-based multi-task learning\nframework for camouflaged instance segmentation, termed UQFormer, which builds\na set of mask queries and a set of boundary queries to learn a shared composed\nquery representation and efficiently integrates global camouflaged object\nregion and boundary cues, for simultaneous instance segmentation and instance\nboundary detection in camouflaged scenarios. Specifically, we design a composed\nquery learning paradigm that learns a shared representation to capture object\nregion and boundary features by the cross-attention interaction of mask queries\nand boundary queries in the designed multi-scale unified learning transformer\ndecoder. Then, we present a transformer-based multi-task learning framework for\nsimultaneous camouflaged instance segmentation and camouflaged instance\nboundary detection based on the learned composed query representation, which\nalso forces the model to learn a strong instance-level query representation.\nNotably, our model views the instance segmentation as a query-based direct set\nprediction problem, without other post-processing such as non-maximal\nsuppression. Compared with 14 state-of-the-art approaches, our UQFormer\nsignificantly improves the performance of camouflaged instance segmentation.\nOur code will be available at https://github.com/dongbo811/UQFormer.\n","authors":["Bo Dong","Jialun Pei","Rongrong Gao","Tian-Zhu Xiang","Shuo Wang","Huan Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.07392v2.pdf","comment":"This paper has been accepted by ACM MM2023"},{"id":"http://arxiv.org/abs/2304.09121v3","updated":"2023-08-29T12:32:01Z","published":"2023-04-18T16:37:18Z","title":"Fast Neural Scene Flow","summary":" Neural Scene Flow Prior (NSFP) is of significant interest to the vision\ncommunity due to its inherent robustness to out-of-distribution (OOD) effects\nand its ability to deal with dense lidar points. The approach utilizes a\ncoordinate neural network to estimate scene flow at runtime, without any\ntraining. However, it is up to 100 times slower than current state-of-the-art\nlearning methods. In other applications such as image, video, and radiance\nfunction reconstruction innovations in speeding up the runtime performance of\ncoordinate networks have centered upon architectural changes. In this paper, we\ndemonstrate that scene flow is different -- with the dominant computational\nbottleneck stemming from the loss function itself (i.e., Chamfer distance).\nFurther, we rediscover the distance transform (DT) as an efficient,\ncorrespondence-free loss function that dramatically speeds up the runtime\noptimization. Our fast neural scene flow (FNSF) approach reports for the first\ntime real-time performance comparable to learning methods, without any training\nor OOD bias on two of the largest open autonomous driving (AV) lidar datasets\nWaymo Open and Argoverse.\n","authors":["Xueqian Li","Jianqiao Zheng","Francesco Ferroni","Jhony Kaesemodel Pontes","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2304.09121v3.pdf","comment":"17 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2308.15236v1","updated":"2023-08-29T11:51:27Z","published":"2023-08-29T11:51:27Z","title":"Rotation Augmented Distillation for Exemplar-Free Class Incremental\n Learning with Detailed Analysis","summary":" Class incremental learning (CIL) aims to recognize both the old and new\nclasses along the increment tasks. Deep neural networks in CIL suffer from\ncatastrophic forgetting and some approaches rely on saving exemplars from\nprevious tasks, known as the exemplar-based setting, to alleviate this problem.\nOn the contrary, this paper focuses on the Exemplar-Free setting with no old\nclass sample preserved. Balancing the plasticity and stability in deep feature\nlearning with only supervision from new classes is more challenging. Most\nexisting Exemplar-Free CIL methods report the overall performance only and lack\nfurther analysis. In this work, different methods are examined with\ncomplementary metrics in greater detail. Moreover, we propose a simple CIL\nmethod, Rotation Augmented Distillation (RAD), which achieves one of the\ntop-tier performances under the Exemplar-Free setting. Detailed analysis shows\nour RAD benefits from the superior balance between plasticity and stability.\nFinally, more challenging exemplar-free settings with fewer initial classes are\nundertaken for further demonstrations and comparisons among the\nstate-of-the-art methods.\n","authors":["Xiuwei Chen","Xiaobin Chang"],"pdf_url":"https://arxiv.org/pdf/2308.15236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14397v2","updated":"2023-08-29T11:46:44Z","published":"2023-08-28T08:24:25Z","title":"Ensemble of Anchor-Free Models for Robust Bangla Document Layout\n Segmentation","summary":" In this research paper, we introduce a novel approach designed for the\npurpose of segmenting the layout of Bangla documents. Our methodology involves\nthe utilization of a sophisticated ensemble of YOLOv8 models, which were\ntrained for the DL Sprint 2.0 - BUET CSE Fest 2023 Competition focused on\nBangla document layout segmentation. Our primary emphasis lies in enhancing\nvarious aspects of the task, including techniques such as image augmentation,\nmodel architecture, and the incorporation of model ensembles. We deliberately\nreduce the quality of a subset of document images to enhance the resilience of\nmodel training, thereby resulting in an improvement in our cross-validation\nscore. By employing Bayesian optimization, we determine the optimal confidence\nand Intersection over Union (IoU) thresholds for our model ensemble. Through\nour approach, we successfully demonstrate the effectiveness of anchor-free\nmodels in achieving robust layout segmentation in Bangla documents.\n","authors":["U Mong Sain Chak","Md. Asib Rahman"],"pdf_url":"https://arxiv.org/pdf/2308.14397v2.pdf","comment":"4 pages, 5 figures, 6 Tables"},{"id":"http://arxiv.org/abs/2308.15226v1","updated":"2023-08-29T11:29:43Z","published":"2023-08-29T11:29:43Z","title":"CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for\n Multimodal Machine Translation","summary":" There has been a growing interest in developing multimodal machine\ntranslation (MMT) systems that enhance neural machine translation (NMT) with\nvisual knowledge. This problem setup involves using images as auxiliary\ninformation during training, and more recently, eliminating their use during\ninference. Towards this end, previous works face a challenge in training\npowerful MMT models from scratch due to the scarcity of annotated multilingual\nvision-language data, especially for low-resource languages. Simultaneously,\nthere has been an influx of multilingual pre-trained models for NMT and\nmultimodal pre-trained models for vision-language tasks, primarily in English,\nwhich have shown exceptional generalisation ability. However, these are not\ndirectly applicable to MMT since they do not provide aligned multimodal\nmultilingual features for generative tasks. To alleviate this issue, instead of\ndesigning complex modules for MMT, we propose CLIPTrans, which simply adapts\nthe independently pre-trained multimodal M-CLIP and the multilingual mBART. In\norder to align their embedding spaces, mBART is conditioned on the M-CLIP\nfeatures by a prefix sequence generated through a lightweight mapping network.\nWe train this in a two-stage pipeline which warms up the model with image\ncaptioning before the actual translation task. Through experiments, we\ndemonstrate the merits of this framework and consequently push forward the\nstate-of-the-art across standard benchmarks by an average of +2.67 BLEU. The\ncode can be found at www.github.com/devaansh100/CLIPTrans.\n","authors":["Devaansh Gupta","Siddhant Kharbanda","Jiawei Zhou","Wanhua Li","Hanspeter Pfister","Donglai Wei"],"pdf_url":"https://arxiv.org/pdf/2308.15226v1.pdf","comment":"15 pages, 9 figures, to be published In Proceedings of International\n Conference of Computer Vision(ICCV), 2023"},{"id":"http://arxiv.org/abs/2308.15216v1","updated":"2023-08-29T11:12:53Z","published":"2023-08-29T11:12:53Z","title":"Optron: Better Medical Image Registration via Training in the Loop","summary":" Previously, in the field of medical image registration, there are primarily\ntwo paradigms, the traditional optimization-based methods, and the\ndeep-learning-based methods. Each of these paradigms has its advantages, and in\nthis work, we aim to take the best of both worlds. Instead of developing a new\ndeep learning model, we designed a robust training architecture that is simple\nand generalizable. We present Optron, a general training architecture\nincorporating the idea of training-in-the-loop. By iteratively optimizing the\nprediction result of a deep learning model through a plug-and-play optimizer\nmodule in the training loop, Optron introduces pseudo ground truth to an\nunsupervised training process. And by bringing the training process closer to\nthat of supervised training, Optron can consistently improve the models'\nperformance and convergence speed. We evaluated our method on various\ncombinations of models and datasets, and we have achieved state-of-the-art\nperformance on the IXI dataset, improving the previous state-of-the-art method\nTransMorph by a significant margin of +1.6% DSC. Moreover, Optron also\nconsistently achieved positive results with other models and datasets. It\nincreases the validation DSC for VoxelMorph and ViT-V-Net by +2.3% and +2.2%\nrespectively on IXI, demonstrating our method's generalizability. Our\nimplementation is publicly available at\nhttps://github.com/miraclefactory/optron\n","authors":["Yicheng Chen","Shengxiang Ji","Yuelin Xin","Kun Han","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15216v1.pdf","comment":"10 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.03038v2","updated":"2023-08-29T11:08:59Z","published":"2023-06-05T16:53:58Z","title":"HeadSculpt: Crafting 3D Head Avatars with Text","summary":" Recently, text-guided 3D generative methods have made remarkable advancements\nin producing high-quality textures and geometry, capitalizing on the\nproliferation of large vision-language and image diffusion models. However,\nexisting methods still struggle to create high-fidelity 3D head avatars in two\naspects: (1) They rely mostly on a pre-trained text-to-image diffusion model\nwhilst missing the necessary 3D awareness and head priors. This makes them\nprone to inconsistency and geometric distortions in the generated avatars. (2)\nThey fall short in fine-grained editing. This is primarily due to the inherited\nlimitations from the pre-trained 2D image diffusion models, which become more\npronounced when it comes to 3D head avatars. In this work, we address these\nchallenges by introducing a versatile coarse-to-fine pipeline dubbed HeadSculpt\nfor crafting (i.e., generating and editing) 3D head avatars from textual\nprompts. Specifically, we first equip the diffusion model with 3D awareness by\nleveraging landmark-based control and a learned textual embedding representing\nthe back view appearance of heads, enabling 3D-consistent head avatar\ngenerations. We further propose a novel identity-aware editing score\ndistillation strategy to optimize a textured mesh with a high-resolution\ndifferentiable rendering technique. This enables identity preservation while\nfollowing the editing instruction. We showcase HeadSculpt's superior fidelity\nand editing capabilities through comprehensive experiments and comparisons with\nexisting methods.\n","authors":["Xiao Han","Yukang Cao","Kai Han","Xiatian Zhu","Jiankang Deng","Yi-Zhe Song","Tao Xiang","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2306.03038v2.pdf","comment":"Webpage: https://brandonhan.uk/HeadSculpt/"},{"id":"http://arxiv.org/abs/2211.14573v3","updated":"2023-08-29T10:59:41Z","published":"2022-11-26T14:00:18Z","title":"Deep Curvilinear Editing: Commutative and Nonlinear Image Manipulation\n for Pretrained Deep Generative Model","summary":" Semantic editing of images is the fundamental goal of computer vision.\nAlthough deep learning methods, such as generative adversarial networks (GANs),\nare capable of producing high-quality images, they often do not have an\ninherent way of editing generated images semantically. Recent studies have\ninvestigated a way of manipulating the latent variable to determine the images\nto be generated. However, methods that assume linear semantic arithmetic have\ncertain limitations in terms of the quality of image editing, whereas methods\nthat discover nonlinear semantic pathways provide non-commutative editing,\nwhich is inconsistent when applied in different orders. This study proposes a\nnovel method called deep curvilinear editing (DeCurvEd) to determine semantic\ncommuting vector fields on the latent space. We theoretically demonstrate that\nowing to commutativity, the editing of multiple attributes depends only on the\nquantities and not on the order. Furthermore, we experimentally demonstrate\nthat compared to previous methods, the nonlinear and commutative nature of\nDeCurvEd facilitates the disentanglement of image attributes and provides\nhigher-quality editing.\n","authors":["Takehiro Aoshima","Takashi Matsubara"],"pdf_url":"https://arxiv.org/pdf/2211.14573v3.pdf","comment":"15 pages. The last update made no changes except for adding the\n following link to the CVF repository:\n https://openaccess.thecvf.com/content/CVPR2023/html/Aoshima_Deep_Curvilinear_Editing_Commutative_and_Nonlinear_Image_Manipulation_for_Pretrained_CVPR_2023_paper.html.\n Here, you can find our code to reproduce our results"},{"id":"http://arxiv.org/abs/2308.10658v2","updated":"2023-08-29T10:37:26Z","published":"2023-08-21T11:51:46Z","title":"Learning Clothing and Pose Invariant 3D Shape Representation for\n Long-Term Person Re-Identification","summary":" Long-Term Person Re-Identification (LT-ReID) has become increasingly crucial\nin computer vision and biometrics. In this work, we aim to extend LT-ReID\nbeyond pedestrian recognition to include a wider range of real-world human\nactivities while still accounting for cloth-changing scenarios over large time\ngaps. This setting poses additional challenges due to the geometric\nmisalignment and appearance ambiguity caused by the diversity of human pose and\nclothing. To address these challenges, we propose a new approach 3DInvarReID\nfor (i) disentangling identity from non-identity components (pose, clothing\nshape, and texture) of 3D clothed humans, and (ii) reconstructing accurate 3D\nclothed body shapes and learning discriminative features of naked body shapes\nfor person ReID in a joint manner. To better evaluate our study of LT-ReID, we\ncollect a real-world dataset called CCDA, which contains a wide variety of\nhuman activities and clothing changes. Experimentally, we show the superior\nperformance of our approach for person ReID.\n","authors":["Feng Liu","Minchul Kim","ZiAng Gu","Anil Jain","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2308.10658v2.pdf","comment":"10 pages, 7 figures, accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2304.11705v2","updated":"2023-08-29T10:08:24Z","published":"2023-04-23T17:43:29Z","title":"Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR\n Semantic Segmentation","summary":" The ability to deploy robots that can operate safely in diverse environments\nis crucial for developing embodied intelligent agents. As a community, we have\nmade tremendous progress in within-domain LiDAR semantic segmentation. However,\ndo these methods generalize across domains? To answer this question, we design\nthe first experimental setup for studying domain generalization (DG) for LiDAR\nsemantic segmentation (DG-LSS). Our results confirm a significant gap between\nmethods, evaluated in a cross-domain setting: for example, a model trained on\nthe source dataset (SemanticKITTI) obtains $26.53$ mIoU on the target data,\ncompared to $48.49$ mIoU obtained by the model trained on the target domain\n(nuScenes). To tackle this gap, we propose the first method specifically\ndesigned for DG-LSS, which obtains $34.88$ mIoU on the target domain,\noutperforming all baselines. Our method augments a sparse-convolutional\nencoder-decoder 3D segmentation network with an additional, dense 2D\nconvolutional decoder that learns to classify a birds-eye view of the point\ncloud. This simple auxiliary task encourages the 3D network to learn features\nthat are robust to sensor placement shifts and resolution, and are transferable\nacross domains. With this work, we aim to inspire the community to develop and\nevaluate future models in such cross-domain conditions.\n","authors":["Cristiano Saltori","Aljoša Ošep","Elisa Ricci","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2304.11705v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15172v1","updated":"2023-08-29T09:54:30Z","published":"2023-08-29T09:54:30Z","title":"Is visual explanation with Grad-CAM more reliable for deeper neural\n networks? a case study with automatic pneumothorax diagnosis","summary":" While deep learning techniques have provided the state-of-the-art performance\nin various clinical tasks, explainability regarding their decision-making\nprocess can greatly enhance the credence of these methods for safer and quicker\nclinical adoption. With high flexibility, Gradient-weighted Class Activation\nMapping (Grad-CAM) has been widely adopted to offer intuitive visual\ninterpretation of various deep learning models' reasoning processes in\ncomputer-assisted diagnosis. However, despite the popularity of the technique,\nthere is still a lack of systematic study on Grad-CAM's performance on\ndifferent deep learning architectures. In this study, we investigate its\nrobustness and effectiveness across different popular deep learning models,\nwith a focus on the impact of the networks' depths and architecture types, by\nusing a case study of automatic pneumothorax diagnosis in X-ray scans. Our\nresults show that deeper neural networks do not necessarily contribute to a\nstrong improvement of pneumothorax diagnosis accuracy, and the effectiveness of\nGradCAM also varies among different network architectures.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.15172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15170v1","updated":"2023-08-29T09:53:10Z","published":"2023-08-29T09:53:10Z","title":"A lightweight 3D dense facial landmark estimation model from position\n map data","summary":" The incorporation of 3D data in facial analysis tasks has gained popularity\nin recent years. Though it provides a more accurate and detailed representation\nof the human face, accruing 3D face data is more complex and expensive than 2D\nface images. Either one has to rely on expensive 3D scanners or depth sensors\nwhich are prone to noise. An alternative option is the reconstruction of 3D\nfaces from uncalibrated 2D images in an unsupervised way without any ground\ntruth 3D data. However, such approaches are computationally expensive and the\nlearned model size is not suitable for mobile or other edge device\napplications. Predicting dense 3D landmarks over the whole face can overcome\nthis issue. As there is no public dataset available containing dense landmarks,\nwe propose a pipeline to create a dense keypoint training dataset containing\n520 key points across the whole face from an existing facial position map data.\nWe train a lightweight MobileNet-based regressor model with the generated data.\nAs we do not have access to any evaluation dataset with dense landmarks in it\nwe evaluate our model against the 68 keypoint detection task. Experimental\nresults show that our trained model outperforms many of the existing methods in\nspite of its lower model size and minimal computational cost. Also, the\nqualitative evaluation shows the efficiency of our trained models in extreme\nhead pose angles as well as other facial variations and occlusions.\n","authors":["Shubhajit Basak","Sathish Mangapuram","Gabriel Costache","Rachel McDonnell","Michael Schukat"],"pdf_url":"https://arxiv.org/pdf/2308.15170v1.pdf","comment":"8 pages, The Irish Machine Vision and Image Processing\n Conference(IMVIP)"},{"id":"http://arxiv.org/abs/2308.15169v1","updated":"2023-08-29T09:52:32Z","published":"2023-08-29T09:52:32Z","title":"Uncovering the Unseen: Discover Hidden Intentions by Micro-Behavior\n Graph Reasoning","summary":" This paper introduces a new and challenging Hidden Intention Discovery (HID)\ntask. Unlike existing intention recognition tasks, which are based on obvious\nvisual representations to identify common intentions for normal behavior, HID\nfocuses on discovering hidden intentions when humans try to hide their\nintentions for abnormal behavior. HID presents a unique challenge in that\nhidden intentions lack the obvious visual representations to distinguish them\nfrom normal intentions. Fortunately, from a sociological and psychological\nperspective, we find that the difference between hidden and normal intentions\ncan be reasoned from multiple micro-behaviors, such as gaze, attention, and\nfacial expressions. Therefore, we first discover the relationship between\nmicro-behavior and hidden intentions and use graph structure to reason about\nhidden intentions. To facilitate research in the field of HID, we also\nconstructed a seminal dataset containing a hidden intention annotation of a\ntypical theft scenario for HID. Extensive experiments show that the proposed\nnetwork improves performance on the HID task by 9.9\\% over the state-of-the-art\nmethod SBP.\n","authors":["Zhuo Zhou","Wenxuan Liu","Danni Xu","Zheng Wang","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.15169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.07700v2","updated":"2023-08-29T09:49:34Z","published":"2022-12-15T10:23:32Z","title":"Colab NAS: Obtaining lightweight task-specific convolutional neural\n networks following Occam's razor","summary":" The current trend of applying transfer learning from convolutional neural\nnetworks (CNNs) trained on large datasets can be an overkill when the target\napplication is a custom and delimited problem, with enough data to train a\nnetwork from scratch. On the other hand, the training of custom and lighter\nCNNs requires expertise, in the from-scratch case, and or high-end resources,\nas in the case of hardware-aware neural architecture search (HW NAS), limiting\naccess to the technology by non-habitual NN developers.\n For this reason, we present ColabNAS, an affordable HW NAS technique for\nproducing lightweight task-specific CNNs. Its novel derivative-free search\nstrategy, inspired by Occam's razor, allows to obtain state-of-the-art results\non the Visual Wake Word dataset, a standard TinyML benchmark, in just 3.1 GPU\nhours using free online GPU services such as Google Colaboratory and Kaggle\nKernel.\n","authors":["Andrea Mattia Garavagno","Daniele Leonardis","Antonio Frisoli"],"pdf_url":"https://arxiv.org/pdf/2212.07700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14672v2","updated":"2023-08-29T09:33:59Z","published":"2023-03-26T10:15:33Z","title":"Sat2Density: Faithful Density Learning from Satellite-Ground Image Pairs","summary":" This paper aims to develop an accurate 3D geometry representation of\nsatellite images using satellite-ground image pairs. Our focus is on the\nchallenging problem of 3D-aware ground-views synthesis from a satellite image.\nWe draw inspiration from the density field representation used in volumetric\nneural rendering and propose a new approach, called Sat2Density. Our method\nutilizes the properties of ground-view panoramas for the sky and non-sky\nregions to learn faithful density fields of 3D scenes in a geometric\nperspective. Unlike other methods that require extra depth information during\ntraining, our Sat2Density can automatically learn accurate and faithful 3D\ngeometry via density representation without depth supervision. This advancement\nsignificantly improves the ground-view panorama synthesis task. Additionally,\nour study provides a new geometric perspective to understand the relationship\nbetween satellite and ground-view images in 3D space.\n","authors":["Ming Qian","Jincheng Xiong","Gui-Song Xia","Nan Xue"],"pdf_url":"https://arxiv.org/pdf/2303.14672v2.pdf","comment":"ICCV 2023, project page: https://sat2density.github.io/, code:\n https://github.com/qianmingduowan/Sat2Density"},{"id":"http://arxiv.org/abs/2308.15142v1","updated":"2023-08-29T09:21:48Z","published":"2023-08-29T09:21:48Z","title":"A Multimodal Visual Encoding Model Aided by Introducing Verbal Semantic\n Information","summary":" Biological research has revealed that the verbal semantic information in the\nbrain cortex, as an additional source, participates in nonverbal semantic\ntasks, such as visual encoding. However, previous visual encoding models did\nnot incorporate verbal semantic information, contradicting this biological\nfinding. This paper proposes a multimodal visual information encoding network\nmodel based on stimulus images and associated textual information in response\nto this issue. Our visual information encoding network model takes stimulus\nimages as input and leverages textual information generated by a text-image\ngeneration model as verbal semantic information. This approach injects new\ninformation into the visual encoding model. Subsequently, a Transformer network\naligns image and text feature information, creating a multimodal feature space.\nA convolutional network then maps from this multimodal feature space to voxel\nspace, constructing the multimodal visual information encoding network model.\nExperimental results demonstrate that the proposed multimodal visual\ninformation encoding network model outperforms previous models under the exact\ntraining cost. In voxel prediction of the left hemisphere of subject 1's brain,\nthe performance improves by approximately 15.87%, while in the right\nhemisphere, the performance improves by about 4.6%. The multimodal visual\nencoding network model exhibits superior encoding performance. Additionally,\nablation experiments indicate that our proposed model better simulates the\nbrain's visual information processing.\n","authors":["Shuxiao Ma","Linyuan Wang","Bin Yan"],"pdf_url":"https://arxiv.org/pdf/2308.15142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15141v1","updated":"2023-08-29T09:19:49Z","published":"2023-08-29T09:19:49Z","title":"Uncertainty Aware Training to Improve Deep Learning Model Calibration\n for Classification of Cardiac MR Images","summary":" Quantifying uncertainty of predictions has been identified as one way to\ndevelop more trustworthy artificial intelligence (AI) models beyond\nconventional reporting of performance metrics. When considering their role in a\nclinical decision support setting, AI classification models should ideally\navoid confident wrong predictions and maximise the confidence of correct\npredictions. Models that do this are said to be well-calibrated with regard to\nconfidence. However, relatively little attention has been paid to how to\nimprove calibration when training these models, i.e., to make the training\nstrategy uncertainty-aware. In this work we evaluate three novel\nuncertainty-aware training strategies comparing against two state-of-the-art\napproaches. We analyse performance on two different clinical applications:\ncardiac resynchronisation therapy (CRT) response prediction and coronary artery\ndisease (CAD) diagnosis from cardiac magnetic resonance (CMR) images. The\nbest-performing model in terms of both classification accuracy and the most\ncommon calibration measure, expected calibration error (ECE) was the Confidence\nWeight method, a novel approach that weights the loss of samples to explicitly\npenalise confident incorrect predictions. The method reduced the ECE by 17% for\nCRT response prediction and by 22% for CAD diagnosis when compared to a\nbaseline classifier in which no uncertainty-aware strategy was included. In\nboth applications, as well as reducing the ECE there was a slight increase in\naccuracy from 69% to 70% and 70% to 72% for CRT response prediction and CAD\ndiagnosis respectively. However, our analysis showed a lack of consistency in\nterms of optimal models when using different calibration measures. This\nindicates the need for careful consideration of performance metrics when\ntraining and selecting models for complex high-risk applications in healthcare.\n","authors":["Tareen Dawood","Chen Chen","Baldeep S. Sidhua","Bram Ruijsink","Justin Goulda","Bradley Porter","Mark K. Elliott","Vishal Mehta","Christopher A. Rinaldi","Esther Puyol-Anton","Reza Razavi","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2308.15141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14619v2","updated":"2023-08-29T09:16:48Z","published":"2023-08-28T14:43:36Z","title":"Compositional Semantic Mix for Domain Adaptation in Point Cloud\n Segmentation","summary":" Deep-learning models for 3D point cloud semantic segmentation exhibit limited\ngeneralization capabilities when trained and tested on data captured with\ndifferent sensors or in varying environments due to domain shift. Domain\nadaptation methods can be employed to mitigate this domain shift, for instance,\nby simulating sensor noise, developing domain-agnostic generators, or training\npoint cloud completion networks. Often, these methods are tailored for range\nview maps or necessitate multi-modal input. In contrast, domain adaptation in\nthe image domain can be executed through sample mixing, which emphasizes input\ndata manipulation rather than employing distinct adaptation modules. In this\nstudy, we introduce compositional semantic mixing for point cloud domain\nadaptation, representing the first unsupervised domain adaptation technique for\npoint cloud segmentation based on semantic and geometric sample mixing. We\npresent a two-branch symmetric network architecture capable of concurrently\nprocessing point clouds from a source domain (e.g. synthetic) and point clouds\nfrom a target domain (e.g. real-world). Each branch operates within one domain\nby integrating selected data fragments from the other domain and utilizing\nsemantic information derived from source labels and target (pseudo) labels.\nAdditionally, our method can leverage a limited number of human point-level\nannotations (semi-supervised) to further enhance performance. We assess our\napproach in both synthetic-to-real and real-to-real scenarios using LiDAR\ndatasets and demonstrate that it significantly outperforms state-of-the-art\nmethods in both unsupervised and semi-supervised settings.\n","authors":["Cristiano Saltori","Fabio Galasso","Giuseppe Fiameni","Nicu Sebe","Fabio Poiesi","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2308.14619v2.pdf","comment":"TPAMI. arXiv admin note: text overlap with arXiv:2207.09778"},{"id":"http://arxiv.org/abs/2308.15137v1","updated":"2023-08-29T09:13:24Z","published":"2023-08-29T09:13:24Z","title":"Abdominal Multi-Organ Segmentation Based on Feature Pyramid Network and\n Spatial Recurrent Neural Network","summary":" As recent advances in AI are causing the decline of conventional diagnostic\nmethods, the realization of end-to-end diagnosis is fast approaching.\nUltrasound image segmentation is an important step in the diagnostic process.\nAn accurate and robust segmentation model accelerates the process and reduces\nthe burden of sonographers. In contrast to previous research, we take two\ninherent features of ultrasound images into consideration: (1) different organs\nand tissues vary in spatial sizes, (2) the anatomical structures inside human\nbody form a relatively constant spatial relationship. Based on those two ideas,\nwe propose a new image segmentation model combining Feature Pyramid Network\n(FPN) and Spatial Recurrent Neural Network (SRNN). We discuss why we use FPN to\nextract anatomical structures of different scales and how SRNN is implemented\nto extract the spatial context features in abdominal ultrasound images.\n","authors":["Yuhan Song","Armagan Elibol","Nak Young Chong"],"pdf_url":"https://arxiv.org/pdf/2308.15137v1.pdf","comment":"IFAC World Congress 2023 paper"},{"id":"http://arxiv.org/abs/2308.15136v1","updated":"2023-08-29T09:10:53Z","published":"2023-08-29T09:10:53Z","title":"CAGRA: Highly Parallel Graph Construction and Approximate Nearest\n Neighbor Search for GPUs","summary":" Approximate Nearest Neighbor Search (ANNS) plays a critical role in various\ndisciplines spanning data mining and artificial intelligence, from information\nretrieval and computer vision to natural language processing and recommender\nsystems. Data volumes have soared in recent years and the computational cost of\nan exhaustive exact nearest neighbor search is often prohibitive, necessitating\nthe adoption of approximate techniques. The balanced performance and recall of\ngraph-based approaches have more recently garnered significant attention in\nANNS algorithms, however, only a few studies have explored harnessing the power\nof GPUs and multi-core processors despite the widespread use of massively\nparallel and general-purpose computing. To bridge this gap, we introduce a\nnovel parallel computing hardware-based proximity graph and search algorithm.\nBy leveraging the high-performance capabilities of modern hardware, our\napproach achieves remarkable efficiency gains. In particular, our method\nsurpasses existing CPU and GPU-based methods in constructing the proximity\ngraph, demonstrating higher throughput in both large- and small-batch searches\nwhile maintaining compatible accuracy. In graph construction time, our method,\nCAGRA, is 2.2~27x faster than HNSW, which is one of the CPU SOTA\nimplementations. In large-batch query throughput in the 90% to 95% recall\nrange, our method is 33~77x faster than HNSW, and is 3.8~8.8x faster than the\nSOTA implementations for GPU. For a single query, our method is 3.4~53x faster\nthan HNSW at 95% recall.\n","authors":["Hiroyuki Ootomo","Akira Naruse","Corey Nolet","Ray Wang","Tamas Feher","Yong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.08015v2","updated":"2023-08-29T09:05:58Z","published":"2022-08-17T01:44:32Z","title":"Cross-Domain Few-Shot Classification via Inter-Source Stylization","summary":" The goal of Cross-Domain Few-Shot Classification (CDFSC) is to accurately\nclassify a target dataset with limited labelled data by exploiting the\nknowledge of a richly labelled auxiliary dataset, despite the differences\nbetween the domains of the two datasets. Some existing approaches require\nlabelled samples from multiple domains for model training. However, these\nmethods fail when the sample labels are scarce. To overcome this challenge,\nthis paper proposes a solution that makes use of multiple source domains\nwithout the need for additional labeling costs. Specifically, one of the source\ndomains is completely tagged, while the others are untagged. An Inter-Source\nStylization Network (ISSNet) is then introduced to enhance stylisation across\nmultiple source domains, enriching data distribution and model's generalization\ncapabilities. Experiments on 8 target datasets show that ISSNet leverages\nunlabelled data from multiple source data and significantly reduces the\nnegative impact of domain gaps on classification performance compared to\nseveral baseline methods.\n","authors":["Huali Xu","Shuaifeng Zhi","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2208.08015v2.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.13495v2","updated":"2023-08-29T09:01:13Z","published":"2023-08-25T17:10:22Z","title":"Open Gaze: Open Source eye tracker for smartphone devices using Deep\n Learning","summary":" Eye tracking has been a pivotal tool in diverse fields such as vision\nresearch, language analysis, and usability assessment. The majority of prior\ninvestigations, however, have concentrated on expansive desktop displays\nemploying specialized, costly eye tracking hardware that lacks scalability.\nRemarkably little insight exists into ocular movement patterns on smartphones,\ndespite their widespread adoption and significant usage. In this manuscript, we\npresent an open-source implementation of a smartphone-based gaze tracker that\nemulates the methodology proposed by a GooglePaper (whose source code remains\nproprietary). Our focus is on attaining accuracy comparable to that attained\nthrough the GooglePaper's methodology, without the necessity for supplementary\nhardware. Through the integration of machine learning techniques, we unveil an\naccurate eye tracking solution that is native to smartphones. Our approach\ndemonstrates precision akin to the state-of-the-art mobile eye trackers, which\nare characterized by a cost that is two orders of magnitude higher. Leveraging\nthe vast MIT GazeCapture dataset, which is available through registration on\nthe dataset's website, we successfully replicate crucial findings from previous\nstudies concerning ocular motion behavior in oculomotor tasks and saliency\nanalyses during natural image observation. Furthermore, we emphasize the\napplicability of smartphone-based gaze tracking in discerning reading\ncomprehension challenges. Our findings exhibit the inherent potential to\namplify eye movement research by significant proportions, accommodating\nparticipation from thousands of subjects with explicit consent. This\nscalability not only fosters advancements in vision research, but also extends\nits benefits to domains such as accessibility enhancement and healthcare\napplications.\n","authors":["Sushmanth reddy","Jyothi Swaroop Reddy"],"pdf_url":"https://arxiv.org/pdf/2308.13495v2.pdf","comment":"26 pages , 15 figures"},{"id":"http://arxiv.org/abs/2308.08730v3","updated":"2023-08-29T08:52:58Z","published":"2023-08-17T01:59:59Z","title":"Learning A Coarse-to-Fine Diffusion Transformer for Image Restoration","summary":" Recent years have witnessed the remarkable performance of diffusion models in\nvarious vision tasks. However, for image restoration that aims to recover clear\nimages with sharper details from given degraded observations, diffusion-based\nmethods may fail to recover promising results due to inaccurate noise\nestimation. Moreover, simple constraining noises cannot effectively learn\ncomplex degradation information, which subsequently hinders the model capacity.\nTo solve the above problems, we propose a coarse-to-fine diffusion Transformer\n(C2F-DFT) for image restoration. Specifically, our C2F-DFT contains diffusion\nself-attention (DFSA) and diffusion feed-forward network (DFN) within a new\ncoarse-to-fine training scheme. The DFSA and DFN respectively capture the\nlong-range diffusion dependencies and learn hierarchy diffusion representation\nto facilitate better restoration. In the coarse training stage, our C2F-DFT\nestimates noises and then generates the final clean image by a sampling\nalgorithm. To further improve the restoration quality, we propose a simple yet\neffective fine training scheme. It first exploits the coarse-trained diffusion\nmodel with fixed steps to generate restoration results, which then would be\nconstrained with corresponding ground-truth ones to optimize the models to\nremedy the unsatisfactory results affected by inaccurate noise estimation.\nExtensive experiments show that C2F-DFT significantly outperforms\ndiffusion-based restoration method IR-SDE and achieves competitive performance\ncompared with Transformer-based state-of-the-art methods on $3$ tasks,\nincluding deraining, deblurring, and real denoising. The code is available at\nhttps://github.com/wlydlut/C2F-DFT.\n","authors":["Liyan Wang","Qinyu Yang","Cong Wang","Wei Wang","Jinshan Pan","Zhixun Su"],"pdf_url":"https://arxiv.org/pdf/2308.08730v3.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.15126v1","updated":"2023-08-29T08:51:24Z","published":"2023-08-29T08:51:24Z","title":"Evaluation and Analysis of Hallucination in Large Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) have recently achieved remarkable\nsuccess. However, LVLMs are still plagued by the hallucination problem, which\nlimits the practicality in many scenarios. Hallucination refers to the\ninformation of LVLMs' responses that does not exist in the visual input, which\nposes potential risks of substantial consequences. There has been limited work\nstudying hallucination evaluation in LVLMs. In this paper, we propose\nHallucination Evaluation based on Large Language Models (HaELM), an LLM-based\nhallucination evaluation framework. HaELM achieves an approximate 95%\nperformance comparable to ChatGPT and has additional advantages including low\ncost, reproducibility, privacy preservation and local deployment. Leveraging\nthe HaELM, we evaluate the hallucination in current LVLMs. Furthermore, we\nanalyze the factors contributing to hallucination in LVLMs and offer helpful\nsuggestions to mitigate the hallucination problem. Our training data and human\nannotation hallucination data will be made public soon.\n","authors":["Junyang Wang","Yiyang Zhou","Guohai Xu","Pengcheng Shi","Chenlin Zhao","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Jihua Zhu","Jitao Sang","Haoyu Tang"],"pdf_url":"https://arxiv.org/pdf/2308.15126v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.15119v1","updated":"2023-08-29T08:37:16Z","published":"2023-08-29T08:37:16Z","title":"AI-Based Facial Emotion Recognition Solutions for Education: A Study of\n Teacher-User and Other Categories","summary":" Existing information on AI-based facial emotion recognition (FER) is not\neasily comprehensible by those outside the field of computer science, requiring\ncross-disciplinary effort to determine a categorisation framework that promotes\nthe understanding of this technology, and its impact on users. Most proponents\nclassify FER in terms of methodology, implementation and analysis; relatively\nfew by its application in education; and none by its users. This paper is\nconcerned primarily with (potential) teacher-users of FER tools for education.\nIt proposes a three-part classification of these teachers, by orientation,\ncondition and preference, based on a classical taxonomy of affective\neducational objectives, and related theories. It also compiles and organises\nthe types of FER solutions found in or inferred from the literature into\n\"technology\" and \"applications\" categories, as a prerequisite for structuring\nthe proposed \"teacher-user\" category. This work has implications for\nproponents', critics', and users' understanding of the relationship between\nteachers and FER.\n","authors":["R. Yamamoto Ravenor"],"pdf_url":"https://arxiv.org/pdf/2308.15119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13004v2","updated":"2023-08-29T08:35:07Z","published":"2022-10-24T07:50:02Z","title":"Efficient Representation of Natural Image Patches","summary":" In the complex domain of neural information processing, discerning\nfundamental principles from ancillary details remains a significant challenge.\nWhile there is extensive knowledge about the anatomy and physiology of the\nearly visual system, a comprehensive computational theory remains elusive. Can\nwe gain insights into the underlying principles of a biological system by\nabstracting away from its detailed implementation and focusing on the\nfundamental problems that the system is designed to solve? Utilizing an\nabstract model based on minimal yet realistic assumptions, we show how to\nachieve the early visual system's two ultimate objectives: efficient\ninformation transmission and sensor probability distribution modeling. We show\nthat optimizing for information transmission does not yield optimal probability\ndistribution modeling. We illustrate, using a two-pixel (2D) system and image\npatches, that an efficient representation can be realized via nonlinear\npopulation code driven by two types of biologically plausible loss functions\nthat depend solely on output. After unsupervised learning, our abstract IPU\nmodel bears remarkable resemblances to biological systems, despite not\nmimicking many features of real neurons, such as spiking activity. A\npreliminary comparison with a contemporary deep learning model suggests that\nthe IPU model offers a significant efficiency advantage. Our model provides\nnovel insights into the computational theory of early visual systems as well as\na potential new approach to enhance the efficiency of deep learning models.\n","authors":["Cheng Guo"],"pdf_url":"https://arxiv.org/pdf/2210.13004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00647v3","updated":"2023-08-29T08:34:40Z","published":"2022-10-02T22:45:11Z","title":"IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable\n Novel View Synthesis","summary":" Existing inverse rendering combined with neural rendering methods can only\nperform editable novel view synthesis on object-specific scenes, while we\npresent intrinsic neural radiance fields, dubbed IntrinsicNeRF, which introduce\nintrinsic decomposition into the NeRF-based neural rendering method and can\nextend its application to room-scale scenes. Since intrinsic decomposition is a\nfundamentally under-constrained inverse problem, we propose a novel\ndistance-aware point sampling and adaptive reflectance iterative clustering\noptimization method, which enables IntrinsicNeRF with traditional intrinsic\ndecomposition constraints to be trained in an unsupervised manner, resulting in\nmulti-view consistent intrinsic decomposition results. To cope with the problem\nthat different adjacent instances of similar reflectance in a scene are\nincorrectly clustered together, we further propose a hierarchical clustering\nmethod with coarse-to-fine optimization to obtain a fast hierarchical indexing\nrepresentation. It supports compelling real-time augmented applications such as\nrecoloring and illumination variation. Extensive experiments and editing\nsamples on both object-specific/room-scale scenes and synthetic/real-word data\ndemonstrate that we can obtain consistent intrinsic decomposition results and\nhigh-fidelity novel view synthesis even for challenging sequences.\n","authors":["Weicai Ye","Shuo Chen","Chong Bao","Hujun Bao","Marc Pollefeys","Zhaopeng Cui","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.00647v3.pdf","comment":"Accepted to ICCV2023, Project webpage:\n https://zju3dv.github.io/intrinsic_nerf/, code:\n https://github.com/zju3dv/IntrinsicNeRF"},{"id":"http://arxiv.org/abs/2308.15109v1","updated":"2023-08-29T08:20:23Z","published":"2023-08-29T08:20:23Z","title":"DiffusionVMR: Diffusion Model for Video Moment Retrieval","summary":" Video moment retrieval is a fundamental visual-language task that aims to\nretrieve target moments from an untrimmed video based on a language query.\nExisting methods typically generate numerous proposals manually or via\ngenerative networks in advance as the support set for retrieval, which is not\nonly inflexible but also time-consuming. Inspired by the success of diffusion\nmodels on object detection, this work aims at reformulating video moment\nretrieval as a denoising generation process to get rid of the inflexible and\ntime-consuming proposal generation. To this end, we propose a novel\nproposal-free framework, namely DiffusionVMR, which directly samples random\nspans from noise as candidates and introduces denoising learning to ground\ntarget moments. During training, Gaussian noise is added to the real moments,\nand the model is trained to learn how to reverse this process. In inference, a\nset of time spans is progressively refined from the initial noise to the final\noutput. Notably, the training and inference of DiffusionVMR are decoupled, and\nan arbitrary number of random spans can be used in inference without being\nconsistent with the training phase. Extensive experiments conducted on three\nwidely-used benchmarks (i.e., QVHighlight, Charades-STA, and TACoS) demonstrate\nthe effectiveness of the proposed DiffusionVMR by comparing it with\nstate-of-the-art methods.\n","authors":["Henghao Zhao","Kevin Qinghong Lin","Rui Yan","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2308.15109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15094v1","updated":"2023-08-29T08:02:41Z","published":"2023-08-29T08:02:41Z","title":"Group-Conditional Conformal Prediction via Quantile Regression\n Calibration for Crop and Weed Classification","summary":" As deep learning predictive models become an integral part of a large\nspectrum of precision agricultural systems, a barrier to the adoption of such\nautomated solutions is the lack of user trust in these highly complex, opaque\nand uncertain models. Indeed, deep neural networks are not equipped with any\nexplicit guarantees that can be used to certify the system's performance,\nespecially in highly varying uncontrolled environments such as the ones\ntypically faced in computer vision for agriculture.Fortunately, certain methods\ndeveloped in other communities can prove to be important for agricultural\napplications. This article presents the conformal prediction framework that\nprovides valid statistical guarantees on the predictive performance of any\nblack box prediction machine, with almost no assumptions, applied to the\nproblem of deep visual classification of weeds and crops in real-world\nconditions. The framework is exposed with a focus on its practical aspects and\nspecial attention accorded to the Adaptive Prediction Sets (APS) approach that\ndelivers marginal guarantees on the model's coverage. Marginal results are then\nshown to be insufficient to guarantee performance on all groups of individuals\nin the population as characterized by their environmental and pedo-climatic\nauxiliary data gathered during image acquisition.To tackle this shortcoming,\ngroup-conditional conformal approaches are presented: the ''classical'' method\nthat consists of iteratively applying the APS procedure on all groups, and a\nproposed elegant reformulation and implementation of the procedure using\nquantile regression on group membership indicators. Empirical results showing\nthe validity of the proposed approach are presented and compared to the\nmarginal APS then discussed.\n","authors":["Paul Melki","Lionel Bombrun","Boubacar Diallo","Jérôme Dias","Jean-Pierre da Costa"],"pdf_url":"https://arxiv.org/pdf/2308.15094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14308v3","updated":"2023-08-29T07:58:49Z","published":"2022-11-25T18:59:46Z","title":"WALDO: Future Video Synthesis using Object Layer Decomposition and\n Parametric Flow Prediction","summary":" This paper presents WALDO (WArping Layer-Decomposed Objects), a novel\napproach to the prediction of future video frames from past ones. Individual\nimages are decomposed into multiple layers combining object masks and a small\nset of control points. The layer structure is shared across all frames in each\nvideo to build dense inter-frame connections. Complex scene motions are modeled\nby combining parametric geometric transformations associated with individual\nlayers, and video synthesis is broken down into discovering the layers\nassociated with past frames, predicting the corresponding transformations for\nupcoming ones and warping the associated object regions accordingly, and\nfilling in the remaining image parts. Extensive experiments on multiple\nbenchmarks including urban videos (Cityscapes and KITTI) and videos featuring\nnonrigid motions (UCF-Sports and H3.6M), show that our method consistently\noutperforms the state of the art by a significant margin in every case. Code,\npretrained models, and video samples synthesized by our approach can be found\nin the project webpage https://16lemoing.github.io/waldo.\n","authors":["Guillaume Le Moing","Jean Ponce","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2211.14308v3.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2303.11851v2","updated":"2023-08-29T07:57:20Z","published":"2023-03-21T13:49:49Z","title":"Sample4Geo: Hard Negative Sampling For Cross-View Geo-Localisation","summary":" Cross-View Geo-Localisation is still a challenging task where additional\nmodules, specific pre-processing or zooming strategies are necessary to\ndetermine accurate positions of images. Since different views have different\ngeometries, pre-processing like polar transformation helps to merge them.\nHowever, this results in distorted images which then have to be rectified.\nAdding hard negatives to the training batch could improve the overall\nperformance but with the default loss functions in geo-localisation it is\ndifficult to include them. In this article, we present a simplified but\neffective architecture based on contrastive learning with symmetric InfoNCE\nloss that outperforms current state-of-the-art results. Our framework consists\nof a narrow training pipeline that eliminates the need of using aggregation\nmodules, avoids further pre-processing steps and even increases the\ngeneralisation capability of the model to unknown regions. We introduce two\ntypes of sampling strategies for hard negatives. The first explicitly exploits\ngeographically neighboring locations to provide a good starting point. The\nsecond leverages the visual similarity between the image embeddings in order to\nmine hard negative samples. Our work shows excellent performance on common\ncross-view datasets like CVUSA, CVACT, University-1652 and VIGOR. A comparison\nbetween cross-area and same-area settings demonstrate the good generalisation\ncapability of our model.\n","authors":["Fabian Deuser","Konrad Habel","Norbert Oswald"],"pdf_url":"https://arxiv.org/pdf/2303.11851v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15088v1","updated":"2023-08-29T07:51:36Z","published":"2023-08-29T07:51:36Z","title":"Using deep learning for an automatic detection and classification of the\n vascular bifurcations along the Circle of Willis","summary":" Most of the intracranial aneurysms (ICA) occur on a specific portion of the\ncerebral vascular tree named the Circle of Willis (CoW). More particularly,\nthey mainly arise onto fifteen of the major arterial bifurcations constituting\nthis circular structure. Hence, for an efficient and timely diagnosis it is\ncritical to develop some methods being able to accurately recognize each\nBifurcation of Interest (BoI). Indeed, an automatic extraction of the\nbifurcations presenting the higher risk of developing an ICA would offer the\nneuroradiologists a quick glance at the most alarming areas. Due to the recent\nefforts on Artificial Intelligence, Deep Learning turned out to be the best\nperforming technology for many pattern recognition tasks. Moreover, various\nmethods have been particularly designed for medical image analysis purposes.\nThis study intends to assist the neuroradiologists to promptly locate any\nbifurcation presenting a high risk of ICA occurrence. It can be seen as a\nComputer Aided Diagnosis scheme, where the Artificial Intelligence facilitates\nthe access to the regions of interest within the MRI. In this work, we propose\na method for a fully automatic detection and recognition of the bifurcations of\ninterest forming the Circle of Willis. Several neural networks architectures\nhave been tested, and we thoroughly evaluate the bifurcation recognition rate.\n","authors":["Rafic Nader","Romain Bourcier","Florent Autrusseau"],"pdf_url":"https://arxiv.org/pdf/2308.15088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15085v1","updated":"2023-08-29T07:50:11Z","published":"2023-08-29T07:50:11Z","title":"Learning to Upsample by Learning to Sample","summary":" We present DySample, an ultra-lightweight and effective dynamic upsampler.\nWhile impressive performance gains have been witnessed from recent kernel-based\ndynamic upsamplers such as CARAFE, FADE, and SAPA, they introduce much\nworkload, mostly due to the time-consuming dynamic convolution and the\nadditional sub-network used to generate dynamic kernels. Further, the need for\nhigh-res feature guidance of FADE and SAPA somehow limits their application\nscenarios. To address these concerns, we bypass dynamic convolution and\nformulate upsampling from the perspective of point sampling, which is more\nresource-efficient and can be easily implemented with the standard built-in\nfunction in PyTorch. We first showcase a naive design, and then demonstrate how\nto strengthen its upsampling behavior step by step towards our new upsampler,\nDySample. Compared with former kernel-based dynamic upsamplers, DySample\nrequires no customized CUDA package and has much fewer parameters, FLOPs, GPU\nmemory, and latency. Besides the light-weight characteristics, DySample\noutperforms other upsamplers across five dense prediction tasks, including\nsemantic segmentation, object detection, instance segmentation, panoptic\nsegmentation, and monocular depth estimation. Code is available at\nhttps://github.com/tiny-smart/dysample.\n","authors":["Wenze Liu","Hao Lu","Hongtao Fu","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2308.15085v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15081v1","updated":"2023-08-29T07:29:30Z","published":"2023-08-29T07:29:30Z","title":"Class Prior-Free Positive-Unlabeled Learning with Taylor Variational\n Loss for Hyperspectral Remote Sensing Imagery","summary":" Positive-unlabeled learning (PU learning) in hyperspectral remote sensing\nimagery (HSI) is aimed at learning a binary classifier from positive and\nunlabeled data, which has broad prospects in various earth vision applications.\nHowever, when PU learning meets limited labeled HSI, the unlabeled data may\ndominate the optimization process, which makes the neural networks overfit the\nunlabeled data. In this paper, a Taylor variational loss is proposed for HSI PU\nlearning, which reduces the weight of the gradient of the unlabeled data by\nTaylor series expansion to enable the network to find a balance between\noverfitting and underfitting. In addition, the self-calibrated optimization\nstrategy is designed to stabilize the training process. Experiments on 7\nbenchmark datasets (21 tasks in total) validate the effectiveness of the\nproposed method. Code is at: https://github.com/Hengwei-Zhao96/T-HOneCls.\n","authors":["Hengwei Zhao","Xinyu Wang","Jingtao Li","Yanfei Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.15081v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15074v1","updated":"2023-08-29T07:15:57Z","published":"2023-08-29T07:15:57Z","title":"Exploring Model Transferability through the Lens of Potential Energy","summary":" Transfer learning has become crucial in computer vision tasks due to the vast\navailability of pre-trained deep learning models. However, selecting the\noptimal pre-trained model from a diverse pool for a specific downstream task\nremains a challenge. Existing methods for measuring the transferability of\npre-trained models rely on statistical correlations between encoded static\nfeatures and task labels, but they overlook the impact of underlying\nrepresentation dynamics during fine-tuning, leading to unreliable results,\nespecially for self-supervised models. In this paper, we present an insightful\nphysics-inspired approach named PED to address these challenges. We reframe the\nchallenge of model selection through the lens of potential energy and directly\nmodel the interaction forces that influence fine-tuning dynamics. By capturing\nthe motion of dynamic representations to decline the potential energy within a\nforce-driven physical model, we can acquire an enhanced and more stable\nobservation for estimating transferability. The experimental results on 10\ndownstream tasks and 12 self-supervised models demonstrate that our approach\ncan seamlessly integrate into existing ranking techniques and enhance their\nperformances, revealing its effectiveness for the model selection task and its\npotential for understanding the mechanism in transfer learning. Code will be\navailable at https://github.com/lixiaotong97/PED.\n","authors":["Xiaotong Li","Zixuan Hu","Yixiao Ge","Ying Shan","Ling-Yu Duan"],"pdf_url":"https://arxiv.org/pdf/2308.15074v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15070v1","updated":"2023-08-29T07:11:52Z","published":"2023-08-29T07:11:52Z","title":"DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior","summary":" We present DiffBIR, which leverages pretrained text-to-image diffusion models\nfor blind image restoration problem. Our framework adopts a two-stage pipeline.\nIn the first stage, we pretrain a restoration module across diversified\ndegradations to improve generalization capability in real-world scenarios. The\nsecond stage leverages the generative ability of latent diffusion models, to\nachieve realistic image restoration. Specifically, we introduce an injective\nmodulation sub-network -- LAControlNet for finetuning, while the pre-trained\nStable Diffusion is to maintain its generative ability. Finally, we introduce a\ncontrollable module that allows users to balance quality and fidelity by\nintroducing the latent image guidance in the denoising process during\ninference. Extensive experiments have demonstrated its superiority over\nstate-of-the-art approaches for both blind image super-resolution and blind\nface restoration tasks on synthetic and real-world datasets. The code is\navailable at https://github.com/XPixelGroup/DiffBIR.\n","authors":["Xinqi Lin","Jingwen He","Ziyan Chen","Zhaoyang Lyu","Ben Fei","Bo Dai","Wanli Ouyang","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15068v1","updated":"2023-08-29T07:00:35Z","published":"2023-08-29T07:00:35Z","title":"A Comprehensive Augmentation Framework for Anomaly Detection","summary":" Data augmentation methods are commonly integrated into the training of\nanomaly detection models. Previous approaches have primarily focused on\nreplicating real-world anomalies or enhancing diversity, without considering\nthat the standard of anomaly varies across different classes, potentially\nleading to a biased training distribution.This paper analyzes crucial traits of\nsimulated anomalies that contribute to the training of reconstructive networks\nand condenses them into several methods, thus creating a comprehensive\nframework by selectively utilizing appropriate combinations.Furthermore, we\nintegrate this framework with a reconstruction-based approach and concurrently\npropose a split training strategy that alleviates the issue of overfitting\nwhile avoiding introducing interference to the reconstruction process. The\nevaluations conducted on the MVTec anomaly detection dataset demonstrate that\nour method outperforms the previous state-of-the-art approach, particularly in\nterms of object classes.To evaluate generalizability, we generate a simulated\ndataset comprising anomalies with diverse characteristics since the original\ntest samples only include specific types of anomalies and may lead to biased\nevaluations. Experimental results demonstrate that our approach exhibits\npromising potential for generalizing effectively to various unforeseen\nanomalies encountered in real-world scenarios.\n","authors":["Jiang Lin","Yaping Yan"],"pdf_url":"https://arxiv.org/pdf/2308.15068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15063v1","updated":"2023-08-29T06:55:42Z","published":"2023-08-29T06:55:42Z","title":"Learning Cross-modality Information Bottleneck Representation for\n Heterogeneous Person Re-Identification","summary":" Visible-Infrared person re-identification (VI-ReID) is an important and\nchallenging task in intelligent video surveillance. Existing methods mainly\nfocus on learning a shared feature space to reduce the modality discrepancy\nbetween visible and infrared modalities, which still leave two problems\nunderexplored: information redundancy and modality complementarity. To this\nend, properly eliminating the identity-irrelevant information as well as making\nup for the modality-specific information are critical and remains a challenging\nendeavor. To tackle the above problems, we present a novel mutual information\nand modality consensus network, namely CMInfoNet, to extract modality-invariant\nidentity features with the most representative information and reduce the\nredundancies. The key insight of our method is to find an optimal\nrepresentation to capture more identity-relevant information and compress the\nirrelevant parts by optimizing a mutual information bottleneck trade-off.\nBesides, we propose an automatically search strategy to find the most prominent\nparts that identify the pedestrians. To eliminate the cross- and intra-modality\nvariations, we also devise a modality consensus module to align the visible and\ninfrared modalities for task-specific guidance. Moreover, the global-local\nfeature representations can also be acquired for key parts discrimination.\nExperimental results on four benchmarks, i.e., SYSU-MM01, RegDB,\nOccluded-DukeMTMC, Occluded-REID, Partial-REID and Partial\\_iLIDS dataset, have\ndemonstrated the effectiveness of CMInfoNet.\n","authors":["Haichao Shi","Mandi Luo","Xiao-Yu Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2308.15063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15061v1","updated":"2023-08-29T06:50:04Z","published":"2023-08-29T06:50:04Z","title":"AIoT-Based Drum Transcription Robot using Convolutional Neural Networks","summary":" With the development of information technology, robot technology has made\ngreat progress in various fields. These new technologies enable robots to be\nused in industry, agriculture, education and other aspects. In this paper, we\npropose a drum robot that can automatically complete music transcription in\nreal-time, which is based on AIoT and fog computing technology. Specifically,\nthis drum robot system consists of a cloud node for data storage, edge nodes\nfor real-time computing, and data-oriented execution application nodes. In\norder to analyze drumming music and realize drum transcription, we further\npropose a light-weight convolutional neural network model to classify drums,\nwhich can be more effectively deployed in terminal devices for fast edge\ncalculations. The experimental results show that the proposed system can\nachieve more competitive performance and enjoy a variety of smart applications\nand services.\n","authors":["Yukun Su","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2308.15061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15056v1","updated":"2023-08-29T06:33:13Z","published":"2023-08-29T06:33:13Z","title":"A Consumer-tier based Visual-Brain Machine Interface for Augmented\n Reality Glasses Interactions","summary":" Objective.Visual-Brain Machine Interface(V-BMI) has provide a novel\ninteraction technique for Augmented Reality (AR) industries. Several\nstate-of-arts work has demonstates its high accuracy and real-time interaction\ncapbilities. However, most of the studies employ EEGs devices that are rigid\nand difficult to apply in real-life AR glasseses application sceniraros. Here\nwe develop a consumer-tier Visual-Brain Machine Inteface(V-BMI) system\nspecialized for Augmented Reality(AR) glasses interactions. Approach. The\ndeveloped system consists of a wearable hardware which takes advantages of fast\nset-up, reliable recording and comfortable wearable experience that\nspecificized for AR glasses applications. Complementing this hardware, we have\ndevised a software framework that facilitates real-time interactions within the\nsystem while accommodating a modular configuration to enhance scalability. Main\nresults. The developed hardware is only 110g and 120x85x23 mm, which with 1\nTohm and peak to peak voltage is less than 1.5 uV, and a V-BMI based angry bird\ngame and an Internet of Thing (IoT) AR applications are deisgned, we\ndemonstrated such technology merits of intuitive experience and efficiency\ninteraction. The real-time interaction accuracy is between 85 and 96\npercentages in a commercial AR glasses (DTI is 2.24s and ITR 65 bits-min ).\nSignificance. Our study indicates the developed system can provide an essential\nhardware-software framework for consumer based V-BMI AR glasses. Also, we\nderive several pivotal design factors for a consumer-grade V-BMI-based AR\nsystem: 1) Dynamic adaptation of stimulation patterns-classification methods\nvia computer vision algorithms is necessary for AR glasses applications; and 2)\nAlgorithmic localization to foster system stability and latency reduction.\n","authors":["Yuying Jiang","Fan Bai","Zicheng Zhang","Xiaochen Ye","Zheng Liu","Zhiping Shi","Jianwei Yao","Xiaojun Liu","Fangkun Zhu","Junling Li Qian Guo","Xiaoan Wang","Junwen Luo"],"pdf_url":"https://arxiv.org/pdf/2308.15056v1.pdf","comment":"15 pages,10 figures"},{"id":"http://arxiv.org/abs/2303.10452v2","updated":"2023-08-29T06:32:06Z","published":"2023-03-18T16:40:10Z","title":"Confidence Attention and Generalization Enhanced Distillation for\n Continuous Video Domain Adaptation","summary":" Continuous Video Domain Adaptation (CVDA) is a scenario where a source model\nis required to adapt to a series of individually available changing target\ndomains continuously without source data or target supervision. It has wide\napplications, such as robotic vision and autonomous driving. The main\nunderlying challenge of CVDA is to learn helpful information only from the\nunsupervised target data while avoiding forgetting previously learned knowledge\ncatastrophically, which is out of the capability of previous Video-based\nUnsupervised Domain Adaptation methods. Therefore, we propose a\nConfidence-Attentive network with geneRalization enhanced self-knowledge\ndisTillation (CART) to address the challenge in CVDA. Firstly, to learn from\nunsupervised domains, we propose to learn from pseudo labels. However, in\ncontinuous adaptation, prediction errors can accumulate rapidly in pseudo\nlabels, and CART effectively tackles this problem with two key modules.\nSpecifically, The first module generates refined pseudo labels using model\npredictions and deploys a novel attentive learning strategy. The second module\ncompares the outputs of augmented data from the current model to the outputs of\nweakly augmented data from the source model, forming a novel consistency\nregularization on the model to alleviate the accumulation of prediction errors.\nExtensive experiments suggest that the CVDA performance of CART outperforms\nexisting methods by a considerable margin.\n","authors":["Xiyu Wang","Yuecong Xu","Jianfei Yang","Bihan Wen","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2303.10452v2.pdf","comment":"16 pages, 9 tables, 10 figures"},{"id":"http://arxiv.org/abs/2308.15050v1","updated":"2023-08-29T06:20:36Z","published":"2023-08-29T06:20:36Z","title":"iBARLE: imBalance-Aware Room Layout Estimation","summary":" Room layout estimation predicts layouts from a single panorama. It requires\ndatasets with large-scale and diverse room shapes to train the models. However,\nthere are significant imbalances in real-world datasets including the\ndimensions of layout complexity, camera locations, and variation in scene\nappearance. These issues considerably influence the model training performance.\nIn this work, we propose the imBalance-Aware Room Layout Estimation (iBARLE)\nframework to address these issues. iBARLE consists of (1) Appearance Variation\nGeneration (AVG) module, which promotes visual appearance domain\ngeneralization, (2) Complex Structure Mix-up (CSMix) module, which enhances\ngeneralizability w.r.t. room structure, and (3) a gradient-based layout\nobjective function, which allows more effective accounting for occlusions in\ncomplex layouts. All modules are jointly trained and help each other to achieve\nthe best performance. Experiments and ablation studies based on\nZInD~\\cite{cruz2021zillow} dataset illustrate that iBARLE has state-of-the-art\nperformance compared with other layout estimation baselines.\n","authors":["Taotao Jing","Lichen Wang","Naji Khosravan","Zhiqiang Wan","Zachary Bessinger","Zhengming Ding","Sing Bing Kang"],"pdf_url":"https://arxiv.org/pdf/2308.15050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15049v1","updated":"2023-08-29T06:14:06Z","published":"2023-08-29T06:14:06Z","title":"Pose-Free Neural Radiance Fields via Implicit Pose Regularization","summary":" Pose-free neural radiance fields (NeRF) aim to train NeRF with unposed\nmulti-view images and it has achieved very impressive success in recent years.\nMost existing works share the pipeline of training a coarse pose estimator with\nrendered images at first, followed by a joint optimization of estimated poses\nand neural radiance field. However, as the pose estimator is trained with only\nrendered images, the pose estimation is usually biased or inaccurate for real\nimages due to the domain gap between real images and rendered images, leading\nto poor robustness for the pose estimation of real images and further local\nminima in joint optimization. We design IR-NeRF, an innovative pose-free NeRF\nthat introduces implicit pose regularization to refine pose estimator with\nunposed real images and improve the robustness of the pose estimation for real\nimages. With a collection of 2D images of a specific scene, IR-NeRF constructs\na scene codebook that stores scene features and captures the scene-specific\npose distribution implicitly as priors. Thus, the robustness of pose estimation\ncan be promoted with the scene priors according to the rationale that a 2D real\nimage can be well reconstructed from the scene codebook only when its estimated\npose lies within the pose distribution. Extensive experiments show that IR-NeRF\nachieves superior novel view synthesis and outperforms the state-of-the-art\nconsistently across multiple synthetic and real datasets.\n","authors":["Jiahui Zhang","Fangneng Zhan","Yingchen Yu","Kunhao Liu","Rongliang Wu","Xiaoqin Zhang","Ling Shao","Shijian Lu"],"pdf_url":"https://arxiv.org/pdf/2308.15049v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.09121v2","updated":"2023-08-29T06:11:58Z","published":"2023-05-16T03:00:04Z","title":"A Conditional Denoising Diffusion Probabilistic Model for Radio\n Interferometric Image Reconstruction","summary":" In radio astronomy, signals from radio telescopes are transformed into images\nof observed celestial objects, or sources. However, these images, called dirty\nimages, contain real sources as well as artifacts due to signal sparsity and\nother factors. Therefore, radio interferometric image reconstruction is\nperformed on dirty images, aiming to produce clean images in which artifacts\nare reduced and real sources are recovered. So far, existing methods have\nlimited success on recovering faint sources, preserving detailed structures,\nand eliminating artifacts. In this paper, we present VIC-DDPM, a Visibility and\nImage Conditioned Denoising Diffusion Probabilistic Model. Our main idea is to\nuse both the original visibility data in the spectral domain and dirty images\nin the spatial domain to guide the image generation process with DDPM. This\nway, we can leverage DDPM to generate fine details and eliminate noise, while\nutilizing visibility data to separate signals from noise and retaining spatial\ninformation in dirty images. We have conducted experiments in comparison with\nboth traditional methods and recent deep learning based approaches. Our results\nshow that our method significantly improves the resulting images by reducing\nartifacts, preserving fine details, and recovering dim sources. This\nadvancement further facilitates radio astronomical data analysis tasks on\ncelestial phenomena.\n","authors":["Ruoqi Wang","Zhuoyang Chen","Qiong Luo","Feng Wang"],"pdf_url":"https://arxiv.org/pdf/2305.09121v2.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2308.15037v1","updated":"2023-08-29T05:44:00Z","published":"2023-08-29T05:44:00Z","title":"Is it an i or an l: Test-time Adaptation of Text Line Recognition Models","summary":" Recognizing text lines from images is a challenging problem, especially for\nhandwritten documents due to large variations in writing styles. While text\nline recognition models are generally trained on large corpora of real and\nsynthetic data, such models can still make frequent mistakes if the handwriting\nis inscrutable or the image acquisition process adds corruptions, such as\nnoise, blur, compression, etc. Writing style is generally quite consistent for\nan individual, which can be leveraged to correct mistakes made by such models.\nMotivated by this, we introduce the problem of adapting text line recognition\nmodels during test time. We focus on a challenging and realistic setting where,\ngiven only a single test image consisting of multiple text lines, the task is\nto adapt the model such that it performs better on the image, without any\nlabels. We propose an iterative self-training approach that uses feedback from\nthe language model to update the optical model, with confident self-labels in\neach iteration. The confidence measure is based on an augmentation mechanism\nthat evaluates the divergence of the prediction of the model in a local region.\nWe perform rigorous evaluation of our method on several benchmark datasets as\nwell as their corrupted versions. Experimental results on multiple datasets\nspanning multiple scripts show that the proposed adaptation method offers an\nabsolute improvement of up to 8% in character error rate with just a few\niterations of self-training at test time.\n","authors":["Debapriya Tula","Sujoy Paul","Gagan Madan","Peter Garst","Reeve Ingle","Gaurav Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2308.15037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10875v2","updated":"2023-08-29T05:42:49Z","published":"2023-07-20T13:47:30Z","title":"Risk-optimized Outlier Removal for Robust Point Cloud Classification","summary":" With the growth of 3D sensing technology, deep learning system for 3D point\nclouds has become increasingly important, especially in applications like\nautonomous vehicles where safety is a primary concern. However, there are also\ngrowing concerns about the reliability of these systems when they encounter\nnoisy point clouds, whether occurring naturally or introduced with malicious\nintent. This paper highlights the challenges of point cloud classification\nposed by various forms of noise, from simple background noise to malicious\nbackdoor attacks that can intentionally skew model predictions. While there's\nan urgent need for optimized point cloud denoising, current point outlier\nremoval approaches, an essential step for denoising, rely heavily on\nhandcrafted strategies and are not adapted for higher-level tasks, such as\nclassification. To address this issue, we introduce an innovative point outlier\ncleansing method that harnesses the power of downstream classification models.\nBy employing gradient-based attribution analysis, we define a novel concept:\npoint risk. Drawing inspiration from tail risk minimization in finance, we\nrecast the outlier removal process as an optimization problem, named PointCVaR.\nExtensive experiments show that our proposed technique not only robustly\nfilters diverse point cloud outliers but also consistently and significantly\nenhances existing robust methods for point cloud classification.\n","authors":["Xinke Li","Junchi Lu","Henghui Ding","Changsheng Sun","Joey Tianyi Zhou","Chee Yeow Meng"],"pdf_url":"https://arxiv.org/pdf/2307.10875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05021v4","updated":"2023-08-29T05:20:36Z","published":"2023-03-09T03:48:24Z","title":"DiffusionDepth: Diffusion Denoising Approach for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a challenging task that predicts the pixel-wise\ndepth from a single 2D image. Current methods typically model this problem as a\nregression or classification task. We propose DiffusionDepth, a new approach\nthat reformulates monocular depth estimation as a denoising diffusion process.\nIt learns an iterative denoising process to `denoise' random depth distribution\ninto a depth map with the guidance of monocular visual conditions. The process\nis performed in the latent space encoded by a dedicated depth encoder and\ndecoder. Instead of diffusing ground truth (GT) depth, the model learns to\nreverse the process of diffusing the refined depth of itself into random depth\ndistribution. This self-diffusion formulation overcomes the difficulty of\napplying generative models to sparse GT depth scenarios. The proposed approach\nbenefits this task by refining depth estimation step by step, which is superior\nfor generating accurate and highly detailed depth maps. Experimental results on\nKITTI and NYU-Depth-V2 datasets suggest that a simple yet efficient diffusion\napproach could reach state-of-the-art performance in both indoor and outdoor\nscenarios with acceptable inference time.\n","authors":["Yiqun Duan","Xianda Guo","Zheng Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.05021v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14066v2","updated":"2023-08-29T04:59:41Z","published":"2023-08-27T10:39:33Z","title":"Bi-Modality Medical Image Synthesis Using Semi-Supervised Sequential\n Generative Adversarial Networks","summary":" In this paper, we propose a bi-modality medical image synthesis approach\nbased on sequential generative adversarial network (GAN) and semi-supervised\nlearning. Our approach consists of two generative modules that synthesize\nimages of the two modalities in a sequential order. A method for measuring the\nsynthesis complexity is proposed to automatically determine the synthesis order\nin our sequential GAN. Images of the modality with a lower complexity are\nsynthesized first, and the counterparts with a higher complexity are generated\nlater. Our sequential GAN is trained end-to-end in a semi-supervised manner. In\nsupervised training, the joint distribution of bi-modality images are learned\nfrom real paired images of the two modalities by explicitly minimizing the\nreconstruction losses between the real and synthetic images. To avoid\noverfitting limited training images, in unsupervised training, the marginal\ndistribution of each modality is learned based on unpaired images by minimizing\nthe Wasserstein distance between the distributions of real and fake images. We\ncomprehensively evaluate the proposed model using two synthesis tasks based on\nthree types of evaluate metrics and user studies. Visual and quantitative\nresults demonstrate the superiority of our method to the state-of-the-art\nmethods, and reasonable visual quality and clinical significance. Code is made\npublicly available at\nhttps://github.com/hustlinyi/Multimodal-Medical-Image-Synthesis.\n","authors":["Xin Yang","Yi Lin","Zhiwei Wang","Xin Li","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.14066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15019v1","updated":"2023-08-29T04:46:52Z","published":"2023-08-29T04:46:52Z","title":"Pyramid diffractive optical networks for unidirectional magnification\n and demagnification","summary":" Diffractive deep neural networks (D2NNs) are composed of successive\ntransmissive layers optimized using supervised deep learning to all-optically\nimplement various computational tasks between an input and output field-of-view\n(FOV). Here, we present a pyramid-structured diffractive optical network design\n(which we term P-D2NN), optimized specifically for unidirectional image\nmagnification and demagnification. In this P-D2NN design, the diffractive\nlayers are pyramidally scaled in alignment with the direction of the image\nmagnification or demagnification. Our analyses revealed the efficacy of this\nP-D2NN design in unidirectional image magnification and demagnification tasks,\nproducing high-fidelity magnified or demagnified images in only one direction,\nwhile inhibiting the image formation in the opposite direction - confirming the\ndesired unidirectional imaging operation. Compared to the conventional D2NN\ndesigns with uniform-sized successive diffractive layers, P-D2NN design\nachieves similar performance in unidirectional magnification tasks using only\nhalf of the diffractive degrees of freedom within the optical processor volume.\nFurthermore, it maintains its unidirectional image\nmagnification/demagnification functionality across a large band of illumination\nwavelengths despite being trained with a single illumination wavelength. With\nthis pyramidal architecture, we also designed a wavelength-multiplexed\ndiffractive network, where a unidirectional magnifier and a unidirectional\ndemagnifier operate simultaneously in opposite directions, at two distinct\nillumination wavelengths. The efficacy of the P-D2NN architecture was also\nvalidated experimentally using monochromatic terahertz illumination,\nsuccessfully matching our numerical simulations. P-D2NN offers a\nphysics-inspired strategy for designing task-specific visual processors.\n","authors":["Bijie Bai","Xilin Yang","Tianyi Gan","Jingxi Li","Deniz Mengu","Mona Jarrahi","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2308.15019v1.pdf","comment":"26 Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2308.15016v1","updated":"2023-08-29T04:39:07Z","published":"2023-08-29T04:39:07Z","title":"C2G2: Controllable Co-speech Gesture Generation with Latent Diffusion\n Model","summary":" Co-speech gesture generation is crucial for automatic digital avatar\nanimation. However, existing methods suffer from issues such as unstable\ntraining and temporal inconsistency, particularly in generating high-fidelity\nand comprehensive gestures. Additionally, these methods lack effective control\nover speaker identity and temporal editing of the generated gestures. Focusing\non capturing temporal latent information and applying practical controlling, we\npropose a Controllable Co-speech Gesture Generation framework, named C2G2.\nSpecifically, we propose a two-stage temporal dependency enhancement strategy\nmotivated by latent diffusion models. We further introduce two key features to\nC2G2, namely a speaker-specific decoder to generate speaker-related real-length\nskeletons and a repainting strategy for flexible gesture generation/editing.\nExtensive experiments on benchmark gesture datasets verify the effectiveness of\nour proposed C2G2 compared with several state-of-the-art baselines. The link of\nthe project demo page can be found at https://c2g2-gesture.github.io/c2_gesture\n","authors":["Longbin Ji","Pengfei Wei","Yi Ren","Jinglin Liu","Chen Zhang","Xiang Yin"],"pdf_url":"https://arxiv.org/pdf/2308.15016v1.pdf","comment":"12 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2307.11058v2","updated":"2023-08-29T04:10:18Z","published":"2023-07-20T17:38:55Z","title":"Anticipating Driving Behavior through Deep Learning-Based Policy\n Prediction","summary":" In this endeavor, we developed a comprehensive system that processes\nintegrated visual features derived from video frames captured by a regular\ncamera, along with depth details obtained from a point cloud scanner. This\nsystem is designed to anticipate driving actions, encompassing both vehicle\nspeed and steering angle. To ensure its reliability, we conducted assessments\nwhere we juxtaposed the projected outcomes with the established norms adhered\nto by skilled real-world drivers. Our evaluation outcomes indicate that the\nforecasts achieve a noteworthy level of accuracy in a minimum of half the test\nscenarios (ranging around 50-80%, contingent on the specific model). Notably,\nthe utilization of amalgamated features yielded superior performance in\ncomparison to using video frames in isolation, as demonstrated by most of the\ncases.\n","authors":["Fuxiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.11058v2.pdf","comment":"5 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.15005v1","updated":"2023-08-29T03:54:26Z","published":"2023-08-29T03:54:26Z","title":"Few-Shot Object Detection via Synthetic Features with Optimal Transport","summary":" Few-shot object detection aims to simultaneously localize and classify the\nobjects in an image with limited training samples. However, most existing\nfew-shot object detection methods focus on extracting the features of a few\nsamples of novel classes that lack diversity. Hence, they may not be sufficient\nto capture the data distribution. To address that limitation, in this paper, we\npropose a novel approach in which we train a generator to generate synthetic\ndata for novel classes. Still, directly training a generator on the novel class\nis not effective due to the lack of novel data. To overcome that issue, we\nleverage the large-scale dataset of base classes. Our overarching goal is to\ntrain a generator that captures the data variations of the base dataset. We\nthen transform the captured variations into novel classes by generating\nsynthetic data with the trained generator. To encourage the generator to\ncapture data variations on base classes, we propose to train the generator with\nan optimal transport loss that minimizes the optimal transport distance between\nthe distributions of real and synthetic data. Extensive experiments on two\nbenchmark datasets demonstrate that the proposed method outperforms the state\nof the art. Source code will be available.\n","authors":["Anh-Khoa Nguyen Vu","Thanh-Toan Do","Vinh-Tiep Nguyen","Tam Le","Minh-Triet Tran","Tam V. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.15005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15004v1","updated":"2023-08-29T03:41:27Z","published":"2023-08-29T03:41:27Z","title":"PBFormer: Capturing Complex Scene Text Shape with Polynomial Band\n Transformer","summary":" We present PBFormer, an efficient yet powerful scene text detector that\nunifies the transformer with a novel text shape representation Polynomial Band\n(PB). The representation has four polynomial curves to fit a text's top,\nbottom, left, and right sides, which can capture a text with a complex shape by\nvarying polynomial coefficients. PB has appealing features compared with\nconventional representations: 1) It can model different curvatures with a fixed\nnumber of parameters, while polygon-points-based methods need to utilize a\ndifferent number of points. 2) It can distinguish adjacent or overlapping texts\nas they have apparent different curve coefficients, while segmentation-based or\npoints-based methods suffer from adhesive spatial positions. PBFormer combines\nthe PB with the transformer, which can directly generate smooth text contours\nsampled from predicted curves without interpolation. A parameter-free\ncross-scale pixel attention (CPA) module is employed to highlight the feature\nmap of a suitable scale while suppressing the other feature maps. The simple\noperation can help detect small-scale texts and is compatible with the\none-stage DETR framework, where no postprocessing exists for NMS. Furthermore,\nPBFormer is trained with a shape-contained loss, which not only enforces the\npiecewise alignment between the ground truth and the predicted curves but also\nmakes curves' positions and shapes consistent with each other. Without bells\nand whistles about text pre-training, our method is superior to the previous\nstate-of-the-art text detectors on the arbitrary-shaped text datasets.\n","authors":["Ruijin Liu","Ning Lu","Dapeng Chen","Cheng Li","Zejian Yuan","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2308.15004v1.pdf","comment":"9 pages, 8 figures, accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2307.00290v2","updated":"2023-08-29T03:31:58Z","published":"2023-07-01T10:12:46Z","title":"All-in-SAM: from Weak Annotation to Pixel-wise Nuclei Segmentation with\n Prompt-based Finetuning","summary":" The Segment Anything Model (SAM) is a recently proposed prompt-based\nsegmentation model in a generic zero-shot segmentation approach. With the\nzero-shot segmentation capacity, SAM achieved impressive flexibility and\nprecision on various segmentation tasks. However, the current pipeline requires\nmanual prompts during the inference stage, which is still resource intensive\nfor biomedical image segmentation. In this paper, instead of using prompts\nduring the inference stage, we introduce a pipeline that utilizes the SAM,\ncalled all-in-SAM, through the entire AI development workflow (from annotation\ngeneration to model finetuning) without requiring manual prompts during the\ninference stage. Specifically, SAM is first employed to generate pixel-level\nannotations from weak prompts (e.g., points, bounding box). Then, the\npixel-level annotations are used to finetune the SAM segmentation model rather\nthan training from scratch. Our experimental results reveal two key findings:\n1) the proposed pipeline surpasses the state-of-the-art (SOTA) methods in a\nnuclei segmentation task on the public Monuseg dataset, and 2) the utilization\nof weak and few annotations for SAM finetuning achieves competitive performance\ncompared to using strong pixel-wise annotated data.\n","authors":["Can Cui","Ruining Deng","Quan Liu","Tianyuan Yao","Shunxing Bao","Lucas W. Remedios","Yucheng Tang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2307.00290v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14995v1","updated":"2023-08-29T02:50:36Z","published":"2023-08-29T02:50:36Z","title":"WSAM: Visual Explanations from Style Augmentation as Adversarial\n Attacker and Their Influence in Image Classification","summary":" Currently, style augmentation is capturing attention due to convolutional\nneural networks (CNN) being strongly biased toward recognizing textures rather\nthan shapes. Most existing styling methods either perform a low-fidelity style\ntransfer or a weak style representation in the embedding vector. This paper\noutlines a style augmentation algorithm using stochastic-based sampling with\nnoise addition to improving randomization on a general linear transformation\nfor style transfer. With our augmentation strategy, all models not only present\nincredible robustness against image stylizing but also outperform all previous\nmethods and surpass the state-of-the-art performance for the STL-10 dataset. In\naddition, we present an analysis of the model interpretations under different\nstyle variations. At the same time, we compare comprehensive experiments\ndemonstrating the performance when applied to deep neural architectures in\ntraining settings.\n","authors":["Felipe Moreno-Vera","Edgar Medina","Jorge Poco"],"pdf_url":"https://arxiv.org/pdf/2308.14995v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.14221v2","updated":"2023-08-29T02:50:25Z","published":"2023-08-27T22:45:24Z","title":"High-Resolution Document Shadow Removal via A Large-Scale Real-World\n Dataset and A Frequency-Aware Shadow Erasing Net","summary":" Shadows often occur when we capture the documents with casual equipment,\nwhich influences the visual quality and readability of the digital copies.\nDifferent from the algorithms for natural shadow removal, the algorithms in\ndocument shadow removal need to preserve the details of fonts and figures in\nhigh-resolution input. Previous works ignore this problem and remove the\nshadows via approximate attention and small datasets, which might not work in\nreal-world situations. We handle high-resolution document shadow removal\ndirectly via a larger-scale real-world dataset and a carefully designed\nfrequency-aware network. As for the dataset, we acquire over 7k couples of\nhigh-resolution (2462 x 3699) images of real-world document pairs with various\nsamples under different lighting circumstances, which is 10 times larger than\nexisting datasets. As for the design of the network, we decouple the\nhigh-resolution images in the frequency domain, where the low-frequency details\nand high-frequency boundaries can be effectively learned via the carefully\ndesigned network structure. Powered by our network and dataset, the proposed\nmethod clearly shows a better performance than previous methods in terms of\nvisual quality and numerical results. The code, models, and dataset are\navailable at: https://github.com/CXH-Research/DocShadow-SD7K\n","authors":["Zinuo Li","Xuhang Chen","Chi-Man Pun","Xiaodong Cun"],"pdf_url":"https://arxiv.org/pdf/2308.14221v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.14994v1","updated":"2023-08-29T02:49:16Z","published":"2023-08-29T02:49:16Z","title":"ICARUS: An Android-Based Unmanned Aerial Vehicle (UAV) Search and Rescue\n Eye in the Sky","summary":" The purpose of this paper is to develop an unmanned aerial vehicle (UAV)\nusing a quadcopter with the capability of video surveillance, map coordinates,\na deployable parachute with a medicine kit or a food pack as a payload, a\ncollision warning system, remotely controlled, integrated with an android\napplication to assist in search and rescue operations.\n Applied research for the development of the functional prototype,\nquantitative and descriptive statistics to summarize data by describing the\nrelationship between variables in a sample or population. The quadcopter\nunderwent an evaluation using a survey instrument to test its acceptability\nusing predefined variables to select respondents within Caloocan City and\nQuezon City, Philippines.\n Demographic profiles and known issues and concerns were answered by 30\nrespondents. The results were summarized and distributed in Tables 1 and 2.\n In terms of demographic profiles, the number of SAR operators within the\nspecified areas is distributed equally, most are male, single, and within the\nage bracket of 31 and above. In issues and concerns, the most common type of\nsearch and rescue was ground search and rescue. Human error is the primary\ncause of most injuries in operating units. The prototype was useful and\neveryone agreed, in terms of acceptability, drone technology will improve\nsearch and rescue operations.\n The innovative way of utilizing Android and drone technology is a new step\ntowards the improvement of SAR operations in the Philippines.\n The LiPo battery must be replaced with a higher capacity and the drone\noperator should undergo a training course and secure a permit from the Civil\nAviation Authority of the Philippines (CAAP).\n","authors":["Manuel Luis C. Delos Santos","Jerum B. Dasalla","Jomar C. Feliciano","Dustin Red B. Cabatay"],"pdf_url":"https://arxiv.org/pdf/2308.14994v1.pdf","comment":"15 pages, 14 figures, Special Issue: IRCCETE 2023"},{"id":"http://arxiv.org/abs/2210.00429v2","updated":"2023-08-29T02:32:22Z","published":"2022-10-02T05:34:19Z","title":"ROSIA: Rotation-Search-Based Star Identification Algorithm","summary":" This paper presents a rotation-search-based approach for addressing the star\nidentification (Star-ID) problem. The proposed algorithm, ROSIA, is a\nheuristics-free algorithm that seeks the optimal rotation that maximally aligns\nthe input and catalog stars in their respective coordinates. ROSIA searches the\nrotation space systematically with the Branch-and-Bound (BnB) method. Crucially\naffecting the runtime feasibility of ROSIA is the upper bound function that\nprioritizes the search space. In this paper, we make a theoretical contribution\nby proposing a tight (provable) upper bound function that enables a 400x\nspeed-up compared to an existing formulation. Coupling the bounding function\nwith an efficient evaluation scheme that leverages stereographic projection and\nthe R-tree data structure, ROSIA achieves feasible operational speed on\nembedded processors with state-of-the-art performances under different sources\nof noise. The source code of ROSIA is available at\nhttps://github.com/ckchng/ROSIA.\n","authors":["Chee-Kheng Chng","Alvaro Parra Bustos","Benjamin McCarthy","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2210.00429v2.pdf","comment":"21 pages, 16 figures, Accepted to IEEE Transactions on Aerospace and\n Electronic Systems"},{"id":"http://arxiv.org/abs/2308.14978v1","updated":"2023-08-29T02:09:56Z","published":"2023-08-29T02:09:56Z","title":"Vision Grid Transformer for Document Layout Analysis","summary":" Document pre-trained models and grid-based models have proven to be very\neffective on various tasks in Document AI. However, for the document layout\nanalysis (DLA) task, existing document pre-trained models, even those\npre-trained in a multi-modal fashion, usually rely on either textual features\nor visual features. Grid-based models for DLA are multi-modality but largely\nneglect the effect of pre-training. To fully leverage multi-modal information\nand exploit pre-training techniques to learn better representation for DLA, in\nthis paper, we present VGT, a two-stream Vision Grid Transformer, in which Grid\nTransformer (GiT) is proposed and pre-trained for 2D token-level and\nsegment-level semantic understanding. Furthermore, a new dataset named D$^4$LA,\nwhich is so far the most diverse and detailed manually-annotated benchmark for\ndocument layout analysis, is curated and released. Experiment results have\nillustrated that the proposed VGT model achieves new state-of-the-art results\non DLA tasks, e.g. PubLayNet ($95.7\\%$$\\rightarrow$$96.2\\%$), DocBank\n($79.6\\%$$\\rightarrow$$84.1\\%$), and D$^4$LA ($67.7\\%$$\\rightarrow$$68.8\\%$).\nThe code and models as well as the D$^4$LA dataset will be made publicly\navailable ~\\url{https://github.com/AlibabaResearch/AdvancedLiterateMachinery}.\n","authors":["Cheng Da","Chuwei Luo","Qi Zheng","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2308.14978v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03998v2","updated":"2023-08-29T01:57:19Z","published":"2023-08-08T02:28:48Z","title":"Real-time Strawberry Detection Based on Improved YOLOv5s Architecture\n for Robotic Harvesting in open-field environment","summary":" This study proposed a YOLOv5-based custom object detection model to detect\nstrawberries in an outdoor environment. The original architecture of the\nYOLOv5s was modified by replacing the C3 module with the C2f module in the\nbackbone network, which provided a better feature gradient flow. Secondly, the\nSpatial Pyramid Pooling Fast in the final layer of the backbone network of\nYOLOv5s was combined with Cross Stage Partial Net to improve the generalization\nability over the strawberry dataset in this study. The proposed architecture\nwas named YOLOv5s-Straw. The RGB images dataset of the strawberry canopy with\nthree maturity classes (immature, nearly mature, and mature) was collected in\nopen-field environment and augmented through a series of operations including\nbrightness reduction, brightness increase, and noise adding. To verify the\nsuperiority of the proposed method for strawberry detection in open-field\nenvironment, four competitive detection models (YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s) were trained, and tested under the same computational\nenvironment and compared with YOLOv5s-Straw. The results showed that the\nhighest mean average precision of 80.3% was achieved using the proposed\narchitecture whereas the same was achieved with YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s were 73.4%, 77.8%, 79.8%, 79.3%, respectively.\nSpecifically, the average precision of YOLOv5s-Straw was 82.1% in the immature\nclass, 73.5% in the nearly mature class, and 86.6% in the mature class, which\nwere 2.3% and 3.7%, respectively, higher than that of the latest YOLOv8s. The\nmodel included 8.6*10^6 network parameters with an inference speed of 18ms per\nimage while the inference speed of YOLOv8s had a slower inference speed of\n21.0ms and heavy parameters of 11.1*10^6, which indicates that the proposed\nmodel is fast enough for real time strawberry detection and localization for\nthe robotic picking.\n","authors":["Zixuan He","Salik Ram Khana","Xin Zhang","Manoj Karkee","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03998v2.pdf","comment":"20 pages; 15 figures"},{"id":"http://arxiv.org/abs/2308.14969v1","updated":"2023-08-29T01:47:49Z","published":"2023-08-29T01:47:49Z","title":"Reprogramming under constraints: Revisiting efficient and reliable\n transferability of lottery tickets","summary":" In the era of foundation models with huge pre-training budgets, the\ndownstream tasks have been shifted to the narrative of efficient and fast\nadaptation. For classification-based tasks in the domain of computer vision,\nthe two most efficient approaches have been linear probing (LP) and visual\nprompting/reprogramming (VP); the former aims to learn a classifier in the form\nof a linear head on the features extracted by the pre-trained model, while the\nlatter maps the input data to the domain of the source data on which the model\nwas originally pre-trained on. Although extensive studies have demonstrated the\ndifferences between LP and VP in terms of downstream performance, we explore\nthe capabilities of the two aforementioned methods via the sparsity axis: (a)\nData sparsity: the impact of few-shot adaptation and (b) Model sparsity: the\nimpact of lottery tickets (LT). We demonstrate that LT are not universal\nreprogrammers, i.e., for certain target datasets, reprogramming an LT yields\nsignificantly lower performance than the reprogrammed dense model although\ntheir corresponding upstream performance is similar. Further, we demonstrate\nthat the calibration of dense models is always superior to that of their\nlottery ticket counterparts under both LP and VP regimes. Our empirical study\nopens a new avenue of research into VP for sparse models and encourages further\nunderstanding of the performance beyond the accuracy achieved by VP under\nconstraints of sparsity. Code and logs can be accessed at\n\\url{https://github.com/landskape-ai/Reprogram_LT}.\n","authors":["Diganta Misra","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.14965v1","updated":"2023-08-29T01:34:33Z","published":"2023-08-29T01:34:33Z","title":"CEFHRI: A Communication Efficient Federated Learning Framework for\n Recognizing Industrial Human-Robot Interaction","summary":" Human-robot interaction (HRI) is a rapidly growing field that encompasses\nsocial and industrial applications. Machine learning plays a vital role in\nindustrial HRI by enhancing the adaptability and autonomy of robots in complex\nenvironments. However, data privacy is a crucial concern in the interaction\nbetween humans and robots, as companies need to protect sensitive data while\nmachine learning algorithms require access to large datasets. Federated\nLearning (FL) offers a solution by enabling the distributed training of models\nwithout sharing raw data. Despite extensive research on Federated learning (FL)\nfor tasks such as natural language processing (NLP) and image classification,\nthe question of how to use FL for HRI remains an open research problem. The\ntraditional FL approach involves transmitting large neural network parameter\nmatrices between the server and clients, which can lead to high communication\ncosts and often becomes a bottleneck in FL. This paper proposes a\ncommunication-efficient FL framework for human-robot interaction (CEFHRI) to\naddress the challenges of data heterogeneity and communication costs. The\nframework leverages pre-trained models and introduces a trainable\nspatiotemporal adapter for video understanding tasks in HRI. Experimental\nresults on three human-robot interaction benchmark datasets: HRI30, InHARD, and\nCOIN demonstrate the superiority of CEFHRI over full fine-tuning in terms of\ncommunication costs. The proposed methodology provides a secure and efficient\napproach to HRI federated learning, particularly in industrial environments\nwith data privacy concerns and limited communication bandwidth. Our code is\navailable at\nhttps://github.com/umarkhalidAI/CEFHRI-Efficient-Federated-Learning.\n","authors":["Umar Khalid","Hasan Iqbal","Saeed Vahidian","Jing Hua","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14965v1.pdf","comment":"Accepted in IROS 2023"},{"id":"http://arxiv.org/abs/2308.14960v1","updated":"2023-08-29T01:22:30Z","published":"2023-08-29T01:22:30Z","title":"Read-only Prompt Optimization for Vision-Language Few-shot Learning","summary":" In recent years, prompt tuning has proven effective in adapting pre-trained\nvision-language models to downstream tasks. These methods aim to adapt the\npre-trained models by introducing learnable prompts while keeping pre-trained\nweights frozen. However, learnable prompts can affect the internal\nrepresentation within the self-attention module, which may negatively impact\nperformance variance and generalization, especially in data-deficient settings.\nTo address these issues, we propose a novel approach, Read-only Prompt\nOptimization (RPO). RPO leverages masked attention to prevent the internal\nrepresentation shift in the pre-trained model. Further, to facilitate the\noptimization of RPO, the read-only prompts are initialized based on special\ntokens of the pre-trained model. Our extensive experiments demonstrate that RPO\noutperforms CLIP and CoCoOp in base-to-new generalization and domain\ngeneralization while displaying better robustness. Also, the proposed method\nachieves better generalization on extremely data-deficient settings, while\nimproving parameter efficiency and computational overhead. Code is available at\nhttps://github.com/mlvlab/RPO.\n","authors":["Dongjun Lee","Seokwon Song","Jihee Suh","Joonmyeong Choi","Sanghyeok Lee","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14960v1.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2306.03454v2","updated":"2023-08-29T01:20:04Z","published":"2023-06-06T07:17:56Z","title":"Benchmarking Robustness of AI-Enabled Multi-sensor Fusion Systems:\n Challenges and Opportunities","summary":" Multi-Sensor Fusion (MSF) based perception systems have been the foundation\nin supporting many industrial applications and domains, such as self-driving\ncars, robotic arms, and unmanned aerial vehicles. Over the past few years, the\nfast progress in data-driven artificial intelligence (AI) has brought a\nfast-increasing trend to empower MSF systems by deep learning techniques to\nfurther improve performance, especially on intelligent systems and their\nperception systems. Although quite a few AI-enabled MSF perception systems and\ntechniques have been proposed, up to the present, limited benchmarks that focus\non MSF perception are publicly available. Given that many intelligent systems\nsuch as self-driving cars are operated in safety-critical contexts where\nperception systems play an important role, there comes an urgent need for a\nmore in-depth understanding of the performance and reliability of these MSF\nsystems. To bridge this gap, we initiate an early step in this direction and\nconstruct a public benchmark of AI-enabled MSF-based perception systems\nincluding three commonly adopted tasks (i.e., object detection, object\ntracking, and depth completion). Based on this, to comprehensively understand\nMSF systems' robustness and reliability, we design 14 common and realistic\ncorruption patterns to synthesize large-scale corrupted datasets. We further\nperform a systematic evaluation of these systems through our large-scale\nevaluation. Our results reveal the vulnerability of the current AI-enabled MSF\nperception systems, calling for researchers and practitioners to take\nrobustness and reliability into account when designing AI-enabled MSF.\n","authors":["Xinyu Gao","Zhijie Wang","Yang Feng","Lei Ma","Zhenyu Chen","Baowen Xu"],"pdf_url":"https://arxiv.org/pdf/2306.03454v2.pdf","comment":"To appear in ESEC/FSE 2023"},{"id":"http://arxiv.org/abs/2305.14713v2","updated":"2023-08-29T01:10:15Z","published":"2023-05-24T04:30:25Z","title":"Streaming Object Detection on Fisheye Cameras for Automatic Parking","summary":" Fisheye cameras are widely employed in automatic parking, and the video\nstream object detection (VSOD) of the fisheye camera is a fundamental\nperception function to ensure the safe operation of vehicles. In past research\nwork, the difference between the output of the deep learning model and the\nactual situation at the current moment due to the existence of delay of the\nperception system is generally ignored. But the environment will inevitably\nchange within the delay time which may cause a potential safety hazard. In this\npaper, we propose a real-time detection framework equipped with a dual-flow\nperception module (dynamic and static flows) that can predict the future and\nalleviate the time-lag problem. Meanwhile, we use a new scheme to evaluate\nlatency and accuracy. The standard bounding box is unsuitable for the object in\nfisheye camera images due to the strong radial distortion of the fisheye camera\nand the primary detection objects of parking perception are vehicles and\npedestrians, so we adopt the rotate bounding box and propose a new periodic\nangle loss function to regress the angle of the box, which is the simple and\naccurate representation method of objects. The instance segmentation ground\ntruth is used to supervise the training. Experiments demonstrate the\neffectiveness of our approach. Code is released at:\nhttps://gitee.com/hiyanyx/fisheye-streaming-perception.\n","authors":["Yixiong Yan","Liangzhu Cheng","Yongxu Li","Xinjuan Tuo"],"pdf_url":"https://arxiv.org/pdf/2305.14713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17209v2","updated":"2023-08-29T23:47:49Z","published":"2023-03-30T08:05:59Z","title":"Human from Blur: Human Pose Tracking from Blurry Images","summary":" We propose a method to estimate 3D human poses from substantially blurred\nimages. The key idea is to tackle the inverse problem of image deblurring by\nmodeling the forward problem with a 3D human model, a texture map, and a\nsequence of poses to describe human motion. The blurring process is then\nmodeled by a temporal image aggregation step. Using a differentiable renderer,\nwe can solve the inverse problem by backpropagating the pixel-wise reprojection\nerror to recover the best human motion representation that explains a single or\nmultiple input images. Since the image reconstruction loss alone is\ninsufficient, we present additional regularization terms. To the best of our\nknowledge, we present the first method to tackle this problem. Our method\nconsistently outperforms other methods on significantly blurry inputs since\nthey lack one or multiple key functionalities that our method unifies, i.e.\nimage deblurring with sub-frame accuracy and explicit 3D modeling of non-rigid\nhuman motion.\n","authors":["Yiming Zhao","Denys Rozumnyi","Jie Song","Otmar Hilliges","Marc Pollefeys","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2303.17209v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15670v1","updated":"2023-08-29T23:45:54Z","published":"2023-08-29T23:45:54Z","title":"Multimodal Foundation Models For Echocardiogram Interpretation","summary":" Multimodal deep learning foundation models can learn the relationship between\nimages and text. In the context of medical imaging, mapping images to language\nconcepts reflects the clinical task of diagnostic image interpretation, however\ncurrent general-purpose foundation models do not perform well in this context\nbecause their training corpus have limited medical text and images. To address\nthis challenge and account for the range of cardiac physiology, we leverage\n1,032,975 cardiac ultrasound videos and corresponding expert interpretations to\ndevelop EchoCLIP, a multimodal foundation model for echocardiography. EchoCLIP\ndisplays strong zero-shot (not explicitly trained) performance in cardiac\nfunction assessment (external validation left ventricular ejection fraction\nmean absolute error (MAE) of 7.1%) and identification of implanted intracardiac\ndevices (areas under the curve (AUC) between 0.84 and 0.98 for pacemakers and\nartificial heart valves). We also developed a long-context variant (EchoCLIP-R)\nwith a custom echocardiography report text tokenizer which can accurately\nidentify unique patients across multiple videos (AUC of 0.86), identify\nclinical changes such as orthotopic heart transplants (AUC of 0.79) or cardiac\nsurgery (AUC 0.77), and enable robust image-to-text search (mean cross-modal\nretrieval rank in the top 1% of candidate text reports). These emergent\ncapabilities can be used for preliminary assessment and summarization of\nechocardiographic findings.\n","authors":["Matthew Christensen","Milos Vukadinovic","Neal Yuan","David Ouyang"],"pdf_url":"https://arxiv.org/pdf/2308.15670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15667v1","updated":"2023-08-29T23:35:36Z","published":"2023-08-29T23:35:36Z","title":"Bridging Distribution Learning and Image Clustering in High-dimensional\n Space","summary":" Distribution learning focuses on learning the probability density function\nfrom a set of data samples. In contrast, clustering aims to group similar\nobjects together in an unsupervised manner. Usually, these two tasks are\nconsidered unrelated. However, the relationship between the two may be\nindirectly correlated, with Gaussian Mixture Models (GMM) acting as a bridge.\nIn this paper, we focus on exploring the correlation between distribution\nlearning and clustering, with the motivation to fill the gap between these two\nfields, utilizing an autoencoder (AE) to encode images into a high-dimensional\nlatent space. Then, Monte-Carlo Marginalization (MCMarg) and Kullback-Leibler\n(KL) divergence loss are used to fit the Gaussian components of the GMM and\nlearn the data distribution. Finally, image clustering is achieved through each\nGaussian component of GMM. Yet, the \"curse of dimensionality\" poses severe\nchallenges for most clustering algorithms. Compared with the classic\nExpectation-Maximization (EM) Algorithm, experimental results show that MCMarg\nand KL divergence can greatly alleviate the difficulty. Based on the\nexperimental results, we believe distribution learning can exploit the\npotential of GMM in image clustering within high-dimensional space.\n","authors":["Guanfang Dong","Chenqiu Zhao","Anup Basu"],"pdf_url":"https://arxiv.org/pdf/2308.15667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15660v1","updated":"2023-08-29T22:43:46Z","published":"2023-08-29T22:43:46Z","title":"Unveiling Camouflage: A Learnable Fourier-based Augmentation for\n Camouflaged Object Detection and Instance Segmentation","summary":" Camouflaged object detection (COD) and camouflaged instance segmentation\n(CIS) aim to recognize and segment objects that are blended into their\nsurroundings, respectively. While several deep neural network models have been\nproposed to tackle those tasks, augmentation methods for COD and CIS have not\nbeen thoroughly explored. Augmentation strategies can help improve the\nperformance of models by increasing the size and diversity of the training data\nand exposing the model to a wider range of variations in the data. Besides, we\naim to automatically learn transformations that help to reveal the underlying\nstructure of camouflaged objects and allow the model to learn to better\nidentify and segment camouflaged objects. To achieve this, we propose a\nlearnable augmentation method in the frequency domain for COD and CIS via\nFourier transform approach, dubbed CamoFourier. Our method leverages a\nconditional generative adversarial network and cross-attention mechanism to\ngenerate a reference image and an adaptive hybrid swapping with parameters to\nmix the low-frequency component of the reference image and the high-frequency\ncomponent of the input image. This approach aims to make camouflaged objects\nmore visible for detection and segmentation models. Without bells and whistles,\nour proposed augmentation method boosts the performance of camouflaged object\ndetectors and camouflaged instance segmenters by large margins.\n","authors":["Minh-Quan Le","Minh-Triet Tran","Trung-Nghia Le","Tam V. Nguyen","Thanh-Toan Do"],"pdf_url":"https://arxiv.org/pdf/2308.15660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11418v2","updated":"2023-08-29T22:36:22Z","published":"2023-01-26T21:09:45Z","title":"Parkinson gait modelling from an anomaly deep representation","summary":" Parkinson's Disease (PD) is associated with gait movement disorders, such as\nbradykinesia, stiffness, tremors and postural instability, caused by\nprogressive dopamine deficiency. Today, some approaches have implemented\nlearning representations to quantify kinematic patterns during locomotion,\nsupporting clinical procedures such as diagnosis and treatment planning. These\napproaches assumes a large amount of stratified and labeled data to optimize\ndiscriminative representations. Nonetheless these considerations may restrict\nthe approaches to be operable in real scenarios during clinical practice. This\nwork introduces a self-supervised generative representation to learn\ngait-motion-related patterns, under the pretext of video reconstruction and an\nanomaly detection framework. This architecture is trained following a one-class\nweakly supervised learning to avoid inter-class variance and approach the\nmultiple relationships that represent locomotion. The proposed approach was\nvalidated with 14 PD patients and 23 control subjects, and trained with the\ncontrol population only, achieving an AUC of 95%, homocedasticity level of 70%\nand shapeness level of 70% in the classification task considering its\ngeneralization.\n","authors":["Edgar Rangel","Fabio Martinez"],"pdf_url":"https://arxiv.org/pdf/2301.11418v2.pdf","comment":"Journal not submitted to any editorial"},{"id":"http://arxiv.org/abs/2305.02422v3","updated":"2023-08-29T22:12:04Z","published":"2023-05-03T20:29:04Z","title":"GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content","summary":" The mobile cloud gaming industry has been rapidly growing over the last\ndecade. When streaming gaming videos are transmitted to customers' client\ndevices from cloud servers, algorithms that can monitor distorted video quality\nwithout having any reference video available are desirable tools. However,\ncreating No-Reference Video Quality Assessment (NR VQA) models that can\naccurately predict the quality of streaming gaming videos rendered by computer\ngraphics engines is a challenging problem, since gaming content generally\ndiffers statistically from naturalistic videos, often lacks detail, and\ncontains many smooth regions. Until recently, the problem has been further\ncomplicated by the lack of adequate subjective quality databases of mobile\ngaming content. We have created a new gaming-specific NR VQA model called the\nGaming Video Quality Evaluator (GAMIVAL), which combines and leverages the\nadvantages of spatial and temporal gaming distorted scene statistics models, a\nneural noise model, and deep semantic features. Using a support vector\nregression (SVR) as a regressor, GAMIVAL achieves superior performance on the\nnew LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database.\n","authors":["Yu-Chih Chen","Avinab Saha","Chase Davis","Bo Qiu","Xiaoming Wang","Rahul Gowda","Ioannis Katsavounidis","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2305.02422v3.pdf","comment":"Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been\n made available online: https://github.com/lskdream/GAMIVAL"},{"id":"http://arxiv.org/abs/2303.02698v3","updated":"2023-08-29T20:57:07Z","published":"2023-03-05T15:27:24Z","title":"Robust affine point matching via quadratic assignment on Grassmannians","summary":" Robust Affine matching with Grassmannians (RAG) is a new algorithm to perform\naffine registration of point clouds. The algorithm is based on minimizing the\nFrobenius distance between two elements of the Grassmannian. For this purpose,\nan indefinite relaxation of the Quadratic Assignment Problem (QAP) is used, and\nseveral approaches to affine feature matching are studied and compared.\nExperiments demonstrate that RAG is more robust to noise and point discrepancy\nthan previous methods.\n","authors":["Alexander Kolpakov","Michael Werman"],"pdf_url":"https://arxiv.org/pdf/2303.02698v3.pdf","comment":"8 pages, 23 figures; GitHub repository at\n (https://github.com/sashakolpakov/rag)"},{"id":"http://arxiv.org/abs/2305.18221v3","updated":"2023-08-29T20:52:57Z","published":"2023-05-29T17:01:54Z","title":"GazeGNN: A Gaze-Guided Graph Neural Network for Chest X-ray\n Classification","summary":" Eye tracking research is important in computer vision because it can help us\nunderstand how humans interact with the visual world. Specifically for\nhigh-risk applications, such as in medical imaging, eye tracking can help us to\ncomprehend how radiologists and other medical professionals search, analyze,\nand interpret images for diagnostic and clinical purposes. Hence, the\napplication of eye tracking techniques in disease classification has become\nincreasingly popular in recent years. Contemporary works usually transform gaze\ninformation collected by eye tracking devices into visual attention maps (VAMs)\nto supervise the learning process. However, this is a time-consuming\npreprocessing step, which stops us from applying eye tracking to radiologists'\ndaily work. To solve this problem, we propose a novel gaze-guided graph neural\nnetwork (GNN), GazeGNN, to leverage raw eye-gaze data without being converted\ninto VAMs. In GazeGNN, to directly integrate eye gaze into image\nclassification, we create a unified representation graph that models both\nimages and gaze pattern information. With this benefit, we develop a real-time,\nreal-world, end-to-end disease classification algorithm for the first time in\nthe literature. This achievement demonstrates the practicality and feasibility\nof integrating real-time eye tracking techniques into the daily work of\nradiologists. To our best knowledge, GazeGNN is the first work that adopts GNN\nto integrate image and eye-gaze data. Our experiments on the public chest X-ray\ndataset show that our proposed method exhibits the best classification\nperformance compared to existing methods. The code is available at\nhttps://github.com/ukaukaaaa/GazeGNN.\n","authors":["Bin Wang","Hongyi Pan","Armstrong Aboah","Zheyuan Zhang","Elif Keles","Drew Torigian","Baris Turkbey","Elizabeth Krupinski","Jayaram Udupa","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2305.18221v3.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2308.15624v1","updated":"2023-08-29T20:45:41Z","published":"2023-08-29T20:45:41Z","title":"Detection of Mild Cognitive Impairment Using Facial Features in Video\n Conversations","summary":" Early detection of Mild Cognitive Impairment (MCI) leads to early\ninterventions to slow the progression from MCI into dementia. Deep Learning\n(DL) algorithms could help achieve early non-invasive, low-cost detection of\nMCI. This paper presents the detection of MCI in older adults using DL models\nbased only on facial features extracted from video-recorded conversations at\nhome. We used the data collected from the I-CONECT behavioral intervention\nstudy (NCT02871921), where several sessions of semi-structured interviews\nbetween socially isolated older individuals and interviewers were video\nrecorded. We develop a framework that extracts spatial holistic facial features\nusing a convolutional autoencoder and temporal information using transformers.\nOur proposed DL model was able to detect the I-CONECT study participants'\ncognitive conditions (MCI vs. those with normal cognition (NC)) using facial\nfeatures. The segments and sequence information of the facial features improved\nthe prediction performance compared with the non-temporal features. The\ndetection accuracy using this combined method reached 88% whereas 84% is the\naccuracy without applying the segments and sequences information of the facial\nfeatures within a video on a certain theme.\n","authors":["Muath Alsuhaibani","Hiroko H. Dodge","Mohammad H. Mahoor"],"pdf_url":"https://arxiv.org/pdf/2308.15624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15618v1","updated":"2023-08-29T20:25:49Z","published":"2023-08-29T20:25:49Z","title":"RACR-MIL: Weakly Supervised Skin Cancer Grading using Rank-Aware\n Contextual Reasoning on Whole Slide Images","summary":" Cutaneous squamous cell cancer (cSCC) is the second most common skin cancer\nin the US. It is diagnosed by manual multi-class tumor grading using a tissue\nwhole slide image (WSI), which is subjective and suffers from inter-pathologist\nvariability. We propose an automated weakly-supervised grading approach for\ncSCC WSIs that is trained using WSI-level grade and does not require\nfine-grained tumor annotations. The proposed model, RACR-MIL, transforms each\nWSI into a bag of tiled patches and leverages attention-based multiple-instance\nlearning to assign a WSI-level grade. We propose three key innovations to\naddress general as well as cSCC-specific challenges in tumor grading. First, we\nleverage spatial and semantic proximity to define a WSI graph that encodes both\nlocal and non-local dependencies between tumor regions and leverage graph\nattention convolution to derive contextual patch features. Second, we introduce\na novel ordinal ranking constraint on the patch attention network to ensure\nthat higher-grade tumor regions are assigned higher attention. Third, we use\ntumor depth as an auxiliary task to improve grade classification in a multitask\nlearning framework. RACR-MIL achieves 2-9% improvement in grade classification\nover existing weakly-supervised approaches on a dataset of 718 cSCC tissue\nimages and localizes the tumor better. The model achieves 5-20% higher accuracy\nin difficult-to-classify high-risk grade classes and is robust to class\nimbalance.\n","authors":["Anirudh Choudhary","Angelina Hwang","Jacob Kechter","Krishnakant Saboo","Blake Bordeaux","Puneet Bhullar","Nneka Comfere","David DiCaudo","Steven Nelson","Emma Johnson","Leah Swanson","Dennis Murphree","Aaron Mangold","Ravishankar K. Iyer"],"pdf_url":"https://arxiv.org/pdf/2308.15618v1.pdf","comment":"7 pages main text, 2 page references, 3 page appendix; submitted to\n AAAI"},{"id":"http://arxiv.org/abs/2308.15575v1","updated":"2023-08-29T19:04:42Z","published":"2023-08-29T19:04:42Z","title":"Prototype Fission: Closing Set for Robust Open-set Semi-supervised\n Learning","summary":" Semi-supervised Learning (SSL) has been proven vulnerable to\nout-of-distribution (OOD) samples in realistic large-scale unsupervised\ndatasets due to over-confident pseudo-labeling OODs as in-distribution (ID). A\nkey underlying problem is class-wise latent space spreading from closed seen\nspace to open unseen space, and the bias is further magnified in SSL's\nself-training loops. To close the ID distribution set so that OODs are better\nrejected for safe SSL, we propose Prototype Fission(PF) to divide class-wise\nlatent spaces into compact sub-spaces by automatic fine-grained latent space\nmining, driven by coarse-grained labels only. Specifically, we form multiple\nunique learnable sub-class prototypes for each class, optimized towards both\ndiversity and consistency. The Diversity Modeling term encourages samples to be\nclustered by one of the multiple sub-class prototypes, while the Consistency\nModeling term clusters all samples of the same class to a global prototype.\nInstead of \"opening set\", i.e., modeling OOD distribution, Prototype Fission\n\"closes set\" and makes it hard for OOD samples to fit in sub-class latent\nspace. Therefore, PF is compatible with existing methods for further\nperformance gains. Extensive experiments validate the effectiveness of our\nmethod in open-set SSL settings in terms of successfully forming sub-classes,\ndiscriminating OODs from IDs and improving overall accuracy. Codes will be\nreleased.\n","authors":["Xuwei Tan","Yi-Jie Huang","Yaqian Li"],"pdf_url":"https://arxiv.org/pdf/2308.15575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12542v2","updated":"2023-08-29T18:40:19Z","published":"2022-11-22T19:18:30Z","title":"CASSPR: Cross Attention Single Scan Place Recognition","summary":" Place recognition based on point clouds (LiDAR) is an important component for\nautonomous robots or self-driving vehicles. Current SOTA performance is\nachieved on accumulated LiDAR submaps using either point-based or voxel-based\nstructures. While voxel-based approaches nicely integrate spatial context\nacross multiple scales, they do not exhibit the local precision of point-based\nmethods. As a result, existing methods struggle with fine-grained matching of\nsubtle geometric features in sparse single-shot Li- DAR scans. To overcome\nthese limitations, we propose CASSPR as a method to fuse point-based and\nvoxel-based approaches using cross attention transformers. CASSPR leverages a\nsparse voxel branch for extracting and aggregating information at lower\nresolution and a point-wise branch for obtaining fine-grained local\ninformation. CASSPR uses queries from one branch to try to match structures in\nthe other branch, ensuring that both extract self-contained descriptors of the\npoint cloud (rather than one branch dominating), but using both to inform the\noutput global descriptor of the point cloud. Extensive experiments show that\nCASSPR surpasses the state-of-the-art by a large margin on several datasets\n(Oxford RobotCar, TUM, USyd). For instance, it achieves AR@1 of 85.6% on the\nTUM dataset, surpassing the strongest prior model by ~15%. Our code is publicly\navailable.\n","authors":["Yan Xia","Mariia Gladkova","Rui Wang","Qianyun Li","Uwe Stilla","João F. Henriques","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2211.12542v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.15564v1","updated":"2023-08-29T18:36:21Z","published":"2023-08-29T18:36:21Z","title":"Learning Sequential Information in Task-based fMRI for Synthetic Data\n Augmentation","summary":" Insufficiency of training data is a persistent issue in medical image\nanalysis, especially for task-based functional magnetic resonance images (fMRI)\nwith spatio-temporal imaging data acquired using specific cognitive tasks. In\nthis paper, we propose an approach for generating synthetic fMRI sequences that\ncan then be used to create augmented training datasets in downstream learning\ntasks. To synthesize high-resolution task-specific fMRI, we adapt the\n$\\alpha$-GAN structure, leveraging advantages of both GAN and variational\nautoencoder models, and propose different alternatives in aggregating temporal\ninformation. The synthetic images are evaluated from multiple perspectives\nincluding visualizations and an autism spectrum disorder (ASD) classification\ntask. The results show that the synthetic task-based fMRI can provide effective\ndata augmentation in learning the ASD classification task.\n","authors":["Jiyao Wang","Nicha C. Dvornek","Lawrence H. Staib","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2308.15564v1.pdf","comment":"Accepted by Machine Learning in Clinical Neuroimaging 2023 (MICCAI\n workshop), preprint version"},{"id":"http://arxiv.org/abs/2308.15557v1","updated":"2023-08-29T18:24:28Z","published":"2023-08-29T18:24:28Z","title":"A Pseudo-Boolean Polynomials Approach for Image Edge Detection","summary":" We introduce a novel approach for image edge detection based on\npseudo-Boolean polynomials for image patches. We show that patches covering\nedge regions in the image result in pseudo-Boolean polynomials with higher\ndegrees compared to patches that cover blob regions. The proposed approach is\nbased on reduction of polynomial degree and equivalence properties of\npenalty-based pseudo-Boolean polynomials.\n","authors":["Tendai Mapungwana Chikake","Boris Goldengorin"],"pdf_url":"https://arxiv.org/pdf/2308.15557v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2110.03105v3","updated":"2023-08-29T18:15:10Z","published":"2021-10-06T23:37:21Z","title":"MetaCOG: Learning a Metacognition to Recover What Objects Are Actually\n There","summary":" Humans not only form representations about the world based on what we see,\nbut also learn meta-cognitive representations about how our own vision works.\nThis enables us to recognize when our vision is unreliable (e.g., when we\nrealize that we are experiencing a visual illusion) and enables us to question\nwhat we see. Inspired by this human capacity, we present MetaCOG: a model that\nincreases the robustness of object detectors by learning representations of\ntheir reliability, and does so without feedback. Specifically, MetaCOG is a\nhierarchical probabilistic model that expresses a joint distribution over the\nobjects in a 3D scene and the outputs produced by a detector. When paired with\nan off-the-shelf object detector, MetaCOG takes detections as input and infers\nthe detector's tendencies to miss objects of certain categories and to\nhallucinate objects that are not actually present, all without access to\nground-truth object labels. When paired with three modern neural object\ndetectors, MetaCOG learns useful and accurate meta-cognitive representations,\nresulting in improved performance on the detection task. Additionally, we show\nthat MetaCOG is robust to varying levels of error in the detections. Our\nresults are a proof-of-concept for a novel approach to the problem of\ncorrecting a faulty vision system's errors. The model code, datasets, results,\nand demos are available:\nhttps://osf.io/8b9qt/?view_only=8c1b1c412c6b4e1697e3c7859be2fce6\n","authors":["Marlene Berke","Zhangir Azerbayev","Mario Belledonne","Zenna Tavares","Julian Jara-Ettinger"],"pdf_url":"https://arxiv.org/pdf/2110.03105v3.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.15547v1","updated":"2023-08-29T18:11:32Z","published":"2023-08-29T18:11:32Z","title":"Efficient Ray Sampling for Radiance Fields Reconstruction","summary":" Accelerating neural radiance fields training is of substantial practical\nvalue, as the ray sampling strategy profoundly impacts network convergence.\nMore efficient ray sampling can thus directly enhance existing NeRF models'\ntraining efficiency. We therefore propose a novel ray sampling approach for\nneural radiance fields that improves training efficiency while retaining\nphotorealistic rendering results. First, we analyze the relationship between\nthe pixel loss distribution of sampled rays and rendering quality. This reveals\nredundancy in the original NeRF's uniform ray sampling. Guided by this finding,\nwe develop a sampling method leveraging pixel regions and depth boundaries. Our\nmain idea is to sample fewer rays in training views, yet with each ray more\ninformative for scene fitting. Sampling probability increases in pixel areas\nexhibiting significant color and depth variation, greatly reducing wasteful\nrays from other regions without sacrificing precision. Through this method, not\nonly can the convergence of the network be accelerated, but the spatial\ngeometry of a scene can also be perceived more accurately. Rendering outputs\nare enhanced, especially for texture-complex regions. Experiments demonstrate\nthat our method significantly outperforms state-of-the-art techniques on public\nbenchmark datasets.\n","authors":["Shilei Sun","Ming Liu","Zhongyi Fan","Yuxue Liu","Chengwei Lv","Liquan Dong","Lingqin Kong"],"pdf_url":"https://arxiv.org/pdf/2308.15547v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2308.15536v1","updated":"2023-08-29T18:00:22Z","published":"2023-08-29T18:00:22Z","title":"DebSDF: Delving into the Details and Bias of Neural Indoor Scene\n Reconstruction","summary":" In recent years, the neural implicit surface has emerged as a powerful\nrepresentation for multi-view surface reconstruction due to its simplicity and\nstate-of-the-art performance. However, reconstructing smooth and detailed\nsurfaces in indoor scenes from multi-view images presents unique challenges.\nIndoor scenes typically contain large texture-less regions, making the\nphotometric loss unreliable for optimizing the implicit surface. Previous work\nutilizes monocular geometry priors to improve the reconstruction in indoor\nscenes. However, monocular priors often contain substantial errors in thin\nstructure regions due to domain gaps and the inherent inconsistencies when\nderived independently from different views. This paper presents \\textbf{DebSDF}\nto address these challenges, focusing on the utilization of uncertainty in\nmonocular priors and the bias in SDF-based volume rendering. We propose an\nuncertainty modeling technique that associates larger uncertainties with larger\nerrors in the monocular priors. High-uncertainty priors are then excluded from\noptimization to prevent bias. This uncertainty measure also informs an\nimportance-guided ray sampling and adaptive smoothness regularization,\nenhancing the learning of fine structures. We further introduce a bias-aware\nsigned distance function to density transformation that takes into account the\ncurvature and the angle between the view direction and the SDF normals to\nreconstruct fine details better. Our approach has been validated through\nextensive experiments on several challenging datasets, demonstrating improved\nqualitative and quantitative results in reconstructing thin structures in\nindoor scenes, thereby outperforming previous work.\n","authors":["Yuting Xiao","Jingwei Xu","Zehao Yu","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2308.15536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15517v1","updated":"2023-08-29T16:58:03Z","published":"2023-08-29T16:58:03Z","title":"Document AI: A Comparative Study of Transformer-Based, Graph-Based\n Models, and Convolutional Neural Networks For Document Layout Analysis","summary":" Document AI aims to automatically analyze documents by leveraging natural\nlanguage processing and computer vision techniques. One of the major tasks of\nDocument AI is document layout analysis, which structures document pages by\ninterpreting the content and spatial relationships of layout, image, and text.\nThis task can be image-centric, wherein the aim is to identify and label\nvarious regions such as authors and paragraphs, or text-centric, where the\nfocus is on classifying individual words in a document. Although there are\nincreasingly sophisticated methods for improving layout analysis, doubts remain\nabout the extent to which their findings can be generalized to a broader\ncontext. Specifically, prior work developed systems based on very different\narchitectures, such as transformer-based, graph-based, and CNNs. However, no\nwork has mentioned the effectiveness of these models in a comparative analysis.\nMoreover, while language-independent Document AI models capable of knowledge\ntransfer have been developed, it remains to be investigated to what degree they\ncan effectively transfer knowledge. In this study, we aim to fill these gaps by\nconducting a comparative evaluation of state-of-the-art models in document\nlayout analysis and investigating the potential of cross-lingual layout\nanalysis by utilizing machine translation techniques.\n","authors":["Sotirios Kastanas","Shaomu Tan","Yi He"],"pdf_url":"https://arxiv.org/pdf/2308.15517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15512v1","updated":"2023-08-29T15:39:15Z","published":"2023-08-29T15:39:15Z","title":"Shatter and Gather: Learning Referring Image Segmentation with Text\n Supervision","summary":" Referring image segmentation, the task of segmenting any arbitrary entities\ndescribed in free-form texts, opens up a variety of vision applications.\nHowever, manual labeling of training data for this task is prohibitively\ncostly, leading to lack of labeled data for training. We address this issue by\na weakly supervised learning approach using text descriptions of training\nimages as the only source of supervision. To this end, we first present a new\nmodel that discovers semantic entities in input image and then combines such\nentities relevant to text query to predict the mask of the referent. We also\npresent a new loss function that allows the model to be trained without any\nfurther supervision. Our method was evaluated on four public benchmarks for\nreferring image segmentation, where it clearly outperformed the existing method\nfor the same task and recent open-vocabulary segmentation models on all the\nbenchmarks.\n","authors":["Dongwon Kim","Namyup Kim","Cuiling Lan","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.15512v1.pdf","comment":"Accepted to ICCV 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2306.08018v2","updated":"2023-08-29T17:13:05Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a meticulously curated, comprehensive\ninstruction dataset expressly designed for the biomolecular realm.\nMol-Instructions is composed of three pivotal components: molecule-oriented\ninstructions, protein-oriented instructions, and biomolecular text\ninstructions, each curated to enhance the understanding and prediction\ncapabilities of LLMs concerning biomolecular features and behaviors. Through\nextensive instruction tuning experiments on the representative LLM, we\nunderscore the potency of Mol-Instructions to enhance the adaptability and\ncognitive acuity of large models within the complex sphere of biomolecular\nstudies, thereby promoting advancements in the biomolecular research community.\nMol-Instructions is made publicly accessible for future research endeavors and\nwill be subjected to continual updates for enhanced applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v2.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions. Add\n quantitative evaluations"},{"id":"http://arxiv.org/abs/2307.07740v2","updated":"2023-08-29T16:55:11Z","published":"2023-07-15T08:08:38Z","title":"Political Sentiment Analysis of Persian Tweets Using CNN-LSTM Model","summary":" Sentiment analysis is the process of identifying and categorizing people's\nemotions or opinions regarding various topics. The analysis of Twitter\nsentiment has become an increasingly popular topic in recent years. In this\npaper, we present several machine learning and a deep learning model to\nanalysis sentiment of Persian political tweets. Our analysis was conducted\nusing Bag of Words and ParsBERT for word representation. We applied Gaussian\nNaive Bayes, Gradient Boosting, Logistic Regression, Decision Trees, Random\nForests, as well as a combination of CNN and LSTM to classify the polarities of\ntweets. The results of this study indicate that deep learning with ParsBERT\nembedding performs better than machine learning. The CNN-LSTM model had the\nhighest classification accuracy with 89 percent on the first dataset and 71\npercent on the second dataset. Due to the complexity of Persian, it was a\ndifficult task to achieve this level of efficiency. The main objective of our\nresearch was to reduce the training time while maintaining the model's\nperformance. As a result, several adjustments were made to the model\narchitecture and parameters. In addition to achieving the objective, the\nperformance was slightly improved as well.\n","authors":["Mohammad Dehghani","Zahra Yazdanparast"],"pdf_url":"https://arxiv.org/pdf/2307.07740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15405v1","updated":"2023-08-29T16:07:18Z","published":"2023-08-29T16:07:18Z","title":"Robust Long-Tailed Learning via Label-Aware Bounded CVaR","summary":" Data in the real-world classification problems are always imbalanced or\nlong-tailed, wherein the majority classes have the most of the samples that\ndominate the model training. In such setting, the naive model tends to have\npoor performance on the minority classes. Previously, a variety of loss\nmodifications have been proposed to address the long-tailed leaning problem,\nwhile these methods either treat the samples in the same class\nindiscriminatingly or lack a theoretical guarantee. In this paper, we propose\ntwo novel approaches based on CVaR (Conditional Value at Risk) to improve the\nperformance of long-tailed learning with a solid theoretical ground.\nSpecifically, we firstly introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss\nto overcome the pessimistic result of the original CVaR, and further design the\noptimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we\nadditionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to\nstabilize the optimization process, where we also offer the theoretical\nsupport. Extensive experiments on real-world datasets with long-tailed label\ndistributions verify the superiority of our proposed methods.\n","authors":["Hong Zhu","Runpeng Yu","Xing Tang","Yifei Wang","Yuan Fang","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15265v1","updated":"2023-08-29T12:50:21Z","published":"2023-08-29T12:50:21Z","title":"A Multi-Perspective Learning to Rank Approach to Support Children's\n Information Seeking in the Classroom","summary":" We introduce a novel re-ranking model that aims to augment the functionality\nof standard search engines to support classroom search activities for children\n(ages 6 to 11). This model extends the known listwise learning-to-rank\nframework by balancing risk and reward. Doing so enables the model to\nprioritize Web resources of high educational alignment, appropriateness, and\nadequate readability by analyzing the URLs, snippets, and page titles of Web\nresources retrieved by a given mainstream search engine. Experimental results,\nincluding an ablation study and comparisons with existing baselines, showcase\nthe correctness of the proposed model. The outcomes of this work demonstrate\nthe value of considering multiple perspectives inherent to the classroom\nsetting, e.g., educational alignment, readability, and objectionability, when\napplied to the design of algorithms that can better support children's\ninformation discovery.\n","authors":["Garrett Allen","Katherine Landau Wright","Jerry Alan Fails","Casey Kennington","Maria Soledad Pera"],"pdf_url":"https://arxiv.org/pdf/2308.15265v1.pdf","comment":"Extended version of the manuscript to appear in proceedings of the\n 22nd IEEE/WIC International Conference on Web Intelligence and Intelligent\n Agent Technology"},{"id":"http://arxiv.org/abs/2308.15244v1","updated":"2023-08-29T12:11:16Z","published":"2023-08-29T12:11:16Z","title":"Knowledge-based Multiple Adaptive Spaces Fusion for Recommendation","summary":" Since Knowledge Graphs (KGs) contain rich semantic information, recently\nthere has been an influx of KG-enhanced recommendation methods. Most of\nexisting methods are entirely designed based on euclidean space without\nconsidering curvature. However, recent studies have revealed that a tremendous\ngraph-structured data exhibits highly non-euclidean properties. Motivated by\nthese observations, in this work, we propose a knowledge-based multiple\nadaptive spaces fusion method for recommendation, namely MCKG. Unlike existing\nmethods that solely adopt a specific manifold, we introduce the unified space\nthat is compatible with hyperbolic, euclidean and spherical spaces.\nFurthermore, we fuse the multiple unified spaces in an attention manner to\nobtain the high-quality embeddings for better knowledge propagation. In\naddition, we propose a geometry-aware optimization strategy which enables the\npull and push processes benefited from both hyperbolic and spherical spaces.\nSpecifically, in hyperbolic space, we set smaller margins in the area near to\nthe origin, which is conducive to distinguishing between highly similar\npositive items and negative ones. At the same time, we set larger margins in\nthe area far from the origin to ensure the model has sufficient error\ntolerance. The similar manner also applies to spherical spaces. Extensive\nexperiments on three real-world datasets demonstrate that the MCKG has a\nsignificant improvement over state-of-the-art recommendation methods. Further\nablation experiments verify the importance of multi-space fusion and\ngeometry-aware optimization strategy, justifying the rationality and\neffectiveness of MCKG.\n","authors":["Meng Yuan","Fuzhen Zhuang","Zhao Zhang","Deqing Wang","Jin Dong"],"pdf_url":"https://arxiv.org/pdf/2308.15244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15232v1","updated":"2023-08-29T11:40:24Z","published":"2023-08-29T11:40:24Z","title":"Classification-Aware Neural Topic Model Combined With Interpretable\n Analysis -- For Conflict Classification","summary":" A large number of conflict events are affecting the world all the time. In\norder to analyse such conflict events effectively, this paper presents a\nClassification-Aware Neural Topic Model (CANTM-IA) for Conflict Information\nClassification and Topic Discovery. The model provides a reliable\ninterpretation of classification results and discovered topics by introducing\ninterpretability analysis. At the same time, interpretation is introduced into\nthe model architecture to improve the classification performance of the model\nand to allow interpretation to focus further on the details of the data.\nFinally, the model architecture is optimised to reduce the complexity of the\nmodel.\n","authors":["Tianyu Liang","Yida Mu","Soonho Kim","Darline Larissa Kengne Kuate","Julie Lang","Rob Vos","Xingyi Song"],"pdf_url":"https://arxiv.org/pdf/2308.15232v1.pdf","comment":"Accepted by RANLP 2023"},{"id":"http://arxiv.org/abs/2308.15230v1","updated":"2023-08-29T11:37:33Z","published":"2023-08-29T11:37:33Z","title":"Providing Previously Unseen Users Fair Recommendations Using Variational\n Autoencoders","summary":" An emerging definition of fairness in machine learning requires that models\nare oblivious to demographic user information, e.g., a user's gender or age\nshould not influence the model. Personalized recommender systems are\nparticularly prone to violating this definition through their explicit user\nfocus and user modelling. Explicit user modelling is also an aspect that makes\nmany recommender systems incapable of providing hitherto unseen users with\nrecommendations. We propose novel approaches for mitigating discrimination in\nVariational Autoencoder-based recommender systems by limiting the encoding of\ndemographic information. The approaches are capable of, and evaluated on,\nproviding users that are not represented in the training data with fair\nrecommendations.\n","authors":["Bjørnar Vassøy","Helge Langseth","Benjamin Kille"],"pdf_url":"https://arxiv.org/pdf/2308.15230v1.pdf","comment":"Appearing in RecSys 2023 proceedings"},{"id":"http://arxiv.org/abs/2308.15136v1","updated":"2023-08-29T09:10:53Z","published":"2023-08-29T09:10:53Z","title":"CAGRA: Highly Parallel Graph Construction and Approximate Nearest\n Neighbor Search for GPUs","summary":" Approximate Nearest Neighbor Search (ANNS) plays a critical role in various\ndisciplines spanning data mining and artificial intelligence, from information\nretrieval and computer vision to natural language processing and recommender\nsystems. Data volumes have soared in recent years and the computational cost of\nan exhaustive exact nearest neighbor search is often prohibitive, necessitating\nthe adoption of approximate techniques. The balanced performance and recall of\ngraph-based approaches have more recently garnered significant attention in\nANNS algorithms, however, only a few studies have explored harnessing the power\nof GPUs and multi-core processors despite the widespread use of massively\nparallel and general-purpose computing. To bridge this gap, we introduce a\nnovel parallel computing hardware-based proximity graph and search algorithm.\nBy leveraging the high-performance capabilities of modern hardware, our\napproach achieves remarkable efficiency gains. In particular, our method\nsurpasses existing CPU and GPU-based methods in constructing the proximity\ngraph, demonstrating higher throughput in both large- and small-batch searches\nwhile maintaining compatible accuracy. In graph construction time, our method,\nCAGRA, is 2.2~27x faster than HNSW, which is one of the CPU SOTA\nimplementations. In large-batch query throughput in the 90% to 95% recall\nrange, our method is 33~77x faster than HNSW, and is 3.8~8.8x faster than the\nSOTA implementations for GPU. For a single query, our method is 3.4~53x faster\nthan HNSW at 95% recall.\n","authors":["Hiroyuki Ootomo","Akira Naruse","Corey Nolet","Ray Wang","Tamas Feher","Yong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15090v1","updated":"2023-08-29T07:53:17Z","published":"2023-08-29T07:53:17Z","title":"Killing two birds with one stone: Can an audio captioning system also be\n used for audio-text retrieval?","summary":" Automated Audio Captioning (AAC) aims to develop systems capable of\ndescribing an audio recording using a textual sentence. In contrast, Audio-Text\nRetrieval (ATR) systems seek to find the best matching audio recording(s) for a\ngiven textual query (Text-to-Audio) or vice versa (Audio-to-Text). These tasks\nrequire different types of systems: AAC employs a sequence-to-sequence model,\nwhile ATR utilizes a ranking model that compares audio and text representations\nwithin a shared projection subspace. However, this work investigates the\nrelationship between AAC and ATR by exploring the ATR capabilities of an\nunmodified AAC system, without fine-tuning for the new task. Our AAC system\nconsists of an audio encoder (ConvNeXt-Tiny) trained on AudioSet for audio\ntagging, and a transformer decoder responsible for generating sentences. For\nAAC, it achieves a high SPIDEr-FL score of 0.298 on Clotho and 0.472 on\nAudioCaps on average. For ATR, we propose using the standard Cross-Entropy loss\nvalues obtained for any audio/caption pair. Experimental results on the Clotho\nand AudioCaps datasets demonstrate decent recall values using this simple\napproach. For instance, we obtained a Text-to-Audio R@1 value of 0.382 for\nAu-dioCaps, which is above the current state-of-the-art method without external\ndata. Interestingly, we observe that normalizing the loss values was necessary\nfor Audio-to-Text retrieval.\n","authors":["Etienne Labbé","Thomas Pellegrini","Julien Pinquier"],"pdf_url":"https://arxiv.org/pdf/2308.15090v1.pdf","comment":"cam ready version (14/08/23)"},{"id":"http://arxiv.org/abs/2308.15033v1","updated":"2023-08-29T05:35:49Z","published":"2023-08-29T05:35:49Z","title":"STEC: See-Through Transformer-based Encoder for CTR Prediction","summary":" Click-Through Rate (CTR) prediction holds a pivotal place in online\nadvertising and recommender systems since CTR prediction performance directly\ninfluences the overall satisfaction of the users and the revenue generated by\ncompanies. Even so, CTR prediction is still an active area of research since it\ninvolves accurately modelling the preferences of users based on sparse and\nhigh-dimensional features where the higher-order interactions of multiple\nfeatures can lead to different outcomes. Most CTR prediction models have relied\non a single fusion and interaction learning strategy. The few CTR prediction\nmodels that have utilized multiple interaction modelling strategies have\ntreated each interaction to be self-contained. In this paper, we propose a\nnovel model named STEC that reaps the benefits of multiple interaction learning\napproaches in a single unified architecture. Additionally, our model introduces\nresidual connections from different orders of interactions which boosts the\nperformance by allowing lower level interactions to directly affect the\npredictions. Through extensive experiments on four real-world datasets, we\ndemonstrate that STEC outperforms existing state-of-the-art approaches for CTR\nprediction thanks to its greater expressive capabilities.\n","authors":["Serdarcan Dilbaz","Hasan Saribas"],"pdf_url":"https://arxiv.org/pdf/2308.15033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15027v1","updated":"2023-08-29T05:18:47Z","published":"2023-08-29T05:18:47Z","title":"Improving Neural Ranking Models with Traditional IR Methods","summary":" Neural ranking methods based on large transformer models have recently gained\nsignificant attention in the information retrieval community, and have been\nadopted by major commercial solutions. Nevertheless, they are computationally\nexpensive to create, and require a great deal of labeled data for specialized\ncorpora. In this paper, we explore a low resource alternative which is a\nbag-of-embedding model for document retrieval and find that it is competitive\nwith large transformer models fine tuned on information retrieval tasks. Our\nresults show that a simple combination of TF-IDF, a traditional keyword\nmatching method, with a shallow embedding model provides a low cost path to\ncompete well with the performance of complex neural ranking models on 3\ndatasets. Furthermore, adding TF-IDF measures improves the performance of\nlarge-scale fine tuned models on these tasks.\n","authors":["Anik Saha","Oktie Hassanzadeh","Alex Gittens","Jian Ni","Kavitha Srinivas","Bulent Yener"],"pdf_url":"https://arxiv.org/pdf/2308.15027v1.pdf","comment":"Short paper, 4 pages"},{"id":"http://arxiv.org/abs/2308.15014v1","updated":"2023-08-29T04:34:32Z","published":"2023-08-29T04:34:32Z","title":"CAPS: A Practical Partition Index for Filtered Similarity Search","summary":" With the surging popularity of approximate near-neighbor search (ANNS),\ndriven by advances in neural representation learning, the ability to serve\nqueries accompanied by a set of constraints has become an area of intense\ninterest. While the community has recently proposed several algorithms for\nconstrained ANNS, almost all of these methods focus on integration with\ngraph-based indexes, the predominant class of algorithms achieving\nstate-of-the-art performance in latency-recall tradeoffs. In this work, we take\na different approach and focus on developing a constrained ANNS algorithm via\nspace partitioning as opposed to graphs. To that end, we introduce Constrained\nApproximate Partitioned Search (CAPS), an index for ANNS with filters via space\npartitions that not only retains the benefits of a partition-based algorithm\nbut also outperforms state-of-the-art graph-based constrained search techniques\nin recall-latency tradeoffs, with only 10% of the index size.\n","authors":["Gaurav Gupta","Jonah Yi","Benjamin Coleman","Chen Luo","Vihan Lakshman","Anshumali Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2308.15014v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2308.14968v1","updated":"2023-08-29T01:46:06Z","published":"2023-08-29T01:46:06Z","title":"Continual Learning for Generative Retrieval over Dynamic Corpora","summary":" Generative retrieval (GR) directly predicts the identifiers of relevant\ndocuments (i.e., docids) based on a parametric model. It has achieved solid\nperformance on many ad-hoc retrieval tasks. So far, these tasks have assumed a\nstatic document collection. In many practical scenarios, however, document\ncollections are dynamic, where new documents are continuously added to the\ncorpus. The ability to incrementally index new documents while preserving the\nability to answer queries with both previously and newly indexed relevant\ndocuments is vital to applying GR models. In this paper, we address this\npractical continual learning problem for GR. We put forward a novel\nContinual-LEarner for generatiVE Retrieval (CLEVER) model and make two major\ncontributions to continual learning for GR: (i) To encode new documents into\ndocids with low computational cost, we present Incremental Product\nQuantization, which updates a partial quantization codebook according to two\nadaptive thresholds; and (ii) To memorize new documents for querying without\nforgetting previous knowledge, we propose a memory-augmented learning\nmechanism, to form meaningful connections between old and new documents.\nEmpirical results demonstrate the effectiveness and efficiency of the proposed\nmodel.\n","authors":["Jiangui Chen","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Wei Chen","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.14968v1.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2308.14963v1","updated":"2023-08-29T01:30:23Z","published":"2023-08-29T01:30:23Z","title":"Vector Search with OpenAI Embeddings: Lucene Is All You Need","summary":" We provide a reproducible, end-to-end demonstration of vector search with\nOpenAI embeddings using Lucene on the popular MS MARCO passage ranking test\ncollection. The main goal of our work is to challenge the prevailing narrative\nthat a dedicated vector store is necessary to take advantage of recent advances\nin deep neural networks as applied to search. Quite the contrary, we show that\nhierarchical navigable small-world network (HNSW) indexes in Lucene are\nadequate to provide vector search capabilities in a standard bi-encoder\narchitecture. This suggests that, from a simple cost-benefit analysis, there\ndoes not appear to be a compelling reason to introduce a dedicated vector store\ninto a modern \"AI stack\" for search, since such applications have already\nreceived substantial investments in existing, widely deployed infrastructure.\n","authors":["Jimmy Lin","Ronak Pradeep","Tommaso Teofili","Jasper Xian"],"pdf_url":"https://arxiv.org/pdf/2308.14963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15651v1","updated":"2023-08-29T22:03:17Z","published":"2023-08-29T22:03:17Z","title":"Ensuring User-side Fairness in Dynamic Recommender Systems","summary":" User-side group fairness is crucial for modern recommender systems, as it\naims to alleviate performance disparity between groups of users defined by\nsensitive attributes such as gender, race, or age. We find that the disparity\ntends to persist or even increase over time. This calls for effective ways to\naddress user-side fairness in a dynamic environment, which has been\ninfrequently explored in the literature. However, fairness-constrained\nre-ranking, a typical method to ensure user-side fairness (i.e., reducing\nperformance disparity), faces two fundamental challenges in the dynamic\nsetting: (1) non-differentiability of the ranking-based fairness constraint,\nwhich hinders the end-to-end training paradigm, and (2) time-inefficiency,\nwhich impedes quick adaptation to changes in user preferences. In this paper,\nwe propose FAir Dynamic rEcommender (FADE), an end-to-end framework with\nfine-tuning strategy to dynamically alleviate performance disparity. To tackle\nthe above challenges, FADE uses a novel fairness loss designed to be\ndifferentiable and lightweight to fine-tune model parameters to ensure both\nuser-side fairness and high-quality recommendations. Via extensive experiments\non the real-world dataset, we empirically demonstrate that FADE effectively and\nefficiently reduces performance disparity, and furthermore, FADE improves\noverall recommendation quality over time compared to not using any new data.\n","authors":["Hyunsik Yoo","Zhichen Zeng","Jian Kang","Zhining Liu","David Zhou","Fei Wang","Eunice Chan","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2308.15651v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.08303v3","updated":"2023-08-29T21:52:58Z","published":"2023-07-17T07:55:47Z","title":"Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language\n Models","summary":" Dense retrieval (DR) converts queries and documents into dense embeddings and\nmeasures the similarity between queries and documents in vector space. One of\nthe challenges in DR is the lack of domain-specific training data. While DR\nmodels can learn from large-scale public datasets like MS MARCO through\ntransfer learning, evidence shows that not all DR models and domains can\nbenefit from transfer learning equally. Recently, some researchers have\nresorted to large language models (LLMs) to improve the zero-shot and few-shot\nDR models. However, the hard prompts or human-written prompts utilized in these\nworks cannot guarantee the good quality of generated weak queries. To tackle\nthis, we propose soft prompt tuning for augmenting DR (SPTAR): For each task,\nwe leverage soft prompt-tuning to optimize a task-specific soft prompt on\nlimited ground truth data and then prompt the LLMs to tag unlabeled documents\nwith weak queries, yielding enough weak document-query pairs to train\ntask-specific dense retrievers. We design a filter to select high-quality\nexample document-query pairs in the prompt to further improve the quality of\nweak tagged queries. To the best of our knowledge, there is no prior work\nutilizing soft prompt tuning to augment DR models. The experiments demonstrate\nthat SPTAR outperforms the unsupervised baselines BM25 and the recently\nproposed LLMs-based augmentation method for DR.\n","authors":["Zhiyuan Peng","Xuyang Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08303v3.pdf","comment":"fix typos"},{"id":"http://arxiv.org/abs/2304.08873v2","updated":"2023-08-29T20:23:41Z","published":"2023-04-18T10:07:09Z","title":"Dual-Granularity Contrastive Learning for Session-based Recommendation","summary":" Session-based recommendation systems(SBRS) are more suitable for the current\ne-commerce and streaming media recommendation scenarios and thus have become a\nhot topic. The data encountered by SBRS is typically highly sparse, which also\nserves as one of the bottlenecks limiting the accuracy of recommendations. So\nContrastive Learning(CL) is applied in SBRS owing to its capability of\nimproving embedding learning under the condition of sparse data. However,\nexisting CL strategies are limited in their ability to enforce finer-grained\n(e.g., factor-level) comparisons and, as a result, are unable to capture subtle\ndifferences between instances. More than that, these strategies usually use\nitem or segment dropout as a means of data augmentation which may result in\nsparser data and thus ineffective self-supervised signals. By addressing the\ntwo aforementioned limitations, we introduce a novel multi-granularity CL\nframework. Specifically, two extra augmented embedding convolution channels\nwith different granularities are constructed and the embeddings learned by them\nare compared with those learned from original view to complete the CL tasks. At\nfactor-level, we employ Disentangled Representation Learning to obtain\nfiner-grained data(e.g. factor-level embeddings), with which we can construct\nfactor-level convolution channels. At item-level, the star graph is deployed as\nthe augmented data and graph convolution on it can ensure the effectiveness of\nself-supervised signals. Compare the learned embeddings of these two views with\nthe learned embeddings of the basic view to achieve CL at two granularities.\nFinally, the more precise item-level and factor-level embeddings obtained are\nreferenced to generate personalized recommendations for the user. The proposed\nmodel is validated through extensive experiments on two benchmark datasets,\nshowcasing superior performance compared to existing methods.\n","authors":["Zihan Wang","Gang Wu","Haotong Wang"],"pdf_url":"https://arxiv.org/pdf/2304.08873v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14935v2","updated":"2023-08-29T19:52:02Z","published":"2022-11-27T21:00:31Z","title":"RecXplainer: Amortized Attribute-based Personalized Explanations for\n Recommender Systems","summary":" Recommender systems influence many of our interactions in the digital world\n-- impacting how we shop for clothes, sorting what we see when browsing YouTube\nor TikTok, and determining which restaurants and hotels we are shown when using\nhospitality platforms. Modern recommender systems are large, opaque models\ntrained on a mixture of proprietary and open-source datasets. Naturally, issues\nof trust arise on both the developer and user side: is the system working\ncorrectly, and why did a user receive (or not receive) a particular\nrecommendation? Providing an explanation alongside a recommendation alleviates\nsome of these concerns. The status quo for auxiliary recommender system\nfeedback is either user-specific explanations (e.g., \"users who bought item B\nalso bought item A\") or item-specific explanations (e.g., \"we are recommending\nitem A because you watched/bought item B\"). However, users bring personalized\ncontext into their search experience, valuing an item as a function of that\nitem's attributes and their own personal preferences. In this work, we propose\nRecXplainer, a novel method for generating fine-grained explanations based on a\nuser's preferences over the attributes of recommended items. We evaluate\nRecXplainer on five real-world and large-scale recommendation datasets using\nfive different kinds of recommender systems to demonstrate the efficacy of\nRecXplainer in capturing users' preferences over item attributes and using them\nto explain recommendations. We also compare RecXplainer to five baselines and\nshow RecXplainer's exceptional performance on ten metrics.\n","authors":["Sahil Verma","Chirag Shah","John P. Dickerson","Anurag Beniwal","Narayanan Sadagopan","Arjun Seshadri"],"pdf_url":"https://arxiv.org/pdf/2211.14935v2.pdf","comment":"Awarded the Best Student Paper at TEA Workshop at NeurIPS 2022"},{"id":"http://arxiv.org/abs/2308.15553v1","updated":"2023-08-29T18:19:36Z","published":"2023-08-29T18:19:36Z","title":"Dimensionality Reduction Using pseudo-Boolean polynomials For Cluster\n Analysis","summary":" We introduce usage of a reduction property of penalty-based formulation of\npseudo-Boolean polynomials as a mechanism for invariant dimensionality\nreduction in cluster analysis processes. In our experiments, we show that\nmultidimensional data, like 4-dimensional Iris Flower dataset can be reduced to\n2-dimensional space while the 30-dimensional Wisconsin Diagnostic Breast Cancer\n(WDBC) dataset can be reduced to 3-dimensional space, and by searching lines or\nplanes that lie between reduced samples we can extract clusters in a linear and\nunbiased manner with competitive accuracies, reproducibility and clear\ninterpretation.\n","authors":["Tendai Mapungwana Chikake","Boris Goldengorin"],"pdf_url":"https://arxiv.org/pdf/2308.15553v1.pdf","comment":"14 pages, 4 figures, submitted to the International Conference Data\n Analysis, Optimization and Their Applications on the Occasion of Boris\n Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region,\n Moscow Institute of Physics and Technology\n https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php"},{"id":"http://arxiv.org/abs/2308.15498v1","updated":"2023-08-29T02:42:52Z","published":"2023-08-29T02:42:52Z","title":"Chunked Lists versus Extensible Arrays for Text Inversion","summary":" In our 2017 work on in-memory list-based text inversion [Hawking and\nBillerbeck. Efficient In-Memory, List-Based Text Inversion. ADCS 2017] we\ncompared memory use and indexing speed of a considerable number of variants of\nchunked linked lists. In the present work we compare the best performing of\nthose variants (FBB - dynamic Fibonacci chunking) with the extensible SQ array\ntechnique (SQA) presented in [Moffat and Mackenzie. Immediate-Access Indexing\nUsing Space-Efficient Extensible Arrays. ADCS 2023].\n","authors":["David Hawking","Bodo Billerbeck"],"pdf_url":"https://arxiv.org/pdf/2308.15498v1.pdf","comment":"2 pages, 2 figures, 1 table"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.15479v1","updated":"2023-08-29T17:58:55Z","published":"2023-08-29T17:58:55Z","title":"3D Adversarial Augmentations for Robust Out-of-Domain Predictions","summary":" Since real-world training datasets cannot properly sample the long tail of\nthe underlying data distribution, corner cases and rare out-of-domain samples\ncan severely hinder the performance of state-of-the-art models. This problem\nbecomes even more severe for dense tasks, such as 3D semantic segmentation,\nwhere points of non-standard objects can be confidently associated to the wrong\nclass. In this work, we focus on improving the generalization to out-of-domain\ndata. We achieve this by augmenting the training set with adversarial examples.\nFirst, we learn a set of vectors that deform the objects in an adversarial\nfashion. To prevent the adversarial examples from being too far from the\nexisting data distribution, we preserve their plausibility through a series of\nconstraints, ensuring sensor-awareness and shapes smoothness. Then, we perform\nadversarial augmentation by applying the learned sample-independent vectors to\nthe available objects when training a model. We conduct extensive experiments\nacross a variety of scenarios on data from KITTI, Waymo, and CrashD for 3D\nobject detection, and on data from SemanticKITTI, Waymo, and nuScenes for 3D\nsemantic segmentation. Despite training on a standard single dataset, our\napproach substantially improves the robustness and generalization of both 3D\nobject detection and 3D semantic segmentation methods to out-of-domain data.\n","authors":["Alexander Lehner","Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2308.15479v1.pdf","comment":"37 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.15478v1","updated":"2023-08-29T17:57:20Z","published":"2023-08-29T17:57:20Z","title":"An Adaptive Tangent Feature Perspective of Neural Networks","summary":" In order to better understand feature learning in neural networks, we propose\na framework for understanding linear models in tangent feature space where the\nfeatures are allowed to be transformed during training. We consider linear\ntransformations of features, resulting in a joint optimization over parameters\nand transformations with a bilinear interpolation constraint. We show that this\noptimization problem has an equivalent linearly constrained optimization with\nstructured regularization that encourages approximately low rank solutions.\nSpecializing to neural network structure, we gain insights into how the\nfeatures and thus the kernel function change, providing additional nuance to\nthe phenomenon of kernel alignment when the target function is poorly\nrepresented using tangent features. In addition to verifying our theoretical\nobservations in real neural networks on a simple regression problem, we\nempirically show that an adaptive feature implementation of tangent feature\nclassification has an order of magnitude lower sample complexity than the fixed\ntangent feature model on MNIST and CIFAR-10.\n","authors":["Daniel LeJeune","Sina Alemohammad"],"pdf_url":"https://arxiv.org/pdf/2308.15478v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.14120v2","updated":"2023-08-29T17:52:02Z","published":"2023-08-27T14:28:38Z","title":"Empowering Clinicians and Democratizing Data Science: Large Language\n Models Automate Machine Learning for Clinical Studies","summary":" A knowledge gap persists between Machine Learning (ML) developers (e.g., data\nscientists) and practitioners (e.g., clinicians), hampering the full\nutilization of ML for clinical data analysis. We investigated the potential of\nthe chatGPT Advanced Data Analysis (ADA), an extension of GPT-4, to bridge this\ngap and perform ML analyses efficiently. Real-world clinical datasets and study\ndetails from large trials across various medical specialties were presented to\nchatGPT ADA without specific guidance. ChatGPT ADA autonomously developed\nstate-of-the-art ML models based on the original study's training data to\npredict clinical outcomes such as cancer development, cancer progression,\ndisease complications, or biomarkers such as pathogenic gene sequences.\nStrikingly, these ML models matched or outperformed their published\ncounterparts. We conclude that chatGPT ADA offers a promising avenue to\ndemocratize ML in medicine, making advanced analytics accessible to non-ML\nexperts and promoting broader applications in medical research and practice.\n","authors":["Soroosh Tayebi Arasteh","Tianyu Han","Mahshad Lotfinia","Christiane Kuhl","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.14120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15470v1","updated":"2023-08-29T17:50:27Z","published":"2023-08-29T17:50:27Z","title":"Policy composition in reinforcement learning via multi-objective policy\n optimization","summary":" We enable reinforcement learning agents to learn successful behavior policies\nby utilizing relevant pre-existing teacher policies. The teacher policies are\nintroduced as objectives, in addition to the task objective, in a\nmulti-objective policy optimization setting. Using the Multi-Objective Maximum\na Posteriori Policy Optimization algorithm\n\\citep{abdolmaleki2020distributional}, we show that teacher policies can help\nspeed up learning, particularly in the absence of shaping rewards. In two\ndomains with continuous observation and action spaces, our agents successfully\ncompose teacher policies in sequence and in parallel, and are also able to\nfurther extend the policies of the teachers in order to solve the task.\n Depending on the specified combination of task and teacher(s), teacher(s) may\nnaturally act to limit the final performance of an agent. The extent to which\nagents are required to adhere to teacher policies are determined by\nhyperparameters which determine both the effect of teachers on learning speed\nand the eventual performance of the agent on the task. In the {\\tt humanoid}\ndomain \\citep{deepmindcontrolsuite2018}, we also equip agents with the ability\nto control the selection of teachers. With this ability, agents are able to\nmeaningfully compose from the teacher policies to achieve a superior task\nreward on the {\\tt walk} task than in cases without access to the teacher\npolicies. We show the resemblance of composed task policies with the\ncorresponding teacher policies through videos.\n","authors":["Shruti Mishra","Ankit Anand","Jordan Hoffmann","Nicolas Heess","Martin Riedmiller","Abbas Abdolmaleki","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2308.15470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15466v1","updated":"2023-08-29T17:47:42Z","published":"2023-08-29T17:47:42Z","title":"Input margins can predict generalization too","summary":" Understanding generalization in deep neural networks is an active area of\nresearch. A promising avenue of exploration has been that of margin\nmeasurements: the shortest distance to the decision boundary for a given sample\nor its representation internal to the network. While margins have been shown to\nbe correlated with the generalization ability of a model when measured at its\nhidden representations (hidden margins), no such link between large margins and\ngeneralization has been established for input margins. We show that while input\nmargins are not generally predictive of generalization, they can be if the\nsearch space is appropriately constrained. We develop such a measure based on\ninput margins, which we refer to as `constrained margins'. The predictive power\nof this new measure is demonstrated on the 'Predicting Generalization in Deep\nLearning' (PGDL) dataset and contrasted with hidden representation margins. We\nfind that constrained margins achieve highly competitive scores and outperform\nother margin measurements in general. This provides a novel insight on the\nrelationship between generalization and classification margins, and highlights\nthe importance of considering the data manifold for investigations of\ngeneralization in DNNs.\n","authors":["Coenraad Mouton","Marthinus W. Theunissen","Marelie H. Davel"],"pdf_url":"https://arxiv.org/pdf/2308.15466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15464v1","updated":"2023-08-29T17:44:02Z","published":"2023-08-29T17:44:02Z","title":"A Comparative Study of Loss Functions: Traffic Predictions in Regular\n and Congestion Scenarios","summary":" Spatiotemporal graph neural networks have achieved state-of-the-art\nperformance in traffic forecasting. However, they often struggle to forecast\ncongestion accurately due to the limitations of traditional loss functions.\nWhile accurate forecasting of regular traffic conditions is crucial, a reliable\nAI system must also accurately forecast congestion scenarios to maintain safe\nand efficient transportation. In this paper, we explore various loss functions\ninspired by heavy tail analysis and imbalanced classification problems to\naddress this issue. We evaluate the efficacy of these loss functions in\nforecasting traffic speed, with an emphasis on congestion scenarios. Through\nextensive experiments on real-world traffic datasets, we discovered that when\noptimizing for Mean Absolute Error (MAE), the MAE-Focal Loss function stands\nout as the most effective. When optimizing Mean Squared Error (MSE), Gumbel\nLoss proves to be the superior choice. These choices effectively forecast\ntraffic congestion events without compromising the accuracy of regular traffic\nspeed forecasts. This research enhances deep learning models' capabilities in\nforecasting sudden speed changes due to congestion and underscores the need for\nmore research in this direction. By elevating the accuracy of congestion\nforecasting, we advocate for AI systems that are reliable, secure, and\nresilient in practical traffic management scenarios.\n","authors":["Yangxinyu Xie","Tanwi Mallick"],"pdf_url":"https://arxiv.org/pdf/2308.15464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13803v2","updated":"2023-08-29T17:38:45Z","published":"2023-01-31T17:44:59Z","title":"Fairness-aware Vision Transformer via Debiased Self-Attention","summary":" Vision Transformer (ViT) has recently gained significant interest in solving\ncomputer vision (CV) problems due to its capability of extracting informative\nfeatures and modeling long-range dependencies through the self-attention\nmechanism. To fully realize the advantages of ViT in real-world applications,\nrecent works have explored the trustworthiness of ViT, including its robustness\nand explainability. However, another desiderata, fairness has not yet been\nadequately addressed in the literature. We establish that the existing\nfairness-aware algorithms (primarily designed for CNNs) do not perform well on\nViT. This necessitates the need for developing our novel framework via Debiased\nSelf-Attention (DSA). DSA is a fairness-through-blindness approach that\nenforces ViT to eliminate spurious features correlated with the sensitive\nattributes for bias mitigation. Notably, adversarial examples are leveraged to\nlocate and mask the spurious features in the input image patches. In addition,\nDSA utilizes an attention weights alignment regularizer in the training\nobjective to encourage learning informative features for target prediction.\nImportantly, our DSA framework leads to improved fairness guarantees over prior\nworks on multiple prediction tasks without compromising target prediction\nperformance.\n","authors":["Yao Qiang","Chengyin Li","Prashant Khanduri","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2301.13803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15461v1","updated":"2023-08-29T17:38:33Z","published":"2023-08-29T17:38:33Z","title":"Canonical Factors for Hybrid Neural Fields","summary":" Factored feature volumes offer a simple way to build more compact, efficient,\nand intepretable neural fields, but also introduce biases that are not\nnecessarily beneficial for real-world data. In this work, we (1) characterize\nthe undesirable biases that these architectures have for axis-aligned signals\n-- they can lead to radiance field reconstruction differences of as high as 2\nPSNR -- and (2) explore how learning a set of canonicalizing transformations\ncan improve representations by removing these biases. We prove in a\ntwo-dimensional model problem that simultaneously learning these\ntransformations together with scene appearance succeeds with drastically\nimproved efficiency. We validate the resulting architectures, which we call\nTILTED, using image, signed distance, and radiance field reconstruction tasks,\nwhere we observe improvements across quality, robustness, compactness, and\nruntime. Results demonstrate that TILTED can enable capabilities comparable to\nbaselines that are 2x larger, while highlighting weaknesses of neural field\nevaluation procedures.\n","authors":["Brent Yi","Weijia Zeng","Sam Buchanan","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2308.15461v1.pdf","comment":"ICCV 2023. Project webpage: https://brentyi.github.io/tilted/"},{"id":"http://arxiv.org/abs/2308.15457v1","updated":"2023-08-29T17:31:26Z","published":"2023-08-29T17:31:26Z","title":"From SMOTE to Mixup for Deep Imbalanced Classification","summary":" Given imbalanced data, it is hard to train a good classifier using deep\nlearning because of the poor generalization of minority classes. Traditionally,\nthe well-known synthetic minority oversampling technique (SMOTE) for data\naugmentation, a data mining approach for imbalanced learning, has been used to\nimprove this generalization. However, it is unclear whether SMOTE also benefits\ndeep learning. In this work, we study why the original SMOTE is insufficient\nfor deep learning, and enhance SMOTE using soft labels. Connecting the\nresulting soft SMOTE with Mixup, a modern data augmentation technique, leads to\na unified framework that puts traditional and modern data augmentation\ntechniques under the same umbrella. A careful study within this framework shows\nthat Mixup improves generalization by implicitly achieving uneven margins\nbetween majority and minority classes. We then propose a novel margin-aware\nMixup technique that more explicitly achieves uneven margins. Extensive\nexperimental results demonstrate that our proposed technique yields\nstate-of-the-art performance on deep imbalanced classification while achieving\nsuperior performance on extremely imbalanced data. The code is open-sourced in\nour developed package https://github.com/ntucllab/imbalanced-DL to foster\nfuture research in this direction.\n","authors":["Wei-Chao Cheng","Tan-Ha Mai","Hsuan-Tien Lin"],"pdf_url":"https://arxiv.org/pdf/2308.15457v1.pdf","comment":"25 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.15452v1","updated":"2023-08-29T17:22:39Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" The reasoning capabilities of Large Language Models (LLMs) play a pivotal\nrole in the realm of embodied artificial intelligence. Although there are\neffective methods like program-of-thought prompting for LLMs which uses\nprogramming language to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find not all code data of complexity can be learned or understood\nby LLMs. Optimal level of complexity is critical to the improvement of\nreasoning abilities by program-aided prompting. Then we design an\nauto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrates the effectiveness of our\nproposed approach. Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2306.08018v2","updated":"2023-08-29T17:13:05Z","published":"2023-06-13T14:35:34Z","title":"Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for\n Large Language Models","summary":" Large Language Models (LLMs), with their remarkable task-handling\ncapabilities and innovative outputs, have catalyzed significant advancements\nacross a spectrum of fields. However, their proficiency within specialized\ndomains such as biomolecular studies remains limited. To address this\nchallenge, we introduce Mol-Instructions, a meticulously curated, comprehensive\ninstruction dataset expressly designed for the biomolecular realm.\nMol-Instructions is composed of three pivotal components: molecule-oriented\ninstructions, protein-oriented instructions, and biomolecular text\ninstructions, each curated to enhance the understanding and prediction\ncapabilities of LLMs concerning biomolecular features and behaviors. Through\nextensive instruction tuning experiments on the representative LLM, we\nunderscore the potency of Mol-Instructions to enhance the adaptability and\ncognitive acuity of large models within the complex sphere of biomolecular\nstudies, thereby promoting advancements in the biomolecular research community.\nMol-Instructions is made publicly accessible for future research endeavors and\nwill be subjected to continual updates for enhanced applicability.\n","authors":["Yin Fang","Xiaozhuan Liang","Ningyu Zhang","Kangwei Liu","Rui Huang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08018v2.pdf","comment":"Project homepage: https://github.com/zjunlp/Mol-Instructions. Add\n quantitative evaluations"},{"id":"http://arxiv.org/abs/2112.09153v2","updated":"2023-08-29T17:04:19Z","published":"2021-12-16T19:00:55Z","title":"An Empirical Investigation of the Role of Pre-training in Lifelong\n Learning","summary":" The lifelong learning paradigm in machine learning is an attractive\nalternative to the more prominent isolated learning scheme not only due to its\nresemblance to biological learning but also its potential to reduce energy\nwaste by obviating excessive model re-training. A key challenge to this\nparadigm is the phenomenon of catastrophic forgetting. With the increasing\npopularity and success of pre-trained models in machine learning, we pose the\nquestion: What role does pre-training play in lifelong learning, specifically\nwith respect to catastrophic forgetting? We investigate existing methods in the\ncontext of large, pre-trained models and evaluate their performance on a\nvariety of text and image classification tasks, including a large-scale study\nusing a novel data set of 15 diverse NLP tasks. Across all settings, we observe\nthat generic pre-training implicitly alleviates the effects of catastrophic\nforgetting when learning multiple tasks sequentially compared to randomly\ninitialized models. We then further investigate why pre-training alleviates\nforgetting in this setting. We study this phenomenon by analyzing the loss\nlandscape, finding that pre-trained weights appear to ease forgetting by\nleading to wider minima. Based on this insight, we propose jointly optimizing\nfor current task loss and loss basin sharpness to explicitly encourage wider\nbasins during sequential fine-tuning. We show that this optimization approach\noutperforms several state-of-the-art task-sequential continual learning\nalgorithms across multiple settings, occasionally even without retaining a\nmemory that scales in size with the number of tasks.\n","authors":["Sanket Vaibhav Mehta","Darshan Patil","Sarath Chandar","Emma Strubell"],"pdf_url":"https://arxiv.org/pdf/2112.09153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15434v1","updated":"2023-08-29T16:56:03Z","published":"2023-08-29T16:56:03Z","title":"Random feature approximation for general spectral methods","summary":" Random feature approximation is arguably one of the most popular techniques\nto speed up kernel methods in large scale algorithms and provides a theoretical\napproach to the analysis of deep neural networks. We analyze generalization\nproperties for a large class of spectral regularization methods combined with\nrandom features, containing kernel methods with implicit regularization such as\ngradient descent or explicit methods like Tikhonov regularization. For our\nestimators we obtain optimal learning rates over regularity classes (even for\nclasses that are not included in the reproducing kernel Hilbert space), which\nare defined through appropriate source conditions. This improves or completes\nprevious results obtained in related settings for specific kernel algorithms.\n","authors":["Mike Nguyen","Nicole Mücke"],"pdf_url":"https://arxiv.org/pdf/2308.15434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15410v1","updated":"2023-08-29T16:10:20Z","published":"2023-08-29T16:10:20Z","title":"Probabilistic solar flare forecasting using historical magnetogram data","summary":" Solar flare forecasting research using machine learning (ML) has focused on\nhigh resolution magnetogram data from the SDO/HMI era covering Solar Cycle 24\nand the start of Solar Cycle 25, with some efforts looking back to SOHO/MDI for\ndata from Solar Cycle 23. In this paper, we consider over 4 solar cycles of\ndaily historical magnetogram data from multiple instruments. This is the first\nattempt to take advantage of this historical data for ML-based flare\nforecasting. We apply a convolutional neural network (CNN) to extract features\nfrom full-disk magnetograms together with a logistic regression model to\nincorporate scalar features based on magnetograms and flaring history. We use\nan ensemble approach to generate calibrated probabilistic forecasts of M-class\nor larger flares in the next 24 hours. Overall, we find that including\nhistorical data improves forecasting skill and reliability. We show that single\nframe magnetograms do not contain significantly more relevant information than\ncan be summarized in a small number of scalar features, and that flaring\nhistory has greater predictive power than our CNN-extracted features. This\nindicates the importance of including temporal information in flare forecasting\nmodels.\n","authors":["Kiera van der Sande","Andrés Muñoz-Jaramillo","Subhamoy Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2308.15410v1.pdf","comment":"22 pages, 16 figures, accepted to ApJ"},{"id":"http://arxiv.org/abs/2308.15405v1","updated":"2023-08-29T16:07:18Z","published":"2023-08-29T16:07:18Z","title":"Robust Long-Tailed Learning via Label-Aware Bounded CVaR","summary":" Data in the real-world classification problems are always imbalanced or\nlong-tailed, wherein the majority classes have the most of the samples that\ndominate the model training. In such setting, the naive model tends to have\npoor performance on the minority classes. Previously, a variety of loss\nmodifications have been proposed to address the long-tailed leaning problem,\nwhile these methods either treat the samples in the same class\nindiscriminatingly or lack a theoretical guarantee. In this paper, we propose\ntwo novel approaches based on CVaR (Conditional Value at Risk) to improve the\nperformance of long-tailed learning with a solid theoretical ground.\nSpecifically, we firstly introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss\nto overcome the pessimistic result of the original CVaR, and further design the\noptimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we\nadditionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to\nstabilize the optimization process, where we also offer the theoretical\nsupport. Extensive experiments on real-world datasets with long-tailed label\ndistributions verify the superiority of our proposed methods.\n","authors":["Hong Zhu","Runpeng Yu","Xing Tang","Yifei Wang","Yuan Fang","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12896v2","updated":"2023-08-29T15:57:02Z","published":"2023-08-24T16:16:47Z","title":"Beyond Document Page Classification: Design, Datasets, and Challenges","summary":" This paper highlights the need to bring document classification benchmarking\ncloser to real-world applications, both in the nature of data tested ($X$:\nmulti-channel, multi-paged, multi-industry; $Y$: class distributions and label\nset variety) and in classification tasks considered ($f$: multi-page document,\npage stream, and document bundle classification, ...). We identify the lack of\npublic multi-page document classification datasets, formalize different\nclassification tasks arising in application scenarios, and motivate the value\nof targeting efficient multi-page document representations. An experimental\nstudy on proposed multi-page document classification datasets demonstrates that\ncurrent benchmarks have become irrelevant and need to be updated to evaluate\ncomplete documents, as they naturally occur in practice. This reality check\nalso calls for more mature evaluation methodologies, covering calibration\nevaluation, inference complexity (time-memory), and a range of realistic\ndistribution shifts (e.g., born-digital vs. scanning noise, shifting page\norder). Our study ends on a hopeful note by recommending concrete avenues for\nfuture improvements.}\n","authors":["Jordy Van Landeghem","Sanket Biswas","Matthew B. Blaschko","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.12896v2.pdf","comment":"8 pages, under review"},{"id":"http://arxiv.org/abs/2308.15395v1","updated":"2023-08-29T15:54:15Z","published":"2023-08-29T15:54:15Z","title":"The CausalBench challenge: A machine learning contest for gene network\n inference from single-cell perturbation data","summary":" In drug discovery, mapping interactions between genes within cellular systems\nis a crucial early step. This helps formulate hypotheses regarding molecular\nmechanisms that could potentially be targeted by future medicines. The\nCausalBench Challenge was an initiative to invite the machine learning\ncommunity to advance the state of the art in constructing gene-gene interaction\nnetworks. These networks, derived from large-scale, real-world datasets of\nsingle cells under various perturbations, are crucial for understanding the\ncausal mechanisms underlying disease biology. Using the framework provided by\nthe CausalBench benchmark, participants were tasked with enhancing the capacity\nof the state of the art methods to leverage large-scale genetic perturbation\ndata. This report provides an analysis and summary of the methods submitted\nduring the challenge to give a partial image of the state of the art at the\ntime of the challenge. The winning solutions significantly improved performance\ncompared to previous baselines, establishing a new state of the art for this\ncritical task in biology and medicine.\n","authors":["Mathieu Chevalley","Jacob Sackett-Sanders","Yusuf Roohani","Pascal Notin","Artemy Bakulin","Dariusz Brzezinski","Kaiwen Deng","Yuanfang Guan","Justin Hong","Michael Ibrahim","Wojciech Kotlowski","Marcin Kowiel","Panagiotis Misiakos","Achille Nazaret","Markus Püschel","Chris Wendler","Arash Mehrjou","Patrick Schwab"],"pdf_url":"https://arxiv.org/pdf/2308.15395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03854v4","updated":"2023-08-29T15:51:05Z","published":"2023-07-07T22:00:31Z","title":"inTformer: A Time-Embedded Attention-Based Transformer for Crash\n Likelihood Prediction at Intersections Using Connected Vehicle Data","summary":" The real-time crash likelihood prediction model is an essential component of\nthe proactive traffic safety management system. Over the years, numerous\nstudies have attempted to construct a crash likelihood prediction model in\norder to enhance traffic safety, but mostly on freeways. In the majority of the\nexisting studies, researchers have primarily employed a deep learning-based\nframework to identify crash potential. Lately, Transformer has emerged as a\npotential deep neural network that fundamentally operates through\nattention-based mechanisms. Transformer has several functional benefits over\nextant deep learning models such as LSTM, CNN, etc. Firstly, Transformer can\nreadily handle long-term dependencies in a data sequence. Secondly,\nTransformers can parallelly process all elements in a data sequence during\ntraining. Finally, a Transformer does not have the vanishing gradient issue.\nRealizing the immense possibility of Transformers, this paper proposes\ninTersection-Transformer (inTformer), a time-embedded attention-based\nTransformer model that can effectively predict intersection crash likelihood in\nreal-time. The proposed model was evaluated using connected vehicle data\nextracted from Signal Analytics Platform. Acknowledging the complex traffic\noperation mechanism at intersection, this study developed zone-specific models\nby dividing the intersection region into two distinct zones:\nwithin-intersection and approach zone. The best inTformer models in\n'within-intersection,' and 'approach' zone achieved a sensitivity of 73%, and\n70%, respectively. The zone-level models were also compared to earlier studies\non crash likelihood prediction at intersections and with several established\ndeep learning models trained on the same connected vehicle dataset.\n","authors":["B M Tazbiul Hassan Anik","Zubayer Islam","Mohamed Abdel-Aty"],"pdf_url":"https://arxiv.org/pdf/2307.03854v4.pdf","comment":"29 pages, 10 figures, 8 tables"},{"id":"http://arxiv.org/abs/2308.15394v1","updated":"2023-08-29T15:48:49Z","published":"2023-08-29T15:48:49Z","title":"Decentralized Multi-agent Reinforcement Learning based State-of-Charge\n Balancing Strategy for Distributed Energy Storage System","summary":" This paper develops a Decentralized Multi-Agent Reinforcement Learning\n(Dec-MARL) method to solve the SoC balancing problem in the distributed energy\nstorage system (DESS). First, the SoC balancing problem is formulated into a\nfinite Markov decision process with action constraints derived from demand\nbalance, which can be solved by Dec-MARL. Specifically, the first-order average\nconsensus algorithm is utilized to expand the observations of the DESS state in\na fully-decentralized way, and the initial actions (i.e., output power) are\ndecided by the agents (i.e., energy storage units) according to these\nobservations. In order to get the final actions in the allowable range, a\ncounterfactual demand balance algorithm is proposed to balance the total demand\nand the initial actions. Next, the agents execute the final actions and get\nlocal rewards from the environment, and the DESS steps into the next state.\nFinally, through the first-order average consensus algorithm, the agents get\nthe average reward and the expended observation of the next state for later\ntraining. By the above procedure, Dec-MARL reveals outstanding performance in a\nfully-decentralized system without any expert experience or constructing any\ncomplicated model. Besides, it is flexible and can be extended to other\ndecentralized multi-agent systems straightforwardly. Extensive simulations have\nvalidated the effectiveness and efficiency of Dec-MARL.\n","authors":["Zheng Xiong","Biao Luo","Bing-Chuan Wang","Xiaodong Xu","Xiaodong Liu","Tingwen Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01351v3","updated":"2023-08-29T15:42:35Z","published":"2022-12-02T18:13:48Z","title":"A Bayesian Framework for Digital Twin-Based Control, Monitoring, and\n Data Collection in Wireless Systems","summary":" Commonly adopted in the manufacturing and aerospace sectors, digital twin\n(DT) platforms are increasingly seen as a promising paradigm to control,\nmonitor, and analyze software-based, \"open\", communication systems. Notably, DT\nplatforms provide a sandbox in which to test artificial intelligence (AI)\nsolutions for communication systems, potentially reducing the need to collect\ndata and test algorithms in the field, i.e., on the physical twin (PT). A key\nchallenge in the deployment of DT systems is to ensure that virtual control\noptimization, monitoring, and analysis at the DT are safe and reliable,\navoiding incorrect decisions caused by \"model exploitation\". To address this\nchallenge, this paper presents a general Bayesian framework with the aim of\nquantifying and accounting for model uncertainty at the DT that is caused by\nlimitations in the amount and quality of data available at the DT from the PT.\nIn the proposed framework, the DT builds a Bayesian model of the communication\nsystem, which is leveraged to enable core DT functionalities such as control\nvia multi-agent reinforcement learning (MARL), monitoring of the PT for anomaly\ndetection, prediction, data-collection optimization, and counterfactual\nanalysis. To exemplify the application of the proposed framework, we\nspecifically investigate a case-study system encompassing multiple sensing\ndevices that report to a common receiver. Experimental results validate the\neffectiveness of the proposed Bayesian framework as compared to standard\nfrequentist model-based solutions.\n","authors":["Clement Ruah","Osvaldo Simeone","Bashir Al-Hashimi"],"pdf_url":"https://arxiv.org/pdf/2212.01351v3.pdf","comment":"Accepted for publication in IEEE Journal on Selected Areas in\n Communications ; Extends and subsumes arXiv:2210.05582 ; Updates: -\n 18/01/2023: Updated reference ; - 29/08/2023: Revised manuscript version"},{"id":"http://arxiv.org/abs/2306.10033v2","updated":"2023-08-29T15:39:32Z","published":"2023-06-07T19:40:37Z","title":"Investigating Reproducibility at Interspeech Conferences: A Longitudinal\n and Comparative Perspective","summary":" Reproducibility is a key aspect for scientific advancement across\ndisciplines, and reducing barriers for open science is a focus area for the\ntheme of Interspeech 2023. Availability of source code is one of the indicators\nthat facilitates reproducibility. However, less is known about the rates of\nreproducibility at Interspeech conferences in comparison to other conferences\nin the field. In order to fill this gap, we have surveyed 27,717 papers at\nseven conferences across speech and language processing disciplines. We find\nthat despite having a close number of accepted papers to the other conferences,\nInterspeech has up to 40% less source code availability. In addition to\nreporting the difficulties we have encountered during our research, we also\nprovide recommendations and possible directions to increase reproducibility for\nfurther studies.\n","authors":["Mohammad Arvan","A. Seza Doğruöz","Natalie Parde"],"pdf_url":"https://arxiv.org/pdf/2306.10033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15386v1","updated":"2023-08-29T15:29:06Z","published":"2023-08-29T15:29:06Z","title":"Shape-Margin Knowledge Augmented Network for Thyroid Nodule Segmentation\n and Diagnosis","summary":" Thyroid nodule segmentation is a crucial step in the diagnostic procedure of\nphysicians and computer-aided diagnosis systems. Mostly, current studies treat\nsegmentation and diagnosis as independent tasks without considering the\ncorrelation between these tasks. The sequence steps of these independent tasks\nin computer-aided diagnosis systems may lead to the accumulation of errors.\nTherefore, it is worth combining them as a whole through exploring the\nrelationship between thyroid nodule segmentation and diagnosis. According to\nthe thyroid imaging reporting and data system (TI-RADS), the assessment of\nshape and margin characteristics is the prerequisite for the discrimination of\nbenign and malignant thyroid nodules. These characteristics can be observed in\nthe thyroid nodule segmentation masks. Inspired by the diagnostic procedure of\nTI-RADS, this paper proposes a shape-margin knowledge augmented network\n(SkaNet) for simultaneously thyroid nodule segmentation and diagnosis. Due to\nthe similarity in visual features between segmentation and diagnosis, SkaNet\nshares visual features in the feature extraction stage and then utilizes a\ndual-branch architecture to perform thyroid nodule segmentation and diagnosis\ntasks simultaneously. To enhance effective discriminative features, an\nexponential mixture module is devised, which incorporates convolutional feature\nmaps and self-attention maps by exponential weighting. Then, SkaNet is jointly\noptimized by a knowledge augmented multi-task loss function with a constraint\npenalty term. It embeds shape and margin characteristics through numerical\ncomputation and models the relationship between the thyroid nodule diagnosis\nresults and segmentation masks.\n","authors":["Weihua Liu","Chaochao Lin"],"pdf_url":"https://arxiv.org/pdf/2308.15386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15370v1","updated":"2023-08-29T15:06:47Z","published":"2023-08-29T15:06:47Z","title":"Multi-Response Heteroscedastic Gaussian Process Models and Their\n Inference","summary":" Despite the widespread utilization of Gaussian process models for versatile\nnonparametric modeling, they exhibit limitations in effectively capturing\nabrupt changes in function smoothness and accommodating relationships with\nheteroscedastic errors. Addressing these shortcomings, the heteroscedastic\nGaussian process (HeGP) regression seeks to introduce flexibility by\nacknowledging the variability of residual variances across covariates in the\nregression model. In this work, we extend the HeGP concept, expanding its scope\nbeyond regression tasks to encompass classification and state-space models. To\nachieve this, we propose a novel framework where the Gaussian process is\ncoupled with a covariate-induced precision matrix process, adopting a mixture\nformulation. This approach enables the modeling of heteroscedastic covariance\nfunctions across covariates. To mitigate the computational challenges posed by\nsampling, we employ variational inference to approximate the posterior and\nfacilitate posterior predictive modeling. Additionally, our training process\nleverages an EM algorithm featuring closed-form M-step updates to efficiently\nevaluate the heteroscedastic covariance function. A notable feature of our\nmodel is its consistent performance on multivariate responses, accommodating\nvarious types (continuous or categorical) seamlessly. Through a combination of\nsimulations and real-world applications in climatology, we illustrate the\nmodel's prowess and advantages. By overcoming the limitations of traditional\nGaussian process models, our proposed framework offers a robust and versatile\ntool for a wide array of applications.\n","authors":["Taehee Lee","Jun S. Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15370v1.pdf","comment":"submitted to the Journal of the American Statistical Association\n (JASA)"},{"id":"http://arxiv.org/abs/2308.15367v1","updated":"2023-08-29T15:03:05Z","published":"2023-08-29T15:03:05Z","title":"Efficient Model Personalization in Federated Learning via\n Client-Specific Prompt Generation","summary":" Federated learning (FL) emerges as a decentralized learning framework which\ntrains models from multiple distributed clients without sharing their data to\npreserve privacy. Recently, large-scale pre-trained models (e.g., Vision\nTransformer) have shown a strong capability of deriving robust representations.\nHowever, the data heterogeneity among clients, the limited computation\nresources, and the communication bandwidth restrict the deployment of\nlarge-scale models in FL frameworks. To leverage robust representations from\nlarge-scale models while enabling efficient model personalization for\nheterogeneous clients, we propose a novel personalized FL framework of\nclient-specific Prompt Generation (pFedPG), which learns to deploy a\npersonalized prompt generator at the server for producing client-specific\nvisual prompts that efficiently adapts frozen backbones to local data\ndistributions. Our proposed framework jointly optimizes the stages of\npersonalized prompt adaptation locally and personalized prompt generation\nglobally. The former aims to train visual prompts that adapt foundation models\nto each client, while the latter observes local optimization directions to\ngenerate personalized prompts for all clients. Through extensive experiments on\nbenchmark datasets, we show that our pFedPG is favorable against\nstate-of-the-art personalized FL methods under various types of data\nheterogeneity, allowing computation and communication efficient model\npersonalization.\n","authors":["Fu-En Yang","Chien-Yi Wang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15367v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15364v1","updated":"2023-08-29T15:01:01Z","published":"2023-08-29T15:01:01Z","title":"Heterogeneous Multi-Task Gaussian Cox Processes","summary":" This paper presents a novel extension of multi-task Gaussian Cox processes\nfor modeling multiple heterogeneous correlated tasks jointly, e.g.,\nclassification and regression, via multi-output Gaussian processes (MOGP). A\nMOGP prior over the parameters of the dedicated likelihoods for classification,\nregression and point process tasks can facilitate sharing of information\nbetween heterogeneous tasks, while allowing for nonparametric parameter\nestimation. To circumvent the non-conjugate Bayesian inference in the MOGP\nmodulated heterogeneous multi-task framework, we employ the data augmentation\ntechnique and derive a mean-field approximation to realize closed-form\niterative updates for estimating model parameters. We demonstrate the\nperformance and inference on both 1D synthetic data as well as 2D urban data of\nVancouver.\n","authors":["Feng Zhou","Quyu Kong","Zhijie Deng","Fengxiang He","Peng Cui","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.15364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15363v1","updated":"2023-08-29T14:59:54Z","published":"2023-08-29T14:59:54Z","title":"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation","summary":" Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL\ntask. However, the absence of a systematical benchmark inhibits the development\nof designing effective, efficient and economic LLM-based Text-to-SQL solutions.\nTo address this challenge, in this paper, we first conduct a systematical and\nextensive comparison over existing prompt engineering methods, including\nquestion representation, example selection and example organization, and with\nthese experimental results, we elaborates their pros and cons. Based on these\nfindings, we propose a new integrated solution, named DAIL-SQL, which refreshes\nthe Spider leaderboard with 86.6% execution accuracy and sets a new bar.\nTowards an efficient and economic LLM-based Text-to-SQL solution, we emphasize\nthe token efficiency in prompt engineering and compare the prior studies under\nthis metric. Additionally, we investigate open-source LLMs in in-context\nlearning, and further enhance their performance with task-specific supervised\nfine-tuning. Our explorations highlight open-source LLMs' potential in\nText-to-SQL, as well as the advantages and disadvantages of the task-specific\nsupervised fine-tuning. We hope that our work provides a deeper understanding\nof Text-to-SQL with LLMs, and inspire further investigations and broad\napplications.\n","authors":["Dawei Gao","Haibin Wang","Yaliang Li","Xiuyu Sun","Yichen Qian","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15363v1.pdf","comment":"We have released code on https://github.com/BeachWang/DAIL-SQL"},{"id":"http://arxiv.org/abs/2305.14594v2","updated":"2023-08-29T14:51:08Z","published":"2023-05-24T00:20:59Z","title":"torchgfn: A PyTorch GFlowNet library","summary":" The growing popularity of generative flow networks (GFlowNets or GFNs) from a\nrange of researchers with diverse backgrounds and areas of expertise\nnecessitates a library which facilitates the testing of new features such as\ntraining losses that can be easily compared to standard benchmark\nimplementations, or on a set of common environments. torchgfn is a PyTorch\nlibrary that aims to address this need. It provides users with a simple API for\nenvironments and useful abstractions for samplers and losses. Multiple examples\nare provided, replicating and unifying published results. The code is available\nin https://github.com/saleml/torchgfn.\n","authors":["Salem Lahlou","Joseph D. Viviano","Victor Schmidt","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2305.14594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15349v1","updated":"2023-08-29T14:45:23Z","published":"2023-08-29T14:45:23Z","title":"Lie-Poisson Neural Networks (LPNets): Data-Based Computing of\n Hamiltonian Systems with Symmetries","summary":" An accurate data-based prediction of the long-term evolution of Hamiltonian\nsystems requires a network that preserves the appropriate structure under each\ntime step. Every Hamiltonian system contains two essential ingredients: the\nPoisson bracket and the Hamiltonian. Hamiltonian systems with symmetries, whose\nparadigm examples are the Lie-Poisson systems, have been shown to describe a\nbroad category of physical phenomena, from satellite motion to underwater\nvehicles, fluids, geophysical applications, complex fluids, and plasma physics.\nThe Poisson bracket in these systems comes from the symmetries, while the\nHamiltonian comes from the underlying physics. We view the symmetry of the\nsystem as primary, hence the Lie-Poisson bracket is known exactly, whereas the\nHamiltonian is regarded as coming from physics and is considered not known, or\nknown approximately. Using this approach, we develop a network based on\ntransformations that exactly preserve the Poisson bracket and the special\nfunctions of the Lie-Poisson systems (Casimirs) to machine precision. We\npresent two flavors of such systems: one, where the parameters of\ntransformations are computed from data using a dense neural network (LPNets),\nand another, where the composition of transformations is used as building\nblocks (G-LPNets). We also show how to adapt these methods to a larger class of\nPoisson brackets. We apply the resulting methods to several examples, such as\nrigid body (satellite) motion, underwater vehicles, a particle in a magnetic\nfield, and others. The methods developed in this paper are important for the\nconstruction of accurate data-based methods for simulating the long-term\ndynamics of physical systems.\n","authors":["Christopher Eldred","François Gay-Balmaz","Sofiia Huraka","Vakhtang Putkaradze"],"pdf_url":"https://arxiv.org/pdf/2308.15349v1.pdf","comment":"57 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.15344v1","updated":"2023-08-29T14:41:05Z","published":"2023-08-29T14:41:05Z","title":"Imperceptible Adversarial Attack on Deep Neural Networks from Image\n Boundary","summary":" Although Deep Neural Networks (DNNs), such as the convolutional neural\nnetworks (CNN) and Vision Transformers (ViTs), have been successfully applied\nin the field of computer vision, they are demonstrated to be vulnerable to\nwell-sought Adversarial Examples (AEs) that can easily fool the DNNs. The\nresearch in AEs has been active, and many adversarial attacks and explanations\nhave been proposed since they were discovered in 2014. The mystery of the AE's\nexistence is still an open question, and many studies suggest that DNN training\nalgorithms have blind spots. The salient objects usually do not overlap with\nboundaries; hence, the boundaries are not the DNN model's attention.\nNevertheless, recent studies show that the boundaries can dominate the behavior\nof the DNN models. Hence, this study aims to look at the AEs from a different\nperspective and proposes an imperceptible adversarial attack that systemically\nattacks the input image boundary for finding the AEs. The experimental results\nhave shown that the proposed boundary attacking method effectively attacks six\nCNN models and the ViT using only 32% of the input image content (from the\nboundaries) with an average success rate (SR) of 95.2% and an average peak\nsignal-to-noise ratio of 41.37 dB. Correlation analyses are conducted,\nincluding the relation between the adversarial boundary's width and the SR and\nhow the adversarial boundary changes the DNN model's attention. This paper's\ndiscoveries can potentially advance the understanding of AEs and provide a\ndifferent perspective on how AEs can be constructed.\n","authors":["Fahad Alrasheedi","Xin Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.15344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.10184v3","updated":"2023-08-29T14:36:29Z","published":"2022-10-18T22:12:29Z","title":"Application Performance Modeling via Tensor Completion","summary":" Performance tuning, software/hardware co-design, and job scheduling are among\nthe many tasks that rely on models to predict application performance. We\npropose and evaluate low-rank tensor decomposition for modeling application\nperformance. We discretize the input and configuration domains of an\napplication using regular grids. Application execution times mapped within\ngrid-cells are averaged and represented by tensor elements. We show that\nlow-rank canonical-polyadic (CP) tensor decomposition is effective in\napproximating these tensors. We further show that this decomposition enables\naccurate extrapolation of unobserved regions of an application's parameter\nspace. We then employ tensor completion to optimize a CP decomposition given a\nsparse set of observed execution times. We consider alternative\npiecewise/grid-based models and supervised learning models for six applications\nand demonstrate that CP decomposition optimized using tensor completion offers\nhigher prediction accuracy and memory-efficiency for high-dimensional\nperformance modeling.\n","authors":["Edward Hutter","Edgar Solomonik"],"pdf_url":"https://arxiv.org/pdf/2210.10184v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06399v2","updated":"2023-08-29T14:27:58Z","published":"2023-08-11T21:46:45Z","title":"Learning Bayesian Networks with Heterogeneous Agronomic Data Sets via\n Mixed-Effect Models and Hierarchical Clustering","summary":" Research involving diverse but related data sets, where associations between\ncovariates and outcomes may vary, is prevalent in various fields including\nagronomic studies. In these scenarios, hierarchical models, also known as\nmultilevel models, are frequently employed to assimilate information from\ndifferent data sets while accommodating their distinct characteristics.\nHowever, their structure extend beyond simple heterogeneity, as variables often\nform complex networks of causal relationships.\n Bayesian networks (BNs) provide a powerful framework for modelling such\nrelationships using directed acyclic graphs to illustrate the connections\nbetween variables. This study introduces a novel approach that integrates\nrandom effects into BN learning. Rooted in linear mixed-effects models, this\napproach is particularly well-suited for handling hierarchical data. Results\nfrom a real-world agronomic trial suggest that employing this approach enhances\nstructural learning, leading to the discovery of new connections and the\nimprovement of improved model specification. Furthermore, we observe a\nreduction in prediction errors from 28% to 17%. By extending the applicability\nof BNs to complex data set structures, this approach contributes to the\neffective utilisation of BNs for hierarchical agronomic data. This, in turn,\nenhances their value as decision-support tools in the field.\n","authors":["Lorenzo Valleggi","Marco Scutari","Federico Mattia Stefanini"],"pdf_url":"https://arxiv.org/pdf/2308.06399v2.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.15327v1","updated":"2023-08-29T14:23:44Z","published":"2023-08-29T14:23:44Z","title":"Enhancing Robot Learning through Learned Human-Attention Feature Maps","summary":" Robust and efficient learning remains a challenging problem in robotics, in\nparticular with complex visual inputs. Inspired by human attention mechanism,\nwith which we quickly process complex visual scenes and react to changes in the\nenvironment, we think that embedding auxiliary information about focus point\ninto robot learning would enhance efficiency and robustness of the learning\nprocess. In this paper, we propose a novel approach to model and emulate the\nhuman attention with an approximate prediction model. We then leverage this\noutput and feed it as a structured auxiliary feature map into downstream\nlearning tasks. We validate this idea by learning a prediction model from\nhuman-gaze recordings of manual driving in the real world. We test our approach\non two learning tasks - object detection and imitation learning. Our\nexperiments demonstrate that the inclusion of predicted human attention leads\nto improved robustness of the trained models to out-of-distribution samples and\nfaster learning in low-data regime settings. Our work highlights the potential\nof incorporating structured auxiliary information in representation learning\nfor robotics and opens up new avenues for research in this direction. All code\nand data are available online.\n","authors":["Daniel Scheuchenstuhl","Stefan Ulmer","Felix Resch","Luigi Berducci","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2308.15327v1.pdf","comment":"This work has been accepted for the RAP4Robots workshop at ICRA 2023\n in London"},{"id":"http://arxiv.org/abs/2308.15323v1","updated":"2023-08-29T14:20:13Z","published":"2023-08-29T14:20:13Z","title":"Occlusion-Aware Deep Convolutional Neural Network via Homogeneous\n Tanh-transforms for Face Parsing","summary":" Face parsing infers a pixel-wise label map for each semantic facial\ncomponent. Previous methods generally work well for uncovered faces, however\noverlook the facial occlusion and ignore some contextual area outside a single\nface, especially when facial occlusion has become a common situation during the\nCOVID-19 epidemic. Inspired by the illumination theory of image, we propose a\nnovel homogeneous tanh-transforms for image preprocessing, which made up of\nfour tanh-transforms, that fuse the central vision and the peripheral vision\ntogether. Our proposed method addresses the dilemma of face parsing under\nocclusion and compresses more information of surrounding context. Based on\nhomogeneous tanh-transforms, we propose an occlusion-aware convolutional neural\nnetwork for occluded face parsing. It combines the information both in\nTanh-polar space and Tanh-Cartesian space, capable of enhancing receptive\nfields. Furthermore, we introduce an occlusion-aware loss to focus on the\nboundaries of occluded regions. The network is simple and flexible, and can be\ntrained end-to-end. To facilitate future research of occluded face parsing, we\nalso contribute a new cleaned face parsing dataset, which is manually purified\nfrom several academic or industrial datasets, including CelebAMask-HQ,\nShort-video Face Parsing as well as Helen dataset and will make it public.\nExperiments demonstrate that our method surpasses state-of-art methods of face\nparsing under occlusion.\n","authors":["Weihua Liu","Chaochao Lin","Haoping Yu","Said Boumaraf","Zhaoqiong Pi"],"pdf_url":"https://arxiv.org/pdf/2308.15323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15321v1","updated":"2023-08-29T14:16:09Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then attribute the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v1.pdf","comment":"7 pages, code available soon"},{"id":"http://arxiv.org/abs/2303.07122v4","updated":"2023-08-29T14:07:12Z","published":"2023-02-22T19:35:28Z","title":"Quantifying Causes of Arctic Amplification via Deep Learning based\n Time-series Causal Inference","summary":" The warming of the Arctic, also known as Arctic amplification, is led by\nseveral atmospheric and oceanic drivers. However, the details of its underlying\nthermodynamic causes are still unknown. Inferring the causal effects of\natmospheric processes on sea ice melt using fixed treatment effect strategies\nleads to unrealistic counterfactual estimations. Such models are also prone to\nbias due to time-varying confoundedness. Further, the complex non-linearity in\nEarth science data makes it infeasible to perform causal inference using\nexisting marginal structural techniques. In order to tackle these challenges,\nwe propose TCINet - time-series causal inference model to infer causation under\ncontinuous treatment using recurrent neural networks and a novel probabilistic\nbalancing technique. Through experiments on synthetic and observational data,\nwe show how our research can substantially improve the ability to quantify\nleading causes of Arctic sea ice melt, further paving paths for causal\ninference in observational Earth science.\n","authors":["Sahara Ali","Omar Faruque","Yiyi Huang","Md. Osman Gani","Aneesh Subramanian","Nicole-Jienne Shchlegel","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2303.07122v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.01654v3","updated":"2023-08-29T13:58:43Z","published":"2020-10-04T18:56:34Z","title":"Bayesian Feature Selection in Joint Quantile Time Series Analysis","summary":" Quantile feature selection over correlated multivariate time series data has\nalways been a methodological challenge and is an open problem. In this paper,\nwe propose a general Bayesian dimension reduction methodology for feature\nselection in high-dimensional joint quantile time series analysis, under the\nname of the quantile feature selection time series (QFSTS) model. The QFSTS\nmodel is a general structural time series model, where each component yields an\nadditive contribution to the time series modeling with direct interpretations.\nIts flexibility is compound in the sense that users can add/deduct components\nfor each time series and each time series can have its own specific valued\ncomponents of different sizes. Feature selection is conducted in the quantile\nregression component, where each time series has its own pool of\ncontemporaneous external predictors allowing nowcasting. Bayesian methodology\nin extending feature selection to the quantile time series research area is\ndeveloped using multivariate asymmetric Laplace distribution, spike-and-slab\nprior setup, the Metropolis-Hastings algorithm, and the Bayesian model\naveraging technique, all implemented consistently in the Bayesian paradigm. The\nQFSTS model requires small datasets to train and converges fast. Extensive\nexaminations confirmed that the QFSTS model has superior performance in feature\nselection, parameter estimation, and forecast.\n","authors":["Ning Ning"],"pdf_url":"https://arxiv.org/pdf/2010.01654v3.pdf","comment":"Accepted to the Bayesian Analysis journal"},{"id":"http://arxiv.org/abs/2304.03981v3","updated":"2023-08-29T13:50:43Z","published":"2023-04-08T10:47:41Z","title":"Uncertainty-inspired Open Set Learning for Retinal Anomaly\n Identification","summary":" Failure to recognize samples from the classes unseen during training is a\nmajor limitation of artificial intelligence in the real-world implementation\nfor recognition and classification of retinal anomalies. We established an\nuncertainty-inspired open-set (UIOS) model, which was trained with fundus\nimages of 9 retinal conditions. Besides assessing the probability of each\ncategory, UIOS also calculated an uncertainty score to express its confidence.\nOur UIOS model with thresholding strategy achieved an F1 score of 99.55%,\n97.01% and 91.91% for the internal testing set, external target categories\n(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1\nscore of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS\ncorrectly predicted high uncertainty scores, which would prompt the need for a\nmanual check in the datasets of non-target categories retinal diseases,\nlow-quality fundus images, and non-fundus images. UIOS provides a robust method\nfor real-world screening of retinal anomalies.\n","authors":["Meng Wang","Tian Lin","Lianyu Wang","Aidi Lin","Ke Zou","Xinxing Xu","Yi Zhou","Yuanyuan Peng","Qingquan Meng","Yiming Qian","Guoyao Deng","Zhiqun Wu","Junhong Chen","Jianhong Lin","Mingzhi Zhang","Weifang Zhu","Changqing Zhang","Daoqiang Zhang","Rick Siow Mong Goh","Yong Liu","Chi Pui Pang","Xinjian Chen","Haoyu Chen","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2304.03981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15308v1","updated":"2023-08-29T13:48:35Z","published":"2023-08-29T13:48:35Z","title":"On-Device Learning with Binary Neural Networks","summary":" Existing Continual Learning (CL) solutions only partially address the\nconstraints on power, memory and computation of the deep learning models when\ndeployed on low-power embedded CPUs. In this paper, we propose a CL solution\nthat embraces the recent advancements in CL field and the efficiency of the\nBinary Neural Networks (BNN), that use 1-bit for weights and activations to\nefficiently execute deep learning models. We propose a hybrid quantization of\nCWR* (an effective CL approach) that considers differently forward and backward\npass in order to retain more precision during gradient update step and at the\nsame time minimizing the latency overhead. The choice of a binary network as\nbackbone is essential to meet the constraints of low power devices and, to the\nbest of authors' knowledge, this is the first attempt to prove on-device\nlearning with BNN. The experimental validation carried out confirms the\nvalidity and the suitability of the proposed method.\n","authors":["Lorenzo Vorabbi","Davide Maltoni","Stefano Santi"],"pdf_url":"https://arxiv.org/pdf/2308.15308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.15584v2","updated":"2023-08-29T13:35:42Z","published":"2020-12-31T12:40:52Z","title":"Combinatorial Pure Exploration with Full-bandit Feedback and Beyond:\n Solving Combinatorial Optimization under Uncertainty with Limited Observation","summary":" Combinatorial optimization is one of the fundamental research fields that has\nbeen extensively studied in theoretical computer science and operations\nresearch. When developing an algorithm for combinatorial optimization, it is\ncommonly assumed that parameters such as edge weights are exactly known as\ninputs. However, this assumption may not be fulfilled since input parameters\nare often uncertain or initially unknown in many applications such as\nrecommender systems, crowdsourcing, communication networks, and online\nadvertisement. To resolve such uncertainty, the problem of combinatorial pure\nexploration of multi-armed bandits (CPE) and its variants have recieved\nincreasing attention. Earlier work on CPE has studied the semi-bandit feedback\nor assumed that the outcome from each individual edge is always accessible at\nall rounds. However, due to practical constraints such as a budget ceiling or\nprivacy concern, such strong feedback is not always available in recent\napplications. In this article, we review recently proposed techniques for\ncombinatorial pure exploration problems with limited feedback.\n","authors":["Yuko Kuroki","Junya Honda","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2012.15584v2.pdf","comment":"Preprint of an Invited Review Article, In Fields Institute"},{"id":"http://arxiv.org/abs/2306.11167v3","updated":"2023-08-29T13:33:52Z","published":"2023-06-19T21:14:57Z","title":"Large Language Models are Fixated by Red Herrings: Exploring Creative\n Problem Solving and Einstellung Effect using the Only Connect Wall Dataset","summary":" The quest for human imitative AI has been an enduring topic in AI research\nsince its inception. The technical evolution and emerging capabilities of the\nlatest cohort of large language models (LLMs) have reinvigorated the subject\nbeyond academia to the cultural zeitgeist. While recent NLP evaluation\nbenchmark tasks test some aspects of human-imitative behaviour (e.g.,\nBIG-bench's 'human-like behavior' tasks), few, if not none, examine creative\nproblem solving abilities. Creative problem solving in humans is a well-studied\ntopic in cognitive neuroscience with standardized tests that predominantly use\nthe ability to associate (heterogeneous) connections among clue words as a\nmetric for creativity. Exposure to misleading stimuli - distractors dubbed red\nherrings - impede human performance in such tasks via the fixation effect and\nEinstellung paradigm. In cognitive neuroscience studies, such fixations are\nexperimentally induced by pre-exposing participants to orthographically similar\nincorrect words to subsequent word-fragments or clues. The popular British quiz\nshow Only Connect's Connecting Wall segment essentially mimics Mednick's Remote\nAssociates Test (RAT) formulation with built-in, deliberate red herrings, which\nmakes it an ideal proxy dataset to explore and study fixation effect and\nEinstellung paradigm from cognitive neuroscience in LLMs. In this paper we\npresent the novel Only Connect Wall (OCW) dataset and report results from our\nevaluation of selected pre-trained language models and LLMs on creative problem\nsolving tasks like grouping clue words by heterogeneous connections, and\nidentifying correct open knowledge domain connections in respective groups. We\nsynthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to\nfurther analyze our red-herrings hypothesis in language models. The code and\nlink to the dataset are available at https://github.com/TaatiTeam/OCW.\n","authors":["Saeid Naeini","Raeid Saqur","Mozhgan Saeidi","John Giorgi","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2306.11167v3.pdf","comment":"V3: Minor cosmetic adjustment from V2. Fixed Fig. 2 caption\n overlapping with text in S2.2. V2: with added OCW-Randomized and OCW-WordNet\n results in Section 4.3 (added). 22 pages with Appendix"},{"id":"http://arxiv.org/abs/2209.04512v3","updated":"2023-08-29T13:33:49Z","published":"2022-09-09T20:29:54Z","title":"Deep Learning Based Residuals in Non-linear Factor Models: Precision\n Matrix Estimation of Returns with Low Signal-to-Noise Ratio","summary":" This paper introduces a consistent estimator and rate of convergence for the\nprecision matrix of asset returns in large portfolios using a non-linear factor\nmodel within the deep learning framework. Our estimator remains valid even in\nlow signal-to-noise ratio environments typical for financial markets and is\ncompatible with weak factors. Our theoretical analysis establishes uniform\nbounds on expected estimation risk based on deep neural networks for an\nexpanding number of assets. Additionally, we provide a new consistent\ndata-dependent estimator of error covariance in deep neural networks. Our\nmodels demonstrate superior accuracy in extensive simulations and the empirics.\n","authors":["Mehmet Caner","Maurizio Daniele"],"pdf_url":"https://arxiv.org/pdf/2209.04512v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15291v1","updated":"2023-08-29T13:25:26Z","published":"2023-08-29T13:25:26Z","title":"Towards quantitative precision for ECG analysis: Leveraging state space\n models, self-supervision and patient metadata","summary":" Deep learning has emerged as the preferred modeling approach for automatic\nECG analysis. In this study, we investigate three elements aimed at improving\nthe quantitative accuracy of such systems. These components consistently\nenhance performance beyond the existing state-of-the-art, which is\npredominantly based on convolutional models. Firstly, we explore more\nexpressive architectures by exploiting structured state space models (SSMs).\nThese models have shown promise in capturing long-term dependencies in time\nseries data. By incorporating SSMs into our approach, we not only achieve\nbetter performance, but also gain insights into long-standing questions in the\nfield. Specifically, for standard diagnostic tasks, we find no advantage in\nusing higher sampling rates such as 500Hz compared to 100Hz. Similarly,\nextending the input size of the model beyond 3 seconds does not lead to\nsignificant improvements. Secondly, we demonstrate that self-supervised\nlearning using contrastive predictive coding can further improve the\nperformance of SSMs. By leveraging self-supervision, we enable the model to\nlearn more robust and representative features, leading to improved analysis\naccuracy. Lastly, we depart from synthetic benchmarking scenarios and\nincorporate basic demographic metadata alongside the ECG signal as input. This\ninclusion of patient metadata departs from the conventional practice of relying\nsolely on the signal itself. Remarkably, this addition consistently yields\npositive effects on predictive performance. We firmly believe that all three\ncomponents should be considered when developing next-generation ECG analysis\nalgorithms.\n","authors":["Temesgen Mehari","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2308.15291v1.pdf","comment":"extended version of arXiv:2211.07579"},{"id":"http://arxiv.org/abs/2308.15283v1","updated":"2023-08-29T13:14:53Z","published":"2023-08-29T13:14:53Z","title":"Structural Node Embeddings with Homomorphism Counts","summary":" Graph homomorphism counts, first explored by Lov\\'asz in 1967, have recently\ngarnered interest as a powerful tool in graph-based machine learning. Grohe\n(PODS 2020) proposed the theoretical foundations for using homomorphism counts\nin machine learning on graph level as well as node level tasks. By their very\nnature, these capture local structural information, which enables the creation\nof robust structural embeddings. While a first approach for graph level tasks\nhas been made by Nguyen and Maehara (ICML 2020), we experimentally show the\neffectiveness of homomorphism count based node embeddings. Enriched with node\nlabels, node weights, and edge weights, these offer an interpretable\nrepresentation of graph data, allowing for enhanced explainability of machine\nlearning models.\n We propose a theoretical framework for isomorphism-invariant homomorphism\ncount based embeddings which lend themselves to a wide variety of downstream\ntasks. Our approach capitalises on the efficient computability of graph\nhomomorphism counts for bounded treewidth graph classes, rendering it a\npractical solution for real-world applications. We demonstrate their\nexpressivity through experiments on benchmark datasets. Although our results do\nnot match the accuracy of state-of-the-art neural architectures, they are\ncomparable to other advanced graph learning models. Remarkably, our approach\ndemarcates itself by ensuring explainability for each individual feature. By\nintegrating interpretable machine learning algorithms like SVMs or Random\nForests, we establish a seamless, end-to-end explainable pipeline. Our study\ncontributes to the advancement of graph-based techniques that offer both\nperformance and interpretability.\n","authors":["Hinrikus Wolf","Luca Oeljeklaus","Pascal Kühner","Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2308.15283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07439v2","updated":"2023-08-29T12:57:17Z","published":"2023-07-14T16:04:03Z","title":"Atlas-Based Interpretable Age Prediction In Whole-Body MR Images","summary":" Age prediction is an important part of medical assessments and research. It\ncan aid in detecting diseases as well as abnormal ageing by highlighting the\ndiscrepancy between chronological and biological age. To gain a comprehensive\nunderstanding of age-related changes observed in various body parts, we\ninvestigate them on a larger scale by using whole-body images. We utilise the\nGrad-CAM interpretability method to determine the body areas most predictive of\na person's age. We expand our analysis beyond individual subjects by employing\nregistration techniques to generate population-wide interpretability maps.\nFurthermore, we set state-of-the-art whole-body age prediction with a model\nthat achieves a mean absolute error of 2.76 years. Our findings reveal three\nprimary areas of interest: the spine, the autochthonous back muscles, and the\ncardiac region, which exhibits the highest importance.\n","authors":["Sophie Starck","Yadunandan Vivekanand Kini","Jessica Johanna Maria Ritter","Rickmer Braren","Daniel Rueckert","Tamara Mueller"],"pdf_url":"https://arxiv.org/pdf/2307.07439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10167v3","updated":"2023-08-29T12:49:55Z","published":"2023-03-17T17:54:25Z","title":"Generalized partitioned local depth","summary":" In this paper we provide a generalization of the concept of cohesion as\nintroduced recently by Berenhaut, Moore and Melvin [Proceedings of the National\nAcademy of Sciences, 119 (4) (2022)]. The formulation presented builds on the\ntechnique of partitioned local depth by distilling two key probabilistic\nconcepts: local relevance and support division. Earlier results are extended\nwithin the new context, and examples of applications to revealing communities\nin data with uncertainty are included. The work sheds light on the foundations\nof partitioned local depth, and extends the original ideas to enable\nprobabilistic consideration of uncertain, variable and potentially conflicting\ninformation.\n","authors":["Kenneth S. Berenhaut","John D. Foley","Liangdongsheng Lyu"],"pdf_url":"https://arxiv.org/pdf/2303.10167v3.pdf","comment":"Improved exposition & motivation, references added, 19 pages, 6\n figures"},{"id":"http://arxiv.org/abs/2308.15256v1","updated":"2023-08-29T12:30:53Z","published":"2023-08-29T12:30:53Z","title":"Let There Be Sound: Reconstructing High Quality Speech from Silent\n Videos","summary":" The goal of this work is to reconstruct high quality speech from lip motions\nalone, a task also known as lip-to-speech. A key challenge of lip-to-speech\nsystems is the one-to-many mapping caused by (1) the existence of homophenes\nand (2) multiple speech variations, resulting in a mispronounced and\nover-smoothed speech. In this paper, we propose a novel lip-to-speech system\nthat significantly improves the generation quality by alleviating the\none-to-many mapping problem from multiple perspectives. Specifically, we\nincorporate (1) self-supervised speech representations to disambiguate\nhomophenes, and (2) acoustic variance information to model diverse speech\nstyles. Additionally, to better solve the aforementioned problem, we employ a\nflow based post-net which captures and refines the details of the generated\nspeech. We perform extensive experiments and demonstrate that our method\nachieves the generation quality close to that of real human utterance,\noutperforming existing methods in terms of speech naturalness and\nintelligibility by a large margin. Synthesised samples are available at the\nanonymous demo page: https://mm.kaist.ac.kr/projects/LTBS.\n","authors":["Ji-Hoon Kim","Jaehun Kim","Joon Son Chung"],"pdf_url":"https://arxiv.org/pdf/2308.15256v1.pdf","comment":"10 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2208.14407v2","updated":"2023-08-29T12:21:30Z","published":"2022-08-30T17:19:26Z","title":"An Analysis of Abstracted Model-Based Reinforcement Learning","summary":" Many methods for Model-based Reinforcement learning (MBRL) in Markov decision\nprocesses (MDPs) provide guarantees for both the accuracy of the model they can\ndeliver and the learning efficiency. At the same time, state abstraction\ntechniques allow for a reduction of the size of an MDP while maintaining a\nbounded loss with respect to the original problem. Therefore, it may come as a\nsurprise that no such guarantees are available when combining both techniques,\ni.e., where MBRL merely observes abstract states. Our theoretical analysis\nshows that abstraction can introduce a dependence between samples collected\nonline (e.g., in the real world). That means that, without taking this\ndependence into account, results for MBRL do not directly extend to this\nsetting. Our result shows that we can use concentration inequalities for\nmartingales to overcome this problem. This result makes it possible to extend\nthe guarantees of existing MBRL algorithms to the setting with abstraction. We\nillustrate this by combining R-MAX, a prototypical MBRL algorithm, with\nabstraction, thus producing the first performance guarantees for model-based\n`RL from Abstracted Observations': model-based reinforcement learning with an\nabstract model.\n","authors":["Rolf A. N. Starre","Marco Loog","Elena Congeduti","Frans A. Oliehoek"],"pdf_url":"https://arxiv.org/pdf/2208.14407v2.pdf","comment":"36 pages, 2 figures, submitted to TMLR"},{"id":"http://arxiv.org/abs/2206.07785v4","updated":"2023-08-29T12:19:19Z","published":"2022-06-15T19:48:10Z","title":"Strategic Coalition for Data Pricing in IoT Data Markets","summary":" This paper considers a market for trading Internet of Things (IoT) data that\nis used to train machine learning models. The data, either raw or processed, is\nsupplied to the market platform through a network and the price of such data is\ncontrolled based on the value it brings to the machine learning model. We\nexplore the correlation property of data in a game-theoretical setting to\neventually derive a simplified distributed solution for a data trading\nmechanism that emphasizes the mutual benefit of devices and the market. The key\nproposal is an efficient algorithm for markets that jointly addresses the\nchallenges of availability and heterogeneity in participation, as well as the\ntransfer of trust and the economic value of data exchange in IoT networks. The\nproposed approach establishes the data market by reinforcing collaboration\nopportunities between device with correlated data to avoid information leakage.\nTherein, we develop a network-wide optimization problem that maximizes the\nsocial value of coalition among the IoT devices of similar data types; at the\nsame time, it minimizes the cost due to network externalities, i.e., the impact\nof information leakage due to data correlation, as well as the opportunity\ncosts. Finally, we reveal the structure of the formulated problem as a\ndistributed coalition game and solve it following the simplified\nsplit-and-merge algorithm. Simulation results show the efficacy of our proposed\nmechanism design toward a trusted IoT data market, with up to 32.72% gain in\nthe average payoff for each seller.\n","authors":["Shashi Raj Pandey","Pierre Pinson","Petar Popovski"],"pdf_url":"https://arxiv.org/pdf/2206.07785v4.pdf","comment":"15 pages. 12 figures. This paper has been accepted for publication in\n IEEE Internet of Things Journal. Copyright may change without notice"},{"id":"http://arxiv.org/abs/2308.15250v1","updated":"2023-08-29T12:16:57Z","published":"2023-08-29T12:16:57Z","title":"The Relative Gaussian Mechanism and its Application to Private Gradient\n Descent","summary":" The Gaussian Mechanism (GM), which consists in adding Gaussian noise to a\nvector-valued query before releasing it, is a standard privacy protection\nmechanism. In particular, given that the query respects some L2 sensitivity\nproperty (the L2 distance between outputs on any two neighboring inputs is\nbounded), GM guarantees R\\'enyi Differential Privacy (RDP). Unfortunately,\nprecisely bounding the L2 sensitivity can be hard, thus leading to loose\nprivacy bounds. In this work, we consider a Relative L2 sensitivity assumption,\nin which the bound on the distance between two query outputs may also depend on\ntheir norm. Leveraging this assumption, we introduce the Relative Gaussian\nMechanism (RGM), in which the variance of the noise depends on the norm of the\noutput. We prove tight bounds on the RDP parameters under relative L2\nsensitivity, and characterize the privacy loss incurred by using\noutput-dependent noise. In particular, we show that RGM naturally adapts to a\nlatent variable that would control the norm of the output. Finally, we\ninstantiate our framework to show tight guarantees for Private Gradient\nDescent, a problem that naturally fits our relative L2 sensitivity assumption.\n","authors":["Hadrien Hendrikx","Paul Mangold","Aurélien Bellet"],"pdf_url":"https://arxiv.org/pdf/2308.15250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15243v1","updated":"2023-08-29T12:09:22Z","published":"2023-08-29T12:09:22Z","title":"Reliability Gaps Between Groups in COMPAS Dataset","summary":" This paper investigates the inter-rater reliability of risk assessment\ninstruments (RAIs). The main question is whether different, socially salient\ngroups are affected differently by a lack of inter-rater reliability of RAIs,\nthat is, whether mistakes with respect to different groups affects them\ndifferently. The question is investigated with a simulation study of the COMPAS\ndataset. A controlled degree of noise is injected into the input data of a\npredictive model; the noise can be interpreted as a synthetic rater that makes\nmistakes. The main finding is that there are systematic differences in output\nreliability between groups in the COMPAS dataset. The sign of the difference\ndepends on the kind of inter-rater statistic that is used (Cohen's Kappa,\nByrt's PABAK, ICC), and in particular whether or not a correction of\npredictions prevalences of the groups is used.\n","authors":["Tim Räz"],"pdf_url":"https://arxiv.org/pdf/2308.15243v1.pdf","comment":"15 pages + appendix"},{"id":"http://arxiv.org/abs/2308.15237v1","updated":"2023-08-29T11:52:31Z","published":"2023-08-29T11:52:31Z","title":"Assessing Cyclostationary Malware Detection via Feature Selection and\n Classification","summary":" Cyclostationarity involves periodic statistical variations in signals and\nprocesses, commonly used in signal analysis and network security. In the\ncontext of attacks, cyclostationarity helps detect malicious behaviors within\nnetwork traffic, such as traffic patterns in Distributed Denial of Service\n(DDoS) attacks or hidden communication channels in malware. This approach\nenhances security by identifying abnormal patterns and informing Network\nIntrusion Detection Systems (NIDSs) to recognize potential attacks, enhancing\nprotection against both known and novel threats. This research focuses on\nidentifying cyclostationary malware behavior and its detection. The main goal\nis to pinpoint essential cyclostationary features used in NIDSs. These features\nare extracted using algorithms such as Boruta and Principal Component Analysis\n(PCA), and then categorized to find the most significant cyclostationary\npatterns. The aim of this article is to reveal periodically changing malware\nbehaviors through cyclostationarity. The study highlights the importance of\nspotting cyclostationary malware in NIDSs by using established datasets like\nKDD99, NSL-KDD, and the UGRansome dataset. The UGRansome dataset is designed\nfor anomaly detection research and includes both normal and abnormal network\nthreat categories of zero-day attacks. A comparison is made using the Random\nForest (RF) and Support Vector Machine (SVM) algorithms, while also evaluating\nthe effectiveness of Boruta and PCA. The findings show that PCA is more\npromising than using Boruta alone for extracting cyclostationary network\nfeature patterns. Additionally, the analysis identifies the internet protocol\nas the most noticeable cyclostationary feature pattern used by malware.\nNotably, the UGRansome dataset outperforms the KDD99 and NSL-KDD, achieving 99%\naccuracy in signature malware detection using the RF algorithm and 98% with the\nSVM.\n","authors":["Mike Nkongolo"],"pdf_url":"https://arxiv.org/pdf/2308.15237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05516v2","updated":"2023-08-29T11:42:29Z","published":"2023-02-10T21:40:37Z","title":"Cyclic and Randomized Stepsizes Invoke Heavier Tails in SGD than\n Constant Stepsize","summary":" Cyclic and randomized stepsizes are widely used in the deep learning practice\nand can often outperform standard stepsize choices such as constant stepsize in\nSGD. Despite their empirical success, not much is currently known about when\nand why they can theoretically improve the generalization performance. We\nconsider a general class of Markovian stepsizes for learning, which contain\ni.i.d. random stepsize, cyclic stepsize as well as the constant stepsize as\nspecial cases, and motivated by the literature which shows that heaviness of\nthe tails (measured by the so-called \"tail-index\") in the SGD iterates is\ncorrelated with generalization, we study tail-index and provide a number of\ntheoretical results that demonstrate how the tail-index varies on the stepsize\nscheduling. Our results bring a new understanding of the benefits of cyclic and\nrandomized stepsizes compared to constant stepsize in terms of the tail\nbehavior. We illustrate our theory on linear regression experiments and show\nthrough deep learning experiments that Markovian stepsizes can achieve even a\nheavier tail and be a viable alternative to cyclic and i.i.d. randomized\nstepsize rules.\n","authors":["Mert Gürbüzbalaban","Yuanhan Hu","Umut Şimşekli","Lingjiong Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.05516v2.pdf","comment":"To Appear"},{"id":"http://arxiv.org/abs/2308.15232v1","updated":"2023-08-29T11:40:24Z","published":"2023-08-29T11:40:24Z","title":"Classification-Aware Neural Topic Model Combined With Interpretable\n Analysis -- For Conflict Classification","summary":" A large number of conflict events are affecting the world all the time. In\norder to analyse such conflict events effectively, this paper presents a\nClassification-Aware Neural Topic Model (CANTM-IA) for Conflict Information\nClassification and Topic Discovery. The model provides a reliable\ninterpretation of classification results and discovered topics by introducing\ninterpretability analysis. At the same time, interpretation is introduced into\nthe model architecture to improve the classification performance of the model\nand to allow interpretation to focus further on the details of the data.\nFinally, the model architecture is optimised to reduce the complexity of the\nmodel.\n","authors":["Tianyu Liang","Yida Mu","Soonho Kim","Darline Larissa Kengne Kuate","Julie Lang","Rob Vos","Xingyi Song"],"pdf_url":"https://arxiv.org/pdf/2308.15232v1.pdf","comment":"Accepted by RANLP 2023"},{"id":"http://arxiv.org/abs/2308.15230v1","updated":"2023-08-29T11:37:33Z","published":"2023-08-29T11:37:33Z","title":"Providing Previously Unseen Users Fair Recommendations Using Variational\n Autoencoders","summary":" An emerging definition of fairness in machine learning requires that models\nare oblivious to demographic user information, e.g., a user's gender or age\nshould not influence the model. Personalized recommender systems are\nparticularly prone to violating this definition through their explicit user\nfocus and user modelling. Explicit user modelling is also an aspect that makes\nmany recommender systems incapable of providing hitherto unseen users with\nrecommendations. We propose novel approaches for mitigating discrimination in\nVariational Autoencoder-based recommender systems by limiting the encoding of\ndemographic information. The approaches are capable of, and evaluated on,\nproviding users that are not represented in the training data with fair\nrecommendations.\n","authors":["Bjørnar Vassøy","Helge Langseth","Benjamin Kille"],"pdf_url":"https://arxiv.org/pdf/2308.15230v1.pdf","comment":"Appearing in RecSys 2023 proceedings"},{"id":"http://arxiv.org/abs/2308.15223v1","updated":"2023-08-29T11:24:12Z","published":"2023-08-29T11:24:12Z","title":"Evaluating Explanation Methods for Multivariate Time Series\n Classification","summary":" Multivariate time series classification is an important computational task\narising in applications where data is recorded over time and over multiple\nchannels. For example, a smartwatch can record the acceleration and orientation\nof a person's motion, and these signals are recorded as multivariate time\nseries. We can classify this data to understand and predict human movement and\nvarious properties such as fitness levels. In many applications classification\nalone is not enough, we often need to classify but also understand what the\nmodel learns (e.g., why was a prediction given, based on what information in\nthe data). The main focus of this paper is on analysing and evaluating\nexplanation methods tailored to Multivariate Time Series Classification (MTSC).\nWe focus on saliency-based explanation methods that can point out the most\nrelevant channels and time series points for the classification decision. We\nanalyse two popular and accurate multivariate time series classifiers, ROCKET\nand dResNet, as well as two popular explanation methods, SHAP and dCAM. We\nstudy these methods on 3 synthetic datasets and 2 real-world datasets and\nprovide a quantitative and qualitative analysis of the explanations provided.\nWe find that flattening the multivariate datasets by concatenating the channels\nworks as well as using multivariate classifiers directly and adaptations of\nSHAP for MTSC work quite well. Additionally, we also find that the popular\nsynthetic datasets we used are not suitable for time series analysis.\n","authors":["Davide Italo Serramazza","Thu Trang Nguyen","Thach Le Nguyen","Georgiana Ifrim"],"pdf_url":"https://arxiv.org/pdf/2308.15223v1.pdf","comment":"Accepted at AALTD '23"},{"id":"http://arxiv.org/abs/2304.12154v2","updated":"2023-08-29T11:19:40Z","published":"2023-04-24T15:05:04Z","title":"Explainable AI Insights for Symbolic Computation: A case study on\n selecting the variable ordering for cylindrical algebraic decomposition","summary":" In recent years there has been increased use of machine learning (ML)\ntechniques within mathematics, including symbolic computation where it may be\napplied safely to optimise or select algorithms. This paper explores whether\nusing explainable AI (XAI) techniques on such ML models can offer new insight\nfor symbolic computation, inspiring new implementations within computer algebra\nsystems that do not directly call upon AI tools. We present a case study on the\nuse of ML to select the variable ordering for cylindrical algebraic\ndecomposition. It has already been demonstrated that ML can make the choice\nwell, but here we show how the SHAP tool for explainability can be used to\ninform new heuristics of a size and complexity similar to those human-designed\nheuristics currently commonly used in symbolic computation.\n","authors":["Lynn Pickering","Tereso Del Rio Almajano","Matthew England","Kelly Cohen"],"pdf_url":"https://arxiv.org/pdf/2304.12154v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2301.03364v4","updated":"2023-08-29T11:18:43Z","published":"2022-12-20T15:04:20Z","title":"Towards an AI-enabled Connected Industry: AGV Communication and Sensor\n Measurement Datasets","summary":" This paper presents two wireless measurement campaigns in industrial\ntestbeds: industrial Vehicle-to-vehicle (iV2V) and industrial\nVehicle-to-infrastructure plus Sensor (iV2I+), together with detailed\ninformation about the two captured datasets. iV2V covers sidelink communication\nscenarios between Automated Guided Vehicles (AGVs), while iV2I+ is conducted at\nan industrial setting where an autonomous cleaning robot is connected to a\nprivate cellular network. The combination of different communication\ntechnologies within a common measurement methodology provides insights that can\nbe exploited by Machine Learning (ML) for tasks such as fingerprinting,\nline-of-sight detection, prediction of quality of service or link selection.\nMoreover, the datasets are publicly available, labelled and prefiltered for\nfast on-boarding and applicability.\n","authors":["Rodrigo Hernangómez","Alexandros Palaios","Cara Watermann","Daniel Schäufele","Philipp Geuer","Rafail Ismayilov","Mohammad Parvini","Anton Krause","Martin Kasparick","Thomas Neugebauer","Oscar D. Ramos-Cantor","Hugues Tchouankem","Jose Leon Calvo","Bo Chen","Gerhard Fettweis","Sławomir Stańczak"],"pdf_url":"https://arxiv.org/pdf/2301.03364v4.pdf","comment":"7 pages, 3 figures. Submitted to a magazine. Datasets available at\n https://ieee-dataport.org/open-access/ai4mobile-industrial-wireless-datasets-iv2v-and-iv2i"},{"id":"http://arxiv.org/abs/2211.14573v3","updated":"2023-08-29T10:59:41Z","published":"2022-11-26T14:00:18Z","title":"Deep Curvilinear Editing: Commutative and Nonlinear Image Manipulation\n for Pretrained Deep Generative Model","summary":" Semantic editing of images is the fundamental goal of computer vision.\nAlthough deep learning methods, such as generative adversarial networks (GANs),\nare capable of producing high-quality images, they often do not have an\ninherent way of editing generated images semantically. Recent studies have\ninvestigated a way of manipulating the latent variable to determine the images\nto be generated. However, methods that assume linear semantic arithmetic have\ncertain limitations in terms of the quality of image editing, whereas methods\nthat discover nonlinear semantic pathways provide non-commutative editing,\nwhich is inconsistent when applied in different orders. This study proposes a\nnovel method called deep curvilinear editing (DeCurvEd) to determine semantic\ncommuting vector fields on the latent space. We theoretically demonstrate that\nowing to commutativity, the editing of multiple attributes depends only on the\nquantities and not on the order. Furthermore, we experimentally demonstrate\nthat compared to previous methods, the nonlinear and commutative nature of\nDeCurvEd facilitates the disentanglement of image attributes and provides\nhigher-quality editing.\n","authors":["Takehiro Aoshima","Takashi Matsubara"],"pdf_url":"https://arxiv.org/pdf/2211.14573v3.pdf","comment":"15 pages. The last update made no changes except for adding the\n following link to the CVF repository:\n https://openaccess.thecvf.com/content/CVPR2023/html/Aoshima_Deep_Curvilinear_Editing_Commutative_and_Nonlinear_Image_Manipulation_for_Pretrained_CVPR_2023_paper.html.\n Here, you can find our code to reproduce our results"},{"id":"http://arxiv.org/abs/2205.09862v2","updated":"2023-08-29T10:47:39Z","published":"2022-05-19T21:10:52Z","title":"Recurrent segmentation meets block models in temporal networks","summary":" A popular approach to model interactions is to represent them as a network\nwith nodes being the agents and the interactions being the edges. Interactions\nare often timestamped, which leads to having timestamped edges. Many real-world\ntemporal networks have a recurrent or possibly cyclic behaviour. For example,\nsocial network activity may be heightened during certain hours of day. In this\npaper, our main interest is to model recurrent activity in such temporal\nnetworks. As a starting point we use stochastic block model, a popular choice\nfor modelling static networks, where nodes are split into $R$ groups. We extend\nthis model to temporal networks by modelling the edges with a Poisson process.\nWe make the parameters of the process dependent on time by segmenting the time\nline into $K$ segments. To enforce the recurring activity we require that only\n$H < K$ different set of parameters can be used, that is, several, not\nnecessarily consecutive, segments must share their parameters. We prove that\nthe searching for optimal blocks and segmentation is an NP-hard problem.\nConsequently, we split the problem into 3 subproblems where we optimize blocks,\nmodel parameters, and segmentation in turn while keeping the remaining\nstructures fixed. We propose an iterative algorithm that requires $O(KHm + Rn +\nR^2H)$ time per iteration, where $n$ and $m$ are the number of nodes and edges\nin the network. We demonstrate experimentally that the number of required\niterations is typically low, the algorithm is able to discover the ground truth\nfrom synthetic datasets, and show that certain real-world networks exhibit\nrecurrent behaviour as the likelihood does not deteriorate when $H$ is lowered.\n","authors":["Chamalee Wickrama Arachchi","Nikolaj Tatti"],"pdf_url":"https://arxiv.org/pdf/2205.09862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15194v1","updated":"2023-08-29T10:21:50Z","published":"2023-08-29T10:21:50Z","title":"Ensemble of Counterfactual Explainers","summary":" In eXplainable Artificial Intelligence (XAI), several counterfactual\nexplainers have been proposed, each focusing on some desirable properties of\ncounterfactual instances: minimality, actionability, stability, diversity,\nplausibility, discriminative power. We propose an ensemble of counterfactual\nexplainers that boosts weak explainers, which provide only a subset of such\nproperties, to a powerful method covering all of them. The ensemble runs weak\nexplainers on a sample of instances and of features, and it combines their\nresults by exploiting a diversity-driven selection function. The method is\nmodel-agnostic and, through a wrapping approach based on autoencoders, it is\nalso data-agnostic.\n","authors":["Riccardo Guidotti","Salvatore Ruggieri"],"pdf_url":"https://arxiv.org/pdf/2308.15194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11705v2","updated":"2023-08-29T10:08:24Z","published":"2023-04-23T17:43:29Z","title":"Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR\n Semantic Segmentation","summary":" The ability to deploy robots that can operate safely in diverse environments\nis crucial for developing embodied intelligent agents. As a community, we have\nmade tremendous progress in within-domain LiDAR semantic segmentation. However,\ndo these methods generalize across domains? To answer this question, we design\nthe first experimental setup for studying domain generalization (DG) for LiDAR\nsemantic segmentation (DG-LSS). Our results confirm a significant gap between\nmethods, evaluated in a cross-domain setting: for example, a model trained on\nthe source dataset (SemanticKITTI) obtains $26.53$ mIoU on the target data,\ncompared to $48.49$ mIoU obtained by the model trained on the target domain\n(nuScenes). To tackle this gap, we propose the first method specifically\ndesigned for DG-LSS, which obtains $34.88$ mIoU on the target domain,\noutperforming all baselines. Our method augments a sparse-convolutional\nencoder-decoder 3D segmentation network with an additional, dense 2D\nconvolutional decoder that learns to classify a birds-eye view of the point\ncloud. This simple auxiliary task encourages the 3D network to learn features\nthat are robust to sensor placement shifts and resolution, and are transferable\nacross domains. With this work, we aim to inspire the community to develop and\nevaluate future models in such cross-domain conditions.\n","authors":["Cristiano Saltori","Aljoša Ošep","Elisa Ricci","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2304.11705v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15172v1","updated":"2023-08-29T09:54:30Z","published":"2023-08-29T09:54:30Z","title":"Is visual explanation with Grad-CAM more reliable for deeper neural\n networks? a case study with automatic pneumothorax diagnosis","summary":" While deep learning techniques have provided the state-of-the-art performance\nin various clinical tasks, explainability regarding their decision-making\nprocess can greatly enhance the credence of these methods for safer and quicker\nclinical adoption. With high flexibility, Gradient-weighted Class Activation\nMapping (Grad-CAM) has been widely adopted to offer intuitive visual\ninterpretation of various deep learning models' reasoning processes in\ncomputer-assisted diagnosis. However, despite the popularity of the technique,\nthere is still a lack of systematic study on Grad-CAM's performance on\ndifferent deep learning architectures. In this study, we investigate its\nrobustness and effectiveness across different popular deep learning models,\nwith a focus on the impact of the networks' depths and architecture types, by\nusing a case study of automatic pneumothorax diagnosis in X-ray scans. Our\nresults show that deeper neural networks do not necessarily contribute to a\nstrong improvement of pneumothorax diagnosis accuracy, and the effectiveness of\nGradCAM also varies among different network architectures.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.15172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15164v1","updated":"2023-08-29T09:46:52Z","published":"2023-08-29T09:46:52Z","title":"ABS-SGD: A Delayed Synchronous Stochastic Gradient Descent Algorithm\n with Adaptive Batch Size for Heterogeneous GPU Clusters","summary":" As the size of models and datasets grows, it has become increasingly common\nto train models in parallel. However, existing distributed stochastic gradient\ndescent (SGD) algorithms suffer from insufficient utilization of computational\nresources and poor convergence in heterogeneous clusters. In this paper, we\npropose a delayed synchronous SGD algorithm with adaptive batch size (ABS-SGD)\nfor heterogeneous GPU clusters. In ABS-SGD, workers perform global\nsynchronization to accumulate delayed gradients and use the accumulated delayed\ngradients to update parameters. While workers are performing global\nsynchronization for delayed gradients, they perform the computation of the next\nbatch without specifying batch size in advance, which lasts until the next\nglobal synchronization starts, realizing the full utilization of computational\nresources. Since the gradient delay is only one iteration, the stale gradient\nproblem can be alleviated. We theoretically prove the convergence of ABS-SGD in\nheterogeneous clusters. Extensive experiments in three types of heterogeneous\nclusters demonstrate that ABS-SGD can make full use of computational resources\nand accelerate model convergence: When training ResNet18 network with 4\nworkers, ABS-SGD increases the convergence speed by 1.30x on average compared\nwith the best baseline algorithm.\n","authors":["Xin Zhou","Ling Chen","Houming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.15164v1.pdf","comment":"15 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.15157v1","updated":"2023-08-29T09:39:12Z","published":"2023-08-29T09:39:12Z","title":"On the improvement of model-predictive controllers","summary":" This article investigates synthetic model-predictive control (MPC) problems\nto demonstrate that an increased precision of the internal prediction model\n(PM) automatially entails an improvement of the controller as a whole. In\ncontrast to reinforcement learning (RL), MPC uses the PM to predict subsequent\nstates of the controlled system (CS), instead of directly recommending suitable\nactions. To assess how the precision of the PM translates into the quality of\nthe model-predictive controller, we compare a DNN-based PM to the optimal\nbaseline PM for three well-known control problems of varying complexity. The\nbaseline PM achieves perfect accuracy by accessing the simulation of the CS\nitself. Based on the obtained results, we argue that an improvement of the PM\nwill always improve the controller as a whole, without considering the impact\nof other components such as action selection (which, in this article, relies on\nevolutionary optimization).\n","authors":["L. Féret","A. Gepperth","S. Lambeck"],"pdf_url":"https://arxiv.org/pdf/2308.15157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01055v2","updated":"2023-08-29T09:38:14Z","published":"2023-03-02T08:24:27Z","title":"Physics-informed neural networks for solving forward and inverse\n problems in complex beam systems","summary":" This paper proposes a new framework using physics-informed neural networks\n(PINNs) to simulate complex structural systems that consist of single and\ndouble beams based on Euler-Bernoulli and Timoshenko theory, where the double\nbeams are connected with a Winkler foundation. In particular, forward and\ninverse problems for the Euler-Bernoulli and Timoshenko partial differential\nequations (PDEs) are solved using nondimensional equations with the\nphysics-informed loss function. Higher-order complex beam PDEs are efficiently\nsolved for forward problems to compute the transverse displacements and\ncross-sectional rotations with less than 1e-3 percent error. Furthermore,\ninverse problems are robustly solved to determine the unknown dimensionless\nmodel parameters and applied force in the entire space-time domain, even in the\ncase of noisy data. The results suggest that PINNs are a promising strategy for\nsolving problems in engineering structures and machines involving beam systems.\n","authors":["Taniya Kapoor","Hongrui Wang","Alfredo Nunez","Rolf Dollevoet"],"pdf_url":"https://arxiv.org/pdf/2303.01055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13821v2","updated":"2023-08-29T09:31:11Z","published":"2023-08-26T09:11:44Z","title":"A Survey of Imbalanced Learning on Graphs: Problems, Techniques, and\n Future Directions","summary":" Graphs represent interconnected structures prevalent in a myriad of\nreal-world scenarios. Effective graph analytics, such as graph learning\nmethods, enables users to gain profound insights from graph data, underpinning\nvarious tasks including node classification and link prediction. However, these\nmethods often suffer from data imbalance, a common issue in graph data where\ncertain segments possess abundant data while others are scarce, thereby leading\nto biased learning outcomes. This necessitates the emerging field of imbalanced\nlearning on graphs, which aims to correct these data distribution skews for\nmore accurate and representative learning outcomes. In this survey, we embark\non a comprehensive review of the literature on imbalanced learning on graphs.\nWe begin by providing a definitive understanding of the concept and related\nterminologies, establishing a strong foundational understanding for readers.\nFollowing this, we propose two comprehensive taxonomies: (1) the problem\ntaxonomy, which describes the forms of imbalance we consider, the associated\ntasks, and potential solutions; (2) the technique taxonomy, which details key\nstrategies for addressing these imbalances, and aids readers in their method\nselection process. Finally, we suggest prospective future directions for both\nproblems and techniques within the sphere of imbalanced learning on graphs,\nfostering further innovation in this critical area.\n","authors":["Zemin Liu","Yuan Li","Nan Chen","Qian Wang","Bryan Hooi","Bingsheng He"],"pdf_url":"https://arxiv.org/pdf/2308.13821v2.pdf","comment":"The collection of awesome literature on imbalanced learning on\n graphs: https://github.com/Xtra-Computing/Awesome-Literature-ILoGs"},{"id":"http://arxiv.org/abs/2210.00991v2","updated":"2023-08-29T09:23:24Z","published":"2022-10-03T14:57:46Z","title":"Policy Gradient for Reinforcement Learning with General Utilities","summary":" In Reinforcement Learning (RL), the goal of agents is to discover an optimal\npolicy that maximizes the expected cumulative rewards. This objective may also\nbe viewed as finding a policy that optimizes a linear function of its\nstate-action occupancy measure, hereafter referred as Linear RL. However, many\nsupervised and unsupervised RL problems are not covered in the Linear RL\nframework, such as apprenticeship learning, pure exploration and variational\nintrinsic control, where the objectives are non-linear functions of the\noccupancy measures. RL with non-linear utilities looks unwieldy, as methods\nlike Bellman equation, value iteration, policy gradient, dynamic programming\nthat had tremendous success in Linear RL, fail to trivially generalize. In this\npaper, we derive the policy gradient theorem for RL with general utilities. The\npolicy gradient theorem proves to be a cornerstone in Linear RL due to its\nelegance and ease of implementability. Our policy gradient theorem for RL with\ngeneral utilities shares the same elegance and ease of implementability. Based\non the policy gradient theorem derived, we also present a simple sample-based\nalgorithm. We believe our results will be of interest to the community and\noffer inspiration to future works in this generalized setting.\n","authors":["Navdeep Kumar","Kaixin Wang","Kfir Levy","Shie Mannor"],"pdf_url":"https://arxiv.org/pdf/2210.00991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15141v1","updated":"2023-08-29T09:19:49Z","published":"2023-08-29T09:19:49Z","title":"Uncertainty Aware Training to Improve Deep Learning Model Calibration\n for Classification of Cardiac MR Images","summary":" Quantifying uncertainty of predictions has been identified as one way to\ndevelop more trustworthy artificial intelligence (AI) models beyond\nconventional reporting of performance metrics. When considering their role in a\nclinical decision support setting, AI classification models should ideally\navoid confident wrong predictions and maximise the confidence of correct\npredictions. Models that do this are said to be well-calibrated with regard to\nconfidence. However, relatively little attention has been paid to how to\nimprove calibration when training these models, i.e., to make the training\nstrategy uncertainty-aware. In this work we evaluate three novel\nuncertainty-aware training strategies comparing against two state-of-the-art\napproaches. We analyse performance on two different clinical applications:\ncardiac resynchronisation therapy (CRT) response prediction and coronary artery\ndisease (CAD) diagnosis from cardiac magnetic resonance (CMR) images. The\nbest-performing model in terms of both classification accuracy and the most\ncommon calibration measure, expected calibration error (ECE) was the Confidence\nWeight method, a novel approach that weights the loss of samples to explicitly\npenalise confident incorrect predictions. The method reduced the ECE by 17% for\nCRT response prediction and by 22% for CAD diagnosis when compared to a\nbaseline classifier in which no uncertainty-aware strategy was included. In\nboth applications, as well as reducing the ECE there was a slight increase in\naccuracy from 69% to 70% and 70% to 72% for CRT response prediction and CAD\ndiagnosis respectively. However, our analysis showed a lack of consistency in\nterms of optimal models when using different calibration measures. This\nindicates the need for careful consideration of performance metrics when\ntraining and selecting models for complex high-risk applications in healthcare.\n","authors":["Tareen Dawood","Chen Chen","Baldeep S. Sidhua","Bram Ruijsink","Justin Goulda","Bradley Porter","Mark K. Elliott","Vishal Mehta","Christopher A. Rinaldi","Esther Puyol-Anton","Reza Razavi","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2308.15141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07015v2","updated":"2023-08-29T09:17:05Z","published":"2023-06-12T10:39:57Z","title":"Combining Primal and Dual Representations in Deep Restricted Kernel\n Machines Classifiers","summary":" In the context of deep learning with kernel machines, the deep Restricted\nKernel Machine (DRKM) framework allows multiple levels of kernel PCA (KPCA) and\nLeast-Squares Support Vector Machines (LSSVM) to be combined into a deep\narchitecture using visible and hidden units. We propose a new method for DRKM\nclassification coupling the objectives of KPCA and classification levels, with\nthe hidden feature matrix lying on the Stiefel manifold. The classification\nlevel can be formulated as an LSSVM or as an MLP feature map, combining depth\nin terms of levels and layers. The classification level is expressed in its\nprimal formulation, as the deep KPCA levels, in their dual formulation, can\nembed the most informative components of the data in a much lower dimensional\nspace. The dual setting is independent of the dimension of the inputs and the\nprimal setting is parametric, which makes the proposed method computationally\nefficient for both high-dimensional inputs and large datasets. In the\nexperiments, we show that our developed algorithm can effectively learn from\nsmall datasets, while using less memory than the convolutional neural network\n(CNN) with high-dimensional data. and that models with multiple KPCA levels can\noutperform models with a single level. On the tested larger-scale datasets,\nDRKM is more energy efficient than CNN while maintaining comparable\nperformance.\n","authors":["Francesco Tonin","Panagiotis Patrinos","Johan A. K. Suykens"],"pdf_url":"https://arxiv.org/pdf/2306.07015v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15132v1","updated":"2023-08-29T08:57:47Z","published":"2023-08-29T08:57:47Z","title":"Biquality Learning: a Framework to Design Algorithms Dealing with\n Closed-Set Distribution Shifts","summary":" Training machine learning models from data with weak supervision and dataset\nshifts is still challenging. Designing algorithms when these two situations\narise has not been explored much, and existing algorithms cannot always handle\nthe most complex distributional shifts. We think the biquality data setup is a\nsuitable framework for designing such algorithms. Biquality Learning assumes\nthat two datasets are available at training time: a trusted dataset sampled\nfrom the distribution of interest and the untrusted dataset with dataset shifts\nand weaknesses of supervision (aka distribution shifts). The trusted and\nuntrusted datasets available at training time make designing algorithms dealing\nwith any distribution shifts possible. We propose two methods, one inspired by\nthe label noise literature and another by the covariate shift literature for\nbiquality learning. We experiment with two novel methods to synthetically\nintroduce concept drift and class-conditional shifts in real-world datasets\nacross many of them. We opened some discussions and assessed that developing\nbiquality learning algorithms robust to distributional changes remains an\ninteresting problem for future research.\n","authors":["Pierre Nodet","Vincent Lemaire","Alexis Bondu","Antoine Cornuéjols"],"pdf_url":"https://arxiv.org/pdf/2308.15132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15126v1","updated":"2023-08-29T08:51:24Z","published":"2023-08-29T08:51:24Z","title":"Evaluation and Analysis of Hallucination in Large Vision-Language Models","summary":" Large Vision-Language Models (LVLMs) have recently achieved remarkable\nsuccess. However, LVLMs are still plagued by the hallucination problem, which\nlimits the practicality in many scenarios. Hallucination refers to the\ninformation of LVLMs' responses that does not exist in the visual input, which\nposes potential risks of substantial consequences. There has been limited work\nstudying hallucination evaluation in LVLMs. In this paper, we propose\nHallucination Evaluation based on Large Language Models (HaELM), an LLM-based\nhallucination evaluation framework. HaELM achieves an approximate 95%\nperformance comparable to ChatGPT and has additional advantages including low\ncost, reproducibility, privacy preservation and local deployment. Leveraging\nthe HaELM, we evaluate the hallucination in current LVLMs. Furthermore, we\nanalyze the factors contributing to hallucination in LVLMs and offer helpful\nsuggestions to mitigate the hallucination problem. Our training data and human\nannotation hallucination data will be made public soon.\n","authors":["Junyang Wang","Yiyang Zhou","Guohai Xu","Pengcheng Shi","Chenlin Zhao","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Jihua Zhu","Jitao Sang","Haoyu Tang"],"pdf_url":"https://arxiv.org/pdf/2308.15126v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2210.13004v2","updated":"2023-08-29T08:35:07Z","published":"2022-10-24T07:50:02Z","title":"Efficient Representation of Natural Image Patches","summary":" In the complex domain of neural information processing, discerning\nfundamental principles from ancillary details remains a significant challenge.\nWhile there is extensive knowledge about the anatomy and physiology of the\nearly visual system, a comprehensive computational theory remains elusive. Can\nwe gain insights into the underlying principles of a biological system by\nabstracting away from its detailed implementation and focusing on the\nfundamental problems that the system is designed to solve? Utilizing an\nabstract model based on minimal yet realistic assumptions, we show how to\nachieve the early visual system's two ultimate objectives: efficient\ninformation transmission and sensor probability distribution modeling. We show\nthat optimizing for information transmission does not yield optimal probability\ndistribution modeling. We illustrate, using a two-pixel (2D) system and image\npatches, that an efficient representation can be realized via nonlinear\npopulation code driven by two types of biologically plausible loss functions\nthat depend solely on output. After unsupervised learning, our abstract IPU\nmodel bears remarkable resemblances to biological systems, despite not\nmimicking many features of real neurons, such as spiking activity. A\npreliminary comparison with a contemporary deep learning model suggests that\nthe IPU model offers a significant efficiency advantage. Our model provides\nnovel insights into the computational theory of early visual systems as well as\na potential new approach to enhance the efficiency of deep learning models.\n","authors":["Cheng Guo"],"pdf_url":"https://arxiv.org/pdf/2210.13004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15116v1","updated":"2023-08-29T08:29:08Z","published":"2023-08-29T08:29:08Z","title":"Mixup-Augmented Meta-Learning for Sample-Efficient Fine-Tuning of\n Protein Simulators","summary":" Molecular dynamics simulations have emerged as a fundamental instrument for\nstudying biomolecules. At the same time, it is desirable to perform simulations\nof a collection of particles under various conditions in which the molecules\ncan fluctuate. In this paper, we explore and adapt the soft prompt-based\nlearning method to molecular dynamics tasks. Our model can remarkably\ngeneralize to unseen and out-of-distribution scenarios with limited training\ndata. While our work focuses on temperature as a test case, the versatility of\nour approach allows for efficient simulation through any continuous dynamic\nconditions, such as pressure and volumes. Our framework has two stages: 1)\nPre-trains with data mixing technique, augments molecular structure data and\ntemperature prompts, then applies a curriculum learning method by increasing\nthe ratio of them smoothly. 2) Meta-learning-based fine-tuning framework\nimproves sample-efficiency of fine-tuning process and gives the soft\nprompt-tuning better initialization points. Comprehensive experiments reveal\nthat our framework excels in accuracy for in-domain data and demonstrates\nstrong generalization capabilities for unseen and out-of-distribution samples.\n","authors":["Jingbang Chen","Yian Wang","Xingwei Qu","Shuangjia Zheng","Yaodong Yang","Hao Dong","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2308.15116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15107v1","updated":"2023-08-29T08:14:19Z","published":"2023-08-29T08:14:19Z","title":"Stochastic Graph Bandit Learning with Side-Observations","summary":" In this paper, we investigate the stochastic contextual bandit with general\nfunction space and graph feedback. We propose an algorithm that addresses this\nproblem by adapting to both the underlying graph structures and reward gaps. To\nthe best of our knowledge, our algorithm is the first to provide a\ngap-dependent upper bound in this stochastic setting, bridging the research gap\nleft by the work in [35]. In comparison to [31,33,35], our method offers\nimproved regret upper bounds and does not require knowledge of graphical\nquantities. We conduct numerical experiments to demonstrate the computational\nefficiency and effectiveness of our approach in terms of regret upper bounds.\nThese findings highlight the significance of our algorithm in advancing the\nfield of stochastic contextual bandits with graph feedback, opening up avenues\nfor practical applications in various domains.\n","authors":["Xueping Gong","Jiheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.15107v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2010.03104 by other authors"},{"id":"http://arxiv.org/abs/2201.01079v5","updated":"2023-08-29T08:10:29Z","published":"2022-01-04T10:49:30Z","title":"Incomplete Multi-View Weak-Label Learning with Noisy Features and\n Imbalanced Labels","summary":" A variety of modern applications exhibit multi-view multi-label learning,\nwhere each sample has multi-view features, and multiple labels are correlated\nvia common views. Current methods usually fail to directly deal with the\nsetting where only a subset of features and labels are observed for each\nsample, and ignore the presence of noisy views and imbalanced labels in\nreal-world problems. In this paper, we propose a novel method to overcome the\nlimitations. It jointly embeds incomplete views and weak labels into a\nlow-dimensional subspace with adaptive weights, and facilitates the difference\nbetween embedding weight matrices via auto-weighted Hilbert-Schmidt\nIndependence Criterion (HSIC) to reduce the redundancy. Moreover, it adaptively\nlearns view-wise importance for embedding to detect noisy views, and mitigates\nthe label imbalance problem by focal loss. Experimental results on four\nreal-world multi-view multi-label datasets demonstrate the effectiveness of the\nproposed method.\n","authors":["Zhiwei Li","Zijian Yang","Lu Sun","Mineichi Kudo","Kego Kimura"],"pdf_url":"https://arxiv.org/pdf/2201.01079v5.pdf","comment":"6 pages, 2 figures, conference"},{"id":"http://arxiv.org/abs/2308.15096v1","updated":"2023-08-29T08:04:45Z","published":"2023-08-29T08:04:45Z","title":"How Faithful are Self-Explainable GNNs?","summary":" Self-explainable deep neural networks are a recent class of models that can\noutput ante-hoc local explanations that are faithful to the model's reasoning,\nand as such represent a step forward toward filling the gap between\nexpressiveness and interpretability. Self-explainable graph neural networks\n(GNNs) aim at achieving the same in the context of graph data. This begs the\nquestion: do these models fulfill their implicit guarantees in terms of\nfaithfulness? In this extended abstract, we analyze the faithfulness of several\nself-explainable GNNs using different measures of faithfulness, identify\nseveral limitations -- both in the models themselves and in the evaluation\nmetrics -- and outline possible ways forward.\n","authors":["Marc Christiansen","Lea Villadsen","Zhiqiang Zhong","Stefano Teso","Davide Mottin"],"pdf_url":"https://arxiv.org/pdf/2308.15096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15094v1","updated":"2023-08-29T08:02:41Z","published":"2023-08-29T08:02:41Z","title":"Group-Conditional Conformal Prediction via Quantile Regression\n Calibration for Crop and Weed Classification","summary":" As deep learning predictive models become an integral part of a large\nspectrum of precision agricultural systems, a barrier to the adoption of such\nautomated solutions is the lack of user trust in these highly complex, opaque\nand uncertain models. Indeed, deep neural networks are not equipped with any\nexplicit guarantees that can be used to certify the system's performance,\nespecially in highly varying uncontrolled environments such as the ones\ntypically faced in computer vision for agriculture.Fortunately, certain methods\ndeveloped in other communities can prove to be important for agricultural\napplications. This article presents the conformal prediction framework that\nprovides valid statistical guarantees on the predictive performance of any\nblack box prediction machine, with almost no assumptions, applied to the\nproblem of deep visual classification of weeds and crops in real-world\nconditions. The framework is exposed with a focus on its practical aspects and\nspecial attention accorded to the Adaptive Prediction Sets (APS) approach that\ndelivers marginal guarantees on the model's coverage. Marginal results are then\nshown to be insufficient to guarantee performance on all groups of individuals\nin the population as characterized by their environmental and pedo-climatic\nauxiliary data gathered during image acquisition.To tackle this shortcoming,\ngroup-conditional conformal approaches are presented: the ''classical'' method\nthat consists of iteratively applying the APS procedure on all groups, and a\nproposed elegant reformulation and implementation of the procedure using\nquantile regression on group membership indicators. Empirical results showing\nthe validity of the proposed approach are presented and compared to the\nmarginal APS then discussed.\n","authors":["Paul Melki","Lionel Bombrun","Boubacar Diallo","Jérôme Dias","Jean-Pierre da Costa"],"pdf_url":"https://arxiv.org/pdf/2308.15094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15092v1","updated":"2023-08-29T07:58:19Z","published":"2023-08-29T07:58:19Z","title":"Can We Rely on AI?","summary":" Over the last decade, adversarial attack algorithms have revealed\ninstabilities in deep learning tools. These algorithms raise issues regarding\nsafety, reliability and interpretability in artificial intelligence; especially\nin high risk settings. From a practical perspective, there has been a war of\nescalation between those developing attack and defence strategies. At a more\ntheoretical level, researchers have also studied bigger picture questions\nconcerning the existence and computability of attacks. Here we give a brief\noverview of the topic, focusing on aspects that are likely to be of interest to\nresearchers in applied and computational mathematics.\n","authors":["Desmond J. Higham"],"pdf_url":"https://arxiv.org/pdf/2308.15092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15088v1","updated":"2023-08-29T07:51:36Z","published":"2023-08-29T07:51:36Z","title":"Using deep learning for an automatic detection and classification of the\n vascular bifurcations along the Circle of Willis","summary":" Most of the intracranial aneurysms (ICA) occur on a specific portion of the\ncerebral vascular tree named the Circle of Willis (CoW). More particularly,\nthey mainly arise onto fifteen of the major arterial bifurcations constituting\nthis circular structure. Hence, for an efficient and timely diagnosis it is\ncritical to develop some methods being able to accurately recognize each\nBifurcation of Interest (BoI). Indeed, an automatic extraction of the\nbifurcations presenting the higher risk of developing an ICA would offer the\nneuroradiologists a quick glance at the most alarming areas. Due to the recent\nefforts on Artificial Intelligence, Deep Learning turned out to be the best\nperforming technology for many pattern recognition tasks. Moreover, various\nmethods have been particularly designed for medical image analysis purposes.\nThis study intends to assist the neuroradiologists to promptly locate any\nbifurcation presenting a high risk of ICA occurrence. It can be seen as a\nComputer Aided Diagnosis scheme, where the Artificial Intelligence facilitates\nthe access to the regions of interest within the MRI. In this work, we propose\na method for a fully automatic detection and recognition of the bifurcations of\ninterest forming the Circle of Willis. Several neural networks architectures\nhave been tested, and we thoroughly evaluate the bifurcation recognition rate.\n","authors":["Rafic Nader","Romain Bourcier","Florent Autrusseau"],"pdf_url":"https://arxiv.org/pdf/2308.15088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10398v2","updated":"2023-08-29T07:33:15Z","published":"2023-04-20T15:34:20Z","title":"Multi-label Node Classification On Graph-Structured Data","summary":" Graph Neural Networks (GNNs) have shown state-of-the-art improvements in node\nclassification tasks on graphs. While these improvements have been largely\ndemonstrated in a multi-class classification scenario, a more general and\nrealistic scenario in which each node could have multiple labels has so far\nreceived little attention. The first challenge in conducting focused studies on\nmulti-label node classification is the limited number of publicly available\nmulti-label graph datasets. Therefore, as our first contribution, we collect\nand release three real-world biological datasets and develop a multi-label\ngraph generator to generate datasets with tunable properties. While high label\nsimilarity (high homophily) is usually attributed to the success of GNNs, we\nargue that a multi-label scenario does not follow the usual semantics of\nhomophily and heterophily so far defined for a multi-class scenario. As our\nsecond contribution, besides defining homophily for the multi-label scenario,\nwe develop a new approach that dynamically fuses the feature and label\ncorrelation information to learn label-informed representations. Finally, we\nperform a large-scale comparative study with $10$ methods and $9$ datasets\nwhich also showcase the effectiveness of our approach. We release our benchmark\nat \\url{https://anonymous.4open.science/r/LFLF-5D8C/}.\n","authors":["Tianqi Zhao","Ngan Thi Dong","Alan Hanjalic","Megha Khosla"],"pdf_url":"https://arxiv.org/pdf/2304.10398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15074v1","updated":"2023-08-29T07:15:57Z","published":"2023-08-29T07:15:57Z","title":"Exploring Model Transferability through the Lens of Potential Energy","summary":" Transfer learning has become crucial in computer vision tasks due to the vast\navailability of pre-trained deep learning models. However, selecting the\noptimal pre-trained model from a diverse pool for a specific downstream task\nremains a challenge. Existing methods for measuring the transferability of\npre-trained models rely on statistical correlations between encoded static\nfeatures and task labels, but they overlook the impact of underlying\nrepresentation dynamics during fine-tuning, leading to unreliable results,\nespecially for self-supervised models. In this paper, we present an insightful\nphysics-inspired approach named PED to address these challenges. We reframe the\nchallenge of model selection through the lens of potential energy and directly\nmodel the interaction forces that influence fine-tuning dynamics. By capturing\nthe motion of dynamic representations to decline the potential energy within a\nforce-driven physical model, we can acquire an enhanced and more stable\nobservation for estimating transferability. The experimental results on 10\ndownstream tasks and 12 self-supervised models demonstrate that our approach\ncan seamlessly integrate into existing ranking techniques and enhance their\nperformances, revealing its effectiveness for the model selection task and its\npotential for understanding the mechanism in transfer learning. Code will be\navailable at https://github.com/lixiaotong97/PED.\n","authors":["Xiaotong Li","Zixuan Hu","Yixiao Ge","Ying Shan","Ling-Yu Duan"],"pdf_url":"https://arxiv.org/pdf/2308.15074v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15072v1","updated":"2023-08-29T07:13:31Z","published":"2023-08-29T07:13:31Z","title":"Advancing Adversarial Robustness Through Adversarial Logit Update","summary":" Deep Neural Networks are susceptible to adversarial perturbations.\nAdversarial training and adversarial purification are among the most widely\nrecognized defense strategies. Although these methods have different underlying\nlogic, both rely on absolute logit values to generate label predictions. In\nthis study, we theoretically analyze the logit difference around successful\nadversarial attacks from a theoretical point of view and propose a new\nprinciple, namely Adversarial Logit Update (ALU), to infer adversarial sample's\nlabels. Based on ALU, we introduce a new classification paradigm that utilizes\npre- and post-purification logit differences for model's adversarial robustness\nboost. Without requiring adversarial or additional data for model training, our\nclean data synthesis model can be easily applied to various pre-trained models\nfor both adversarial sample detection and ALU-based data classification.\nExtensive experiments on both CIFAR-10, CIFAR-100, and tiny-ImageNet datasets\nshow that even with simple components, the proposed solution achieves superior\nrobustness performance compared to state-of-the-art methods against a wide\nrange of adversarial attacks. Our python implementation is submitted in our\nSupplementary document and will be published upon the paper's acceptance.\n","authors":["Hao Xuan","Peican Zhu","Xingyu Li"],"pdf_url":"https://arxiv.org/pdf/2308.15072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06815v2","updated":"2023-08-29T07:09:16Z","published":"2023-04-13T20:49:35Z","title":"Improving Few-Shot Prompts with Relevant Static Analysis Products","summary":" Large Language Models (LLM) are a new class of computation engines,\n\"programmed\" via prompt engineering. We are still learning how to best\n\"program\" these LLMs to help developers. We start with the intuition that\ndevelopers tend to consciously and unconsciously have a collection of semantics\nfacts in mind when working on coding tasks. Mostly these are shallow, simple\nfacts arising from a quick read. For a function, examples of facts might\ninclude parameter and local variable names, return expressions, simple pre- and\npost-conditions, and basic control and data flow, etc.\n One might assume that the powerful multi-layer architecture of\ntransformer-style LLMs makes them inherently capable of doing this simple level\nof \"code analysis\" and extracting such information, implicitly, while\nprocessing code: but are they, really? If they aren't, could explicitly adding\nthis information help? Our goal here is to investigate this question, using the\ncode summarization task and evaluate whether automatically augmenting an LLM's\nprompt with semantic facts explicitly, actually helps.\n Prior work shows that LLM performance on code summarization benefits from\nfew-shot samples drawn either from the same-project or from examples found via\ninformation retrieval methods (such as BM25). While summarization performance\nhas steadily increased since the early days, there is still room for\nimprovement: LLM performance on code summarization still lags its performance\non natural-language tasks like translation and text summarization.\n We find that adding semantic facts actually does help! This approach improves\nperformance in several different settings suggested by prior work, including\nfor two different Large Language Models. In most cases, improvement nears or\nexceeds 2 BLEU; for the PHP language in the challenging CodeSearchNet dataset,\nthis augmentation actually yields performance surpassing 30 BLEU.\n","authors":["Toufique Ahmed","Kunal Suresh Pai","Premkumar Devanbu","Earl T. Barr"],"pdf_url":"https://arxiv.org/pdf/2304.06815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15069v1","updated":"2023-08-29T07:04:50Z","published":"2023-08-29T07:04:50Z","title":"MadSGM: Multivariate Anomaly Detection with Score-based Generative\n Models","summary":" The time-series anomaly detection is one of the most fundamental tasks for\ntime-series. Unlike the time-series forecasting and classification, the\ntime-series anomaly detection typically requires unsupervised (or\nself-supervised) training since collecting and labeling anomalous observations\nare difficult. In addition, most existing methods resort to limited forms of\nanomaly measurements and therefore, it is not clear whether they are optimal in\nall circumstances. To this end, we present a multivariate time-series anomaly\ndetector based on score-based generative models, called MadSGM, which considers\nthe broadest ever set of anomaly measurement factors: i) reconstruction-based,\nii) density-based, and iii) gradient-based anomaly measurements. We also design\na conditional score network and its denoising score matching loss for the\ntime-series anomaly detection. Experiments on five real-world benchmark\ndatasets illustrate that MadSGM achieves the most robust and accurate\npredictions.\n","authors":["Haksoo Lim","Sewon Park","Minjung Kim","Jaehoon Lee","Seonkyu Lim","Noseong Park"],"pdf_url":"https://arxiv.org/pdf/2308.15069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15059v1","updated":"2023-08-29T06:43:29Z","published":"2023-08-29T06:43:29Z","title":"OEBench: Investigating Open Environment Challenges in Real-World\n Relational Data Streams","summary":" Relational datasets are widespread in real-world scenarios and are usually\ndelivered in a streaming fashion. This type of data stream can present unique\nchallenges, such as distribution drifts, outliers, emerging classes, and\nchanging features, which have recently been described as open environment\nchallenges for machine learning. While some work has been done on incremental\nlearning for data streams, their evaluations are mostly conducted with manually\npartitioned datasets. Moreover, while several real-world streaming datasets are\navailable, it is uncertain whether these open environment challenges are\nprevalent and how existing incremental learning algorithms perform on real\ndatasets. To fill this gap, we develop an Open Environment Benchmark named\nOEBench to evaluate open environment challenges in relational data streams.\nSpecifically, we investigate 55 real-world streaming datasets and establish\nthat open environment scenarios are indeed widespread in real-world datasets,\nwhich presents significant challenges for stream learning algorithms. Through\nbenchmarks, we find that increased data quantity may not consistently enhance\nthe model accuracy when applied in open environment scenarios, where machine\nlearning models can be significantly compromised by distribution shifts,\nanomalies, or untrustworthy data within real-world data streams. The current\ntechniques are insufficient in effectively mitigating these challenges posed by\nopen environments. Thus, it is promising to conduct more researches to address\nreal-world new challenges of open environment scenarios.\n","authors":["Yiqun Diao","Yutong Yang","Qinbin Li","Bingsheng He","Mian Lu"],"pdf_url":"https://arxiv.org/pdf/2308.15059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.06714v4","updated":"2023-08-29T06:41:58Z","published":"2022-01-18T03:13:19Z","title":"AdaTerm: Adaptive T-Distribution Estimated Robust Moments for\n Noise-Robust Stochastic Gradient Optimization","summary":" With the increasing practicality of deep learning applications, practitioners\nare inevitably faced with datasets corrupted by noise from various sources such\nas measurement errors, mislabeling, and estimated surrogate inputs/outputs that\ncan adversely impact the optimization results. It is a common practice to\nimprove the optimization algorithm's robustness to noise, since this algorithm\nis ultimately in charge of updating the network parameters. Previous studies\nrevealed that the first-order moment used in Adam-like stochastic gradient\ndescent optimizers can be modified based on the Student's t-distribution. While\nthis modification led to noise-resistant updates, the other associated\nstatistics remained unchanged, resulting in inconsistencies in the assumed\nmodels. In this paper, we propose AdaTerm, a novel approach that incorporates\nthe Student's t-distribution to derive not only the first-order moment but also\nall the associated statistics. This provides a unified treatment of the\noptimization process, offering a comprehensive framework under the statistical\nmodel of the t-distribution for the first time. The proposed approach offers\nseveral advantages over previously proposed approaches, including reduced\nhyperparameters and improved robustness and adaptability. This noise-adaptive\nbehavior contributes to AdaTerm's exceptional learning performance, as\ndemonstrated through various optimization problems with different and/or\nunknown noise ratios. Furthermore, we introduce a new technique for deriving a\ntheoretical regret bound without relying on AMSGrad, providing a valuable\ncontribution to the field\n","authors":["Wendyam Eric Lionel Ilboudo","Taisuke Kobayashi","Takamitsu Matsubara"],"pdf_url":"https://arxiv.org/pdf/2201.06714v4.pdf","comment":"27 pages; Final version accepted by Elsevier Neurocomputing Journal\n (2023-08; https://doi.org/10.1016/j.neucom.2023.126692)"},{"id":"http://arxiv.org/abs/2011.01710v3","updated":"2023-08-29T06:39:04Z","published":"2020-11-03T13:54:01Z","title":"Ballistocardiogram artifact removal in simultaneous EEG-fMRI using\n generative adversarial network","summary":" Due to its advantages of high temporal and spatial resolution, the technology\nof simultaneous electroencephalogram-functional magnetic resonance imaging\n(EEG-fMRI) acquisition and analysis has attracted much attention, and has been\nwidely used in various research fields of brain science. However, during the\nfMRI of the brain, ballistocardiogram (BCG) artifacts can seriously contaminate\nthe EEG. As an unpaired problem, BCG artifact removal now remains a\nconsiderable challenge. Aiming to provide a solution, this paper proposed a\nnovel modular generative adversarial network (GAN) and corresponding training\nstrategy to improve the network performance by optimizing the parameters of\neach module. In this manner, we hope to improve the local representation\nability of the network model, thereby improving its overall performance and\nobtaining a reliable generator for BCG artifact removal. Moreover, the proposed\nmethod does not rely on additional reference signal or complex hardware\nequipment. Experimental results show that, compared with multiple methods, the\ntechnique presented in this paper can remove the BCG artifact more effectively\nwhile retaining essential EEG information.\n","authors":["Guang Lin","Jianhai Zhang","Yuxi Liu","Tianyang Gao","Wanzeng Kong","Xu Lei","Tao Qiu"],"pdf_url":"https://arxiv.org/pdf/2011.01710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15055v1","updated":"2023-08-29T06:31:21Z","published":"2023-08-29T06:31:21Z","title":"Taxonomic Loss for Morphological Glossing of Low-Resource Languages","summary":" Morpheme glossing is a critical task in automated language documentation and\ncan benefit other downstream applications greatly. While state-of-the-art\nglossing systems perform very well for languages with large amounts of existing\ndata, it is more difficult to create useful models for low-resource languages.\nIn this paper, we propose the use of a taxonomic loss function that exploits\nmorphological information to make morphological glossing more performant when\ndata is scarce. We find that while the use of this loss function does not\noutperform a standard loss function with regards to single-label prediction\naccuracy, it produces better predictions when considering the top-n predicted\nlabels. We suggest this property makes the taxonomic loss function useful in a\nhuman-in-the-loop annotation setting.\n","authors":["Michael Ginn","Alexis Palmer"],"pdf_url":"https://arxiv.org/pdf/2308.15055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06564v2","updated":"2023-08-29T06:25:48Z","published":"2023-08-12T13:17:09Z","title":"EquiDiff: A Conditional Equivariant Diffusion Model For Trajectory\n Prediction","summary":" Accurate trajectory prediction is crucial for the safe and efficient\noperation of autonomous vehicles. The growing popularity of deep learning has\nled to the development of numerous methods for trajectory prediction. While\ndeterministic deep learning models have been widely used, deep generative\nmodels have gained popularity as they learn data distributions from training\ndata and account for trajectory uncertainties. In this study, we propose\nEquiDiff, a deep generative model for predicting future vehicle trajectories.\nEquiDiff is based on the conditional diffusion model, which generates future\ntrajectories by incorporating historical information and random Gaussian noise.\nThe backbone model of EquiDiff is an SO(2)-equivariant transformer that fully\nutilizes the geometric properties of location coordinates. In addition, we\nemploy Recurrent Neural Networks and Graph Attention Networks to extract social\ninteractions from historical trajectories. To evaluate the performance of\nEquiDiff, we conduct extensive experiments on the NGSIM dataset. Our results\ndemonstrate that EquiDiff outperforms other baseline models in short-term\nprediction, but has slightly higher errors for long-term prediction.\nFurthermore, we conduct an ablation study to investigate the contribution of\neach component of EquiDiff to the prediction accuracy. Additionally, we present\na visualization of the generation process of our diffusion model, providing\ninsights into the uncertainty of the prediction.\n","authors":["Kehua Chen","Xianda Chen","Zihan Yu","Meixin Zhu","Hai Yang"],"pdf_url":"https://arxiv.org/pdf/2308.06564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15050v1","updated":"2023-08-29T06:20:36Z","published":"2023-08-29T06:20:36Z","title":"iBARLE: imBalance-Aware Room Layout Estimation","summary":" Room layout estimation predicts layouts from a single panorama. It requires\ndatasets with large-scale and diverse room shapes to train the models. However,\nthere are significant imbalances in real-world datasets including the\ndimensions of layout complexity, camera locations, and variation in scene\nappearance. These issues considerably influence the model training performance.\nIn this work, we propose the imBalance-Aware Room Layout Estimation (iBARLE)\nframework to address these issues. iBARLE consists of (1) Appearance Variation\nGeneration (AVG) module, which promotes visual appearance domain\ngeneralization, (2) Complex Structure Mix-up (CSMix) module, which enhances\ngeneralizability w.r.t. room structure, and (3) a gradient-based layout\nobjective function, which allows more effective accounting for occlusions in\ncomplex layouts. All modules are jointly trained and help each other to achieve\nthe best performance. Experiments and ablation studies based on\nZInD~\\cite{cruz2021zillow} dataset illustrate that iBARLE has state-of-the-art\nperformance compared with other layout estimation baselines.\n","authors":["Taotao Jing","Lichen Wang","Naji Khosravan","Zhiqiang Wan","Zachary Bessinger","Zhengming Ding","Sing Bing Kang"],"pdf_url":"https://arxiv.org/pdf/2308.15050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09121v2","updated":"2023-08-29T06:11:58Z","published":"2023-05-16T03:00:04Z","title":"A Conditional Denoising Diffusion Probabilistic Model for Radio\n Interferometric Image Reconstruction","summary":" In radio astronomy, signals from radio telescopes are transformed into images\nof observed celestial objects, or sources. However, these images, called dirty\nimages, contain real sources as well as artifacts due to signal sparsity and\nother factors. Therefore, radio interferometric image reconstruction is\nperformed on dirty images, aiming to produce clean images in which artifacts\nare reduced and real sources are recovered. So far, existing methods have\nlimited success on recovering faint sources, preserving detailed structures,\nand eliminating artifacts. In this paper, we present VIC-DDPM, a Visibility and\nImage Conditioned Denoising Diffusion Probabilistic Model. Our main idea is to\nuse both the original visibility data in the spectral domain and dirty images\nin the spatial domain to guide the image generation process with DDPM. This\nway, we can leverage DDPM to generate fine details and eliminate noise, while\nutilizing visibility data to separate signals from noise and retaining spatial\ninformation in dirty images. We have conducted experiments in comparison with\nboth traditional methods and recent deep learning based approaches. Our results\nshow that our method significantly improves the resulting images by reducing\nartifacts, preserving fine details, and recovering dim sources. This\nadvancement further facilitates radio astronomical data analysis tasks on\ncelestial phenomena.\n","authors":["Ruoqi Wang","Zhuoyang Chen","Qiong Luo","Feng Wang"],"pdf_url":"https://arxiv.org/pdf/2305.09121v2.pdf","comment":"Accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2308.15047v1","updated":"2023-08-29T06:09:47Z","published":"2023-08-29T06:09:47Z","title":"Large language models converge toward human-like concept organization","summary":" Large language models show human-like performance in knowledge extraction,\nreasoning and dialogue, but it remains controversial whether this performance\nis best explained by memorization and pattern matching, or whether it reflects\nhuman-like inferential semantics and world knowledge. Knowledge bases such as\nWikiData provide large-scale, high-quality representations of inferential\nsemantics and world knowledge. We show that large language models learn to\norganize concepts in ways that are strikingly similar to how concepts are\norganized in such knowledge bases. Knowledge bases model collective,\ninstitutional knowledge, and large language models seem to induce such\nknowledge from raw text. We show that bigger and better models exhibit more\nhuman-like concept organization, across four families of language models and\nthree knowledge graph embeddings.\n","authors":["Mathias Lykke Gammelgaard","Jonathan Gabel Christiansen","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2308.15047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10875v2","updated":"2023-08-29T05:42:49Z","published":"2023-07-20T13:47:30Z","title":"Risk-optimized Outlier Removal for Robust Point Cloud Classification","summary":" With the growth of 3D sensing technology, deep learning system for 3D point\nclouds has become increasingly important, especially in applications like\nautonomous vehicles where safety is a primary concern. However, there are also\ngrowing concerns about the reliability of these systems when they encounter\nnoisy point clouds, whether occurring naturally or introduced with malicious\nintent. This paper highlights the challenges of point cloud classification\nposed by various forms of noise, from simple background noise to malicious\nbackdoor attacks that can intentionally skew model predictions. While there's\nan urgent need for optimized point cloud denoising, current point outlier\nremoval approaches, an essential step for denoising, rely heavily on\nhandcrafted strategies and are not adapted for higher-level tasks, such as\nclassification. To address this issue, we introduce an innovative point outlier\ncleansing method that harnesses the power of downstream classification models.\nBy employing gradient-based attribution analysis, we define a novel concept:\npoint risk. Drawing inspiration from tail risk minimization in finance, we\nrecast the outlier removal process as an optimization problem, named PointCVaR.\nExtensive experiments show that our proposed technique not only robustly\nfilters diverse point cloud outliers but also consistently and significantly\nenhances existing robust methods for point cloud classification.\n","authors":["Xinke Li","Junchi Lu","Henghui Ding","Changsheng Sun","Joey Tianyi Zhou","Chee Yeow Meng"],"pdf_url":"https://arxiv.org/pdf/2307.10875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12095v5","updated":"2023-08-29T05:34:25Z","published":"2023-02-22T11:01:20Z","title":"On the Robustness of ChatGPT: An Adversarial and Out-of-distribution\n Perspective","summary":" ChatGPT is a recent chatbot service released by OpenAI and is receiving\nincreasing attention over the past few months. While evaluations of various\naspects of ChatGPT have been done, its robustness, i.e., the performance to\nunexpected inputs, is still unclear to the public. Robustness is of particular\nconcern in responsible AI, especially for safety-critical applications. In this\npaper, we conduct a thorough evaluation of the robustness of ChatGPT from the\nadversarial and out-of-distribution (OOD) perspective. To do so, we employ the\nAdvGLUE and ANLI benchmarks to assess adversarial robustness and the Flipkart\nreview and DDXPlus medical diagnosis datasets for OOD evaluation. We select\nseveral popular foundation models as baselines. Results show that ChatGPT shows\nconsistent advantages on most adversarial and OOD classification and\ntranslation tasks. However, the absolute performance is far from perfection,\nwhich suggests that adversarial and OOD robustness remains a significant threat\nto foundation models. Moreover, ChatGPT shows astounding performance in\nunderstanding dialogue-related texts and we find that it tends to provide\ninformal suggestions for medical tasks instead of definitive answers. Finally,\nwe present in-depth discussions of possible research directions.\n","authors":["Jindong Wang","Xixu Hu","Wenxin Hou","Hao Chen","Runkai Zheng","Yidong Wang","Linyi Yang","Haojun Huang","Wei Ye","Xiubo Geng","Binxin Jiao","Yue Zhang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2302.12095v5.pdf","comment":"Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable\n Large-Scale Machine Learning Models; code is at:\n https://github.com/microsoft/robustlearn; more works:\n https://llm-eval.github.io/"},{"id":"http://arxiv.org/abs/2308.15020v1","updated":"2023-08-29T04:50:07Z","published":"2023-08-29T04:50:07Z","title":"Massively Parallel Continuous Local Search for Hybrid SAT Solving on\n GPUs","summary":" Although state-of-the-art (SOTA) SAT solvers based on conflict-driven clause\nlearning (CDCL) have achieved remarkable engineering success, their sequential\nnature limits the parallelism that may be extracted for acceleration on\nplatforms such as the graphics processing unit (GPU). In this work, we propose\nFastFourierSAT, a highly parallel hybrid SAT solver based on gradient-driven\ncontinuous local search (CLS). This is realized by a novel parallel algorithm\ninspired by the Fast Fourier Transform (FFT)-based convolution for computing\nthe elementary symmetric polynomials (ESPs), which is the major computational\ntask in previous CLS methods. The complexity of our algorithm matches the best\nprevious result. Furthermore, the substantial parallelism inherent in our\nalgorithm can leverage the GPU for acceleration, demonstrating significant\nimprovement over the previous CLS approaches. We also propose to incorporate\nthe restart heuristics in CLS to improve search efficiency. We compare our\napproach with the SOTA parallel SAT solvers on several benchmarks. Our results\nshow that FastFourierSAT computes the gradient 100+ times faster than previous\nprototypes implemented on CPU. Moreover, FastFourierSAT solves most instances\nand demonstrates promising performance on larger-size instances.\n","authors":["Yunuo Cen","Zhiwei Zhang","Xuanyao Fong"],"pdf_url":"https://arxiv.org/pdf/2308.15020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.11872v3","updated":"2023-08-29T04:00:03Z","published":"2022-06-22T17:47:41Z","title":"Provable Acceleration of Heavy Ball beyond Quadratics for a Class of\n Polyak-Łojasiewicz Functions when the Non-Convexity is Averaged-Out","summary":" Heavy Ball (HB) nowadays is one of the most popular momentum methods in\nnon-convex optimization. It has been widely observed that incorporating the\nHeavy Ball dynamic in gradient-based methods accelerates the training process\nof modern machine learning models. However, the progress on establishing its\ntheoretical foundation of acceleration is apparently far behind its empirical\nsuccess. Existing provable acceleration results are of the quadratic or\nclose-to-quadratic functions, as the current techniques of showing HB's\nacceleration are limited to the case when the Hessian is fixed. In this work,\nwe develop some new techniques that help show acceleration beyond quadratics,\nwhich is achieved by analyzing how the change of the Hessian at two consecutive\ntime points affects the convergence speed. Based on our technical results, a\nclass of Polyak-\\L{}ojasiewicz (PL) optimization problems for which provable\nacceleration can be achieved via HB is identified. Moreover, our analysis\ndemonstrates a benefit of adaptively setting the momentum parameter.\n (Update: 08/29/2023) Erratum is added in Appendix J. This is an updated\nversion that fixes an issue in the previous version. An additional condition\nneeds to be satisfied for the acceleration result of HB beyond quadratics in\nthis work, which naturally holds when the dimension is one or, more broadly,\nwhen the Hessian is diagonal. We elaborate on the issue in Appendix J.\n","authors":["Jun-Kun Wang","Chi-Heng Lin","Andre Wibisono","Bin Hu"],"pdf_url":"https://arxiv.org/pdf/2206.11872v3.pdf","comment":"(ICML 2022) Proceedings of the 39th International Conference on\n Machine Learning;"},{"id":"http://arxiv.org/abs/2308.15006v1","updated":"2023-08-29T03:54:53Z","published":"2023-08-29T03:54:53Z","title":"Exploiting Problem Geometry in Safe Linear Bandits","summary":" The safe linear bandit problem is a version of the classic linear bandit\nproblem where the learner's actions must satisfy an uncertain linear constraint\nat all rounds. Due its applicability to many real-world settings, this problem\nhas received considerable attention in recent years. We find that by exploiting\nthe geometry of the specific problem setting, we can achieve improved regret\nguarantees for both well-separated problem instances and action sets that are\nfinite star convex sets. Additionally, we propose a novel algorithm for this\nsetting that chooses problem parameters adaptively and enjoys at least as good\nregret guarantees as existing algorithms. Lastly, we introduce a generalization\nof the safe linear bandit setting where the constraints are convex and adapt\nour algorithms and analyses to this setting by leveraging a novel\nconvex-analysis based approach. Simulation results show improved performance\nover existing algorithms for a variety of randomly sampled settings.\n","authors":["Spencer Hutchinson","Berkay Turan","Mahnoosh Alizadeh"],"pdf_url":"https://arxiv.org/pdf/2308.15006v1.pdf","comment":"38 pages, 4 figures"},{"id":"http://arxiv.org/abs/1909.04883v4","updated":"2023-08-29T03:36:39Z","published":"2019-09-11T07:30:53Z","title":"Semi-supervised Vector-valued Learning: Improved Bounds and Algorithms","summary":" Vector-valued learning, where the output space admits a vector-valued\nstructure, is an important problem that covers a broad family of important\ndomains, e.g. multi-task learning and transfer learning. Using local Rademacher\ncomplexity and unlabeled data, we derive novel semi-supervised excess risk\nbounds for general vector-valued learning from both kernel perspective and\nlinear perspective. The derived bounds are much sharper than existing ones and\nthe convergence rates are improved from the square root of labeled sample size\nto the square root of total sample size or directly dependent on labeled sample\nsize. Motivated by our theoretical analysis, we propose a general\nsemi-supervised algorithm for efficiently learning vector-valued functions,\nincorporating both local Rademacher complexity and Laplacian regularization.\nExtensive experimental results illustrate the proposed algorithm significantly\noutperforms the compared methods, which coincides with our theoretical\nfindings.\n","authors":["Jian Li","Yong Liu","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/1909.04883v4.pdf","comment":"Accepted at Pattern Recognition"},{"id":"http://arxiv.org/abs/2307.00290v2","updated":"2023-08-29T03:31:58Z","published":"2023-07-01T10:12:46Z","title":"All-in-SAM: from Weak Annotation to Pixel-wise Nuclei Segmentation with\n Prompt-based Finetuning","summary":" The Segment Anything Model (SAM) is a recently proposed prompt-based\nsegmentation model in a generic zero-shot segmentation approach. With the\nzero-shot segmentation capacity, SAM achieved impressive flexibility and\nprecision on various segmentation tasks. However, the current pipeline requires\nmanual prompts during the inference stage, which is still resource intensive\nfor biomedical image segmentation. In this paper, instead of using prompts\nduring the inference stage, we introduce a pipeline that utilizes the SAM,\ncalled all-in-SAM, through the entire AI development workflow (from annotation\ngeneration to model finetuning) without requiring manual prompts during the\ninference stage. Specifically, SAM is first employed to generate pixel-level\nannotations from weak prompts (e.g., points, bounding box). Then, the\npixel-level annotations are used to finetune the SAM segmentation model rather\nthan training from scratch. Our experimental results reveal two key findings:\n1) the proposed pipeline surpasses the state-of-the-art (SOTA) methods in a\nnuclei segmentation task on the public Monuseg dataset, and 2) the utilization\nof weak and few annotations for SAM finetuning achieves competitive performance\ncompared to using strong pixel-wise annotated data.\n","authors":["Can Cui","Ruining Deng","Quan Liu","Tianyuan Yao","Shunxing Bao","Lucas W. Remedios","Yucheng Tang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2307.00290v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15860v2","updated":"2023-08-29T03:13:32Z","published":"2023-03-28T10:05:06Z","title":"The Wyner Variational Autoencoder for Unsupervised Multi-Layer Wireless\n Fingerprinting","summary":" Wireless fingerprinting refers to a device identification method leveraging\nhardware imperfections and wireless channel variations as signatures. Beyond\nphysical layer characteristics, recent studies demonstrated that user behaviors\ncould be identified through network traffic, e.g., packet length, without\ndecryption of the payload. Inspired by these results, we propose a multi-layer\nfingerprinting framework that jointly considers the multi-layer signatures for\nimproved identification performance. In contrast to previous works, by\nleveraging the recent multi-view machine learning paradigm, i.e., data with\nmultiple forms, our method can cluster the device information shared among the\nmulti-layer features without supervision. Our information-theoretic approach\ncan be extended to supervised and semi-supervised settings with straightforward\nderivations. In solving the formulated problem, we obtain a tight surrogate\nbound using variational inference for efficient optimization. In extracting the\nshared device information, we develop an algorithm based on the Wyner common\ninformation method, enjoying reduced computation complexity as compared to\nexisting approaches. The algorithm can be applied to data distributions\nbelonging to the exponential family class. Empirically, we evaluate the\nalgorithm in a synthetic dataset with real-world video traffic and simulated\nphysical layer characteristics. Our empirical results show that the proposed\nmethod outperforms the state-of-the-art baselines in both supervised and\nunsupervised settings.\n","authors":["Teng-Hui Huang","Thilini Dahanayaka","Kanchana Thilakarathna","Philip H. W. Leong","Hesham El Gamal"],"pdf_url":"https://arxiv.org/pdf/2303.15860v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14995v1","updated":"2023-08-29T02:50:36Z","published":"2023-08-29T02:50:36Z","title":"WSAM: Visual Explanations from Style Augmentation as Adversarial\n Attacker and Their Influence in Image Classification","summary":" Currently, style augmentation is capturing attention due to convolutional\nneural networks (CNN) being strongly biased toward recognizing textures rather\nthan shapes. Most existing styling methods either perform a low-fidelity style\ntransfer or a weak style representation in the embedding vector. This paper\noutlines a style augmentation algorithm using stochastic-based sampling with\nnoise addition to improving randomization on a general linear transformation\nfor style transfer. With our augmentation strategy, all models not only present\nincredible robustness against image stylizing but also outperform all previous\nmethods and surpass the state-of-the-art performance for the STL-10 dataset. In\naddition, we present an analysis of the model interpretations under different\nstyle variations. At the same time, we compare comprehensive experiments\ndemonstrating the performance when applied to deep neural architectures in\ntraining settings.\n","authors":["Felipe Moreno-Vera","Edgar Medina","Jorge Poco"],"pdf_url":"https://arxiv.org/pdf/2308.14995v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2207.03364v4","updated":"2023-08-29T02:44:33Z","published":"2022-07-07T15:12:02Z","title":"Group Equality in Adaptive Submodular Maximization","summary":" In this paper, we study the classic submodular maximization problem subject\nto a group equality constraint under both non-adaptive and adaptive settings.\nIt has been shown that the utility function of many machine learning\napplications, including data summarization, influence maximization in social\nnetworks, and personalized recommendation, satisfies the property of\nsubmodularity. Hence, maximizing a submodular function subject to various\nconstraints can be found at the heart of many of those applications. On a high\nlevel, submodular maximization aims to select a group of most representative\nitems (e.g., data points). However, the design of most existing algorithms does\nnot incorporate the fairness constraint, leading to under- or\nover-representation of some particular groups. This motivates us to study the\nsubmodular maximization problem with group equality, where we aim to select a\ngroup of items to maximize a (possibly non-monotone) submodular utility\nfunction subject to a group equality constraint. To this end, we develop the\nfirst constant-factor approximation algorithm for this problem. The design of\nour algorithm is robust enough to be extended to solving the submodular\nmaximization problem under a more complicated adaptive setting. Moreover, we\nfurther extend our study to incorporating a global cardinality constraint and\nother fairness notations.\n","authors":["Shaojie Tang","Jing Yuan"],"pdf_url":"https://arxiv.org/pdf/2207.03364v4.pdf","comment":"This paper has been accepted by INFORMS Journal on Computing"},{"id":"http://arxiv.org/abs/2308.14991v1","updated":"2023-08-29T02:43:58Z","published":"2023-08-29T02:43:58Z","title":"Incorporating Neuro-Inspired Adaptability for Continual Learning in\n Artificial Intelligence","summary":" Continual learning aims to empower artificial intelligence (AI) with strong\nadaptability to the real world. For this purpose, a desirable solution should\nproperly balance memory stability with learning plasticity, and acquire\nsufficient compatibility to capture the observed distributions. Existing\nadvances mainly focus on preserving memory stability to overcome catastrophic\nforgetting, but remain difficult to flexibly accommodate incremental changes as\nbiological intelligence (BI) does. By modeling a robust Drosophila learning\nsystem that actively regulates forgetting with multiple learning modules, here\nwe propose a generic approach that appropriately attenuates old memories in\nparameter distributions to improve learning plasticity, and accordingly\ncoordinates a multi-learner architecture to ensure solution compatibility.\nThrough extensive theoretical and empirical validation, our approach not only\nclearly enhances the performance of continual learning, especially over\nsynaptic regularization methods in task-incremental settings, but also\npotentially advances the understanding of neurological adaptive mechanisms,\nserving as a novel paradigm to progress AI and BI together.\n","authors":["Liyuan Wang","Xingxing Zhang","Qian Li","Mingtian Zhang","Hang Su","Jun Zhu","Yi Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.14991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14983v1","updated":"2023-08-29T02:23:58Z","published":"2023-08-29T02:23:58Z","title":"Constructive Incremental Learning for Fault Diagnosis of Rolling\n Bearings with Ensemble Domain Adaptation","summary":" Given the prevalence of rolling bearing fault diagnosis as a practical issue\nacross various working conditions, the limited availability of samples\ncompounds the challenge. Additionally, the complexity of the external\nenvironment and the structure of rolling bearings often manifests faults\ncharacterized by randomness and fuzziness, hindering the effective extraction\nof fault characteristics and restricting the accuracy of fault diagnosis. To\novercome these problems, this paper presents a novel approach termed\nconstructive Incremental learning-based ensemble domain adaptation (CIL-EDA)\napproach. Specifically, it is implemented on stochastic configuration networks\n(SCN) to constructively improve its adaptive performance in multi-domains.\nConcretely, a cloud feature extraction method is employed in conjunction with\nwavelet packet decomposition (WPD) to capture the uncertainty of fault\ninformation from multiple resolution aspects. Subsequently, constructive\nIncremental learning-based domain adaptation (CIL-DA) is firstly developed to\nenhance the cross-domain learning capability of each hidden node through domain\nmatching and construct a robust fault classifier by leveraging limited labeled\ndata from both target and source domains. Finally, fault diagnosis results are\nobtained by a majority voting of CIL-EDA which integrates CIL-DA and parallel\nensemble learning. Experimental results demonstrate that our CIL-DA outperforms\nseveral domain adaptation methods and CIL-EDA consistently outperforms\nstate-of-art fault diagnosis methods in few-shot scenarios.\n","authors":["Jiang Liu","Wei Dai"],"pdf_url":"https://arxiv.org/pdf/2308.14983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14981v1","updated":"2023-08-29T02:16:48Z","published":"2023-08-29T02:16:48Z","title":"Sub-universal variational circuits for combinatorial optimization\n problems","summary":" Quantum variational circuits have gained significant attention due to their\napplications in the quantum approximate optimization algorithm and quantum\nmachine learning research. This work introduces a novel class of classical\nprobabilistic circuits designed for generating approximate solutions to\ncombinatorial optimization problems constructed using two-bit stochastic\nmatrices. Through a numerical study, we investigate the performance of our\nproposed variational circuits in solving the Max-Cut problem on various graphs\nof increasing sizes. Our classical algorithm demonstrates improved performance\nfor several graph types to the quantum approximate optimization algorithm. Our\nfindings suggest that evaluating the performance of quantum variational\ncircuits against variational circuits with sub-universal gate sets is a\nvaluable benchmark for identifying areas where quantum variational circuits can\nexcel.\n","authors":["Gal Weitz","Lirandë Pira","Chris Ferrie","Joshua Combes"],"pdf_url":"https://arxiv.org/pdf/2308.14981v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.14976v1","updated":"2023-08-29T02:05:40Z","published":"2023-08-29T02:05:40Z","title":"Efficient labeling of solar flux evolution videos by a deep learning\n model","summary":" Machine learning (ML) is becoming a critical tool for interrogation of large\ncomplex data. Labeling, defined as the process of adding meaningful\nannotations, is a crucial step of supervised ML. However, labeling datasets is\ntime consuming. Here we show that convolutional neural networks (CNNs), trained\non crudely labeled astronomical videos, can be leveraged to improve the quality\nof data labeling and reduce the need for human intervention. We use videos of\nthe solar magnetic field, crudely labeled into two classes: emergence or\nnon-emergence of bipolar magnetic regions (BMRs), based on their first\ndetection on the solar disk. We train CNNs using crude labels, manually verify,\ncorrect labeling vs. CNN disagreements, and repeat this process until\nconvergence. Traditionally, flux emergence labelling is done manually. We find\nthat a high-quality labeled dataset, derived through this iterative process,\nreduces the necessary manual verification by 50%. Furthermore, by gradually\nmasking the videos and looking for maximum change in CNN inference, we locate\nBMR emergence time without retraining the CNN. This demonstrates the\nversatility of CNNs for simplifying the challenging task of labeling complex\ndynamic events.\n","authors":["Subhamoy Chatterjee","Andrés Muñoz-Jaramillo","Derek A. Lamb"],"pdf_url":"https://arxiv.org/pdf/2308.14976v1.pdf","comment":"16 pages, 7 figures, published in Nature Astronomy, June 27, 2022"},{"id":"http://arxiv.org/abs/2308.14328v2","updated":"2023-08-29T01:58:02Z","published":"2023-08-28T06:15:14Z","title":"Reinforcement Learning for Generative AI: A Survey","summary":" Deep Generative AI has been a long-standing essential topic in the machine\nlearning community, which can impact a number of application areas like text\ngeneration and computer vision. The major paradigm to train a generative model\nis maximum likelihood estimation, which pushes the learner to capture and\napproximate the target data distribution by decreasing the divergence between\nthe model distribution and the target distribution. This formulation\nsuccessfully establishes the objective of generative tasks, while it is\nincapable of satisfying all the requirements that a user might expect from a\ngenerative model. Reinforcement learning, serving as a competitive option to\ninject new training signals by creating new objectives that exploit novel\nsignals, has demonstrated its power and flexibility to incorporate human\ninductive bias from multiple angles, such as adversarial learning,\nhand-designed rules and learned reward model to build a performant model.\nThereby, reinforcement learning has become a trending research field and has\nstretched the limits of generative AI in both model design and application. It\nis reasonable to summarize and conclude advances in recent years with a\ncomprehensive review. Although there are surveys in different application areas\nrecently, this survey aims to shed light on a high-level review that spans a\nrange of application areas. We provide a rigorous taxonomy in this area and\nmake sufficient coverage on various models and applications. Notably, we also\nsurveyed the fast-developing large language model area. We conclude this survey\nby showing the potential directions that might tackle the limit of current\nmodels and expand the frontiers for generative AI.\n","authors":["Yuanjiang Cao","Quan Z. Sheng","Julian McAuley","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2308.14328v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14971v1","updated":"2023-08-29T01:53:14Z","published":"2023-08-29T01:53:14Z","title":"Distributed multi-agent target search and tracking with Gaussian process\n and reinforcement learning","summary":" Deploying multiple robots for target search and tracking has many practical\napplications, yet the challenge of planning over unknown or partially known\ntargets remains difficult to address. With recent advances in deep learning,\nintelligent control techniques such as reinforcement learning have enabled\nagents to learn autonomously from environment interactions with little to no\nprior knowledge. Such methods can address the exploration-exploitation tradeoff\nof planning over unknown targets in a data-driven manner, eliminating the\nreliance on heuristics typical of traditional approaches and streamlining the\ndecision-making pipeline with end-to-end training. In this paper, we propose a\nmulti-agent reinforcement learning technique with target map building based on\ndistributed Gaussian process. We leverage the distributed Gaussian process to\nencode belief over the target locations and efficiently plan over unknown\ntargets. We evaluate the performance and transferability of the trained policy\nin simulation and demonstrate the method on a swarm of micro unmanned aerial\nvehicles with hardware experiments.\n","authors":["Jigang Kim","Dohyun Jang","H. Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2308.14971v1.pdf","comment":"10 pages, 6 figures; preprint submitted to IJCAS; first two authors\n contributed equally"},{"id":"http://arxiv.org/abs/2308.14969v1","updated":"2023-08-29T01:47:49Z","published":"2023-08-29T01:47:49Z","title":"Reprogramming under constraints: Revisiting efficient and reliable\n transferability of lottery tickets","summary":" In the era of foundation models with huge pre-training budgets, the\ndownstream tasks have been shifted to the narrative of efficient and fast\nadaptation. For classification-based tasks in the domain of computer vision,\nthe two most efficient approaches have been linear probing (LP) and visual\nprompting/reprogramming (VP); the former aims to learn a classifier in the form\nof a linear head on the features extracted by the pre-trained model, while the\nlatter maps the input data to the domain of the source data on which the model\nwas originally pre-trained on. Although extensive studies have demonstrated the\ndifferences between LP and VP in terms of downstream performance, we explore\nthe capabilities of the two aforementioned methods via the sparsity axis: (a)\nData sparsity: the impact of few-shot adaptation and (b) Model sparsity: the\nimpact of lottery tickets (LT). We demonstrate that LT are not universal\nreprogrammers, i.e., for certain target datasets, reprogramming an LT yields\nsignificantly lower performance than the reprogrammed dense model although\ntheir corresponding upstream performance is similar. Further, we demonstrate\nthat the calibration of dense models is always superior to that of their\nlottery ticket counterparts under both LP and VP regimes. Our empirical study\nopens a new avenue of research into VP for sparse models and encourages further\nunderstanding of the performance beyond the accuracy achieved by VP under\nconstraints of sparsity. Code and logs can be accessed at\n\\url{https://github.com/landskape-ai/Reprogram_LT}.\n","authors":["Diganta Misra","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.03312v4","updated":"2023-08-29T01:44:39Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14962v1","updated":"2023-08-29T01:29:26Z","published":"2023-08-29T01:29:26Z","title":"Streaming Compression of Scientific Data via weak-SINDy","summary":" In this paper a streaming weak-SINDy algorithm is developed specifically for\ncompressing streaming scientific data. The production of scientific data,\neither via simulation or experiments, is undergoing an stage of exponential\ngrowth, which makes data compression important and often necessary for storing\nand utilizing large scientific data sets. As opposed to classical ``offline\"\ncompression algorithms that perform compression on a readily available data\nset, streaming compression algorithms compress data ``online\" while the data\ngenerated from simulation or experiments is still flowing through the system.\nThis feature makes streaming compression algorithms well-suited for scientific\ndata compression, where storing the full data set offline is often infeasible.\nThis work proposes a new streaming compression algorithm, streaming weak-SINDy,\nwhich takes advantage of the underlying data characteristics during\ncompression. The streaming weak-SINDy algorithm constructs feature matrices and\ntarget vectors in the online stage via a streaming integration method in a\nmemory efficient manner. The feature matrices and target vectors are then used\nin the offline stage to build a model through a regression process that aims to\nrecover equations that govern the evolution of the data. For compressing\nhigh-dimensional streaming data, we adopt a streaming proper orthogonal\ndecomposition (POD) process to reduce the data dimension and then use the\nstreaming weak-SINDy algorithm to compress the temporal data of the POD\nexpansion. We propose modifications to the streaming weak-SINDy algorithm to\naccommodate the dynamically updated POD basis. By combining the built model\nfrom the streaming weak-SINDy algorithm and a small amount of data samples, the\nfull data flow could be reconstructed accurately at a low memory cost, as shown\nin the numerical tests.\n","authors":["Benjamin P. Russo","M. Paul Laiu","Richard Archibald"],"pdf_url":"https://arxiv.org/pdf/2308.14962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03454v2","updated":"2023-08-29T01:20:04Z","published":"2023-06-06T07:17:56Z","title":"Benchmarking Robustness of AI-Enabled Multi-sensor Fusion Systems:\n Challenges and Opportunities","summary":" Multi-Sensor Fusion (MSF) based perception systems have been the foundation\nin supporting many industrial applications and domains, such as self-driving\ncars, robotic arms, and unmanned aerial vehicles. Over the past few years, the\nfast progress in data-driven artificial intelligence (AI) has brought a\nfast-increasing trend to empower MSF systems by deep learning techniques to\nfurther improve performance, especially on intelligent systems and their\nperception systems. Although quite a few AI-enabled MSF perception systems and\ntechniques have been proposed, up to the present, limited benchmarks that focus\non MSF perception are publicly available. Given that many intelligent systems\nsuch as self-driving cars are operated in safety-critical contexts where\nperception systems play an important role, there comes an urgent need for a\nmore in-depth understanding of the performance and reliability of these MSF\nsystems. To bridge this gap, we initiate an early step in this direction and\nconstruct a public benchmark of AI-enabled MSF-based perception systems\nincluding three commonly adopted tasks (i.e., object detection, object\ntracking, and depth completion). Based on this, to comprehensively understand\nMSF systems' robustness and reliability, we design 14 common and realistic\ncorruption patterns to synthesize large-scale corrupted datasets. We further\nperform a systematic evaluation of these systems through our large-scale\nevaluation. Our results reveal the vulnerability of the current AI-enabled MSF\nperception systems, calling for researchers and practitioners to take\nrobustness and reliability into account when designing AI-enabled MSF.\n","authors":["Xinyu Gao","Zhijie Wang","Yang Feng","Lei Ma","Zhenyu Chen","Baowen Xu"],"pdf_url":"https://arxiv.org/pdf/2306.03454v2.pdf","comment":"To appear in ESEC/FSE 2023"},{"id":"http://arxiv.org/abs/2306.09539v2","updated":"2023-08-29T01:08:30Z","published":"2023-06-15T22:48:08Z","title":"Block-State Transformer","summary":" State space models (SSMs) have shown impressive results on tasks that require\nmodeling long-range dependencies and efficiently scale to long sequences owing\nto their subquadratic runtime complexity. Originally designed for continuous\nsignals, SSMs have shown superior performance on a plethora of tasks, in vision\nand audio; however, SSMs still lag Transformer performance in Language Modeling\ntasks. In this work, we propose a hybrid layer named Block-State Transformer\n(BST), that internally combines an SSM sublayer for long-range\ncontextualization, and a Block Transformer sublayer for short-term\nrepresentation of sequences. We study three different, and completely\nparallelizable, variants that integrate SSMs and block-wise attention. We show\nthat our model outperforms similar Transformer-based architectures on language\nmodeling perplexity and generalizes to longer sequences. In addition, the\nBlock-State Transformer demonstrates more than tenfold increase in speed at the\nlayer level compared to the Block-Recurrent Transformer when model\nparallelization is employed.\n","authors":["Mahan Fathi","Jonathan Pilault","Pierre-Luc Bacon","Christopher Pal","Orhan Firat","Ross Goroshin"],"pdf_url":"https://arxiv.org/pdf/2306.09539v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02003v2","updated":"2023-08-29T00:59:44Z","published":"2023-06-03T05:01:51Z","title":"On Optimal Caching and Model Multiplexing for Large Model Inference","summary":" Large Language Models (LLMs) and other large foundation models have achieved\nnoteworthy success, but their size exacerbates existing resource consumption\nand latency challenges. In particular, the large-scale deployment of these\nmodels is hindered by the significant resource requirements during inference.\nIn this paper, we study two approaches for mitigating these challenges:\nemploying a cache to store previous queries and learning a model multiplexer to\nchoose from an ensemble of models for query processing.\n Theoretically, we provide an optimal algorithm for jointly optimizing both\napproaches to reduce the inference cost in both offline and online tabular\nsettings. By combining a caching algorithm, namely Greedy Dual Size with\nFrequency (GDSF) or Least Expected Cost (LEC), with a model multiplexer, we\nachieve optimal rates in both offline and online settings. Empirically,\nsimulations show that the combination of our caching and model multiplexing\nalgorithms greatly improves over the baselines, with up to $50\\times$\nimprovement over the baseline when the ratio between the maximum cost and\nminimum cost is $100$. Experiments on real datasets show a $4.3\\times$\nimprovement in FLOPs over the baseline when the ratio for FLOPs is $10$, and a\n$1.8\\times$ improvement in latency when the ratio for average latency is\n$1.85$.\n","authors":["Banghua Zhu","Ying Sheng","Lianmin Zheng","Clark Barrett","Michael I. Jordan","Jiantao Jiao"],"pdf_url":"https://arxiv.org/pdf/2306.02003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09297v3","updated":"2023-08-29T00:49:40Z","published":"2023-06-15T17:25:15Z","title":"Fix Fairness, Don't Ruin Accuracy: Performance Aware Fairness Repair\n using AutoML","summary":" Machine learning (ML) is increasingly being used in critical decision-making\nsoftware, but incidents have raised questions about the fairness of ML\npredictions. To address this issue, new tools and methods are needed to\nmitigate bias in ML-based software. Previous studies have proposed bias\nmitigation algorithms that only work in specific situations and often result in\na loss of accuracy. Our proposed solution is a novel approach that utilizes\nautomated machine learning (AutoML) techniques to mitigate bias. Our approach\nincludes two key innovations: a novel optimization function and a\nfairness-aware search space. By improving the default optimization function of\nAutoML and incorporating fairness objectives, we are able to mitigate bias with\nlittle to no loss of accuracy. Additionally, we propose a fairness-aware search\nspace pruning method for AutoML to reduce computational cost and repair time.\nOur approach, built on the state-of-the-art Auto-Sklearn tool, is designed to\nreduce bias in real-world scenarios. In order to demonstrate the effectiveness\nof our approach, we evaluated our approach on four fairness problems and 16\ndifferent ML models, and our results show a significant improvement over the\nbaseline and existing bias mitigation techniques. Our approach, Fair-AutoML,\nsuccessfully repaired 60 out of 64 buggy cases, while existing bias mitigation\ntechniques only repaired up to 44 out of 64 cases.\n","authors":["Giang Nguyen","Sumon Biswas","Hridesh Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.09297v3.pdf","comment":"In Proceedings of The 31st ACM Joint European Software Engineering\n Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE\n 2023)"},{"id":"http://arxiv.org/abs/2102.02409v3","updated":"2023-08-29T00:49:34Z","published":"2021-02-04T04:36:58Z","title":"Variational Inference for Deblending Crowded Starfields","summary":" In images collected by astronomical surveys, stars and galaxies often overlap\nvisually. Deblending is the task of distinguishing and characterizing\nindividual light sources in survey images. We propose StarNet, a Bayesian\nmethod to deblend sources in astronomical images of crowded star fields.\nStarNet leverages recent advances in variational inference, including amortized\nvariational distributions and an optimization objective targeting an\nexpectation of the forward KL divergence. In our experiments with SDSS images\nof the M2 globular cluster, StarNet is substantially more accurate than two\ncompeting methods: Probabilistic Cataloging (PCAT), a method that uses MCMC for\ninference, and DAOPHOT, a software pipeline employed by SDSS for deblending. In\naddition, the amortized approach to inference gives StarNet the scaling\ncharacteristics necessary to perform Bayesian inference on modern astronomical\nsurveys.\n","authors":["Runjing Liu","Jon D. McAuliffe","Jeffrey Regier"],"pdf_url":"https://arxiv.org/pdf/2102.02409v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14951v1","updated":"2023-08-29T00:44:27Z","published":"2023-08-29T00:44:27Z","title":"Robust Open-Set Spoken Language Identification and the CU MultiLang\n Dataset","summary":" Most state-of-the-art spoken language identification models are closed-set;\nin other words, they can only output a language label from the set of classes\nthey were trained on. Open-set spoken language identification systems, however,\ngain the ability to detect when an input exhibits none of the original\nlanguages. In this paper, we implement a novel approach to open-set spoken\nlanguage identification that uses MFCC and pitch features, a TDNN model to\nextract meaningful feature embeddings, confidence thresholding on softmax\noutputs, and LDA and pLDA for learning to classify new unknown languages. We\npresent a spoken language identification system that achieves 91.76% accuracy\non trained languages and has the capability to adapt to unknown languages on\nthe fly. To that end, we also built the CU MultiLang Dataset, a large and\ndiverse multilingual speech corpus which was used to train and evaluate our\nsystem.\n","authors":["Mustafa Eyceoz","Justin Lee","Siddharth Pittie","Homayoon Beigi"],"pdf_url":"https://arxiv.org/pdf/2308.14951v1.pdf","comment":"6pages, 1 table, 6 figures"},{"id":"http://arxiv.org/abs/2308.14949v1","updated":"2023-08-29T00:25:02Z","published":"2023-08-29T00:25:02Z","title":"Low-bit Quantization for Deep Graph Neural Networks with\n Smoothness-aware Message Propagation","summary":" Graph Neural Network (GNN) training and inference involve significant\nchallenges of scalability with respect to both model sizes and number of\nlayers, resulting in degradation of efficiency and accuracy for large and deep\nGNNs. We present an end-to-end solution that aims to address these challenges\nfor efficient GNNs in resource constrained environments while avoiding the\noversmoothing problem in deep GNNs. We introduce a quantization based approach\nfor all stages of GNNs, from message passing in training to node\nclassification, compressing the model and enabling efficient processing. The\nproposed GNN quantizer learns quantization ranges and reduces the model size\nwith comparable accuracy even under low-bit quantization. To scale with the\nnumber of layers, we devise a message propagation mechanism in training that\ncontrols layer-wise changes of similarities between neighboring nodes. This\nobjective is incorporated into a Lagrangian function with constraints and a\ndifferential multiplier method is utilized to iteratively find optimal\nembeddings. This mitigates oversmoothing and suppresses the quantization error\nto a bound. Significant improvements are demonstrated over state-of-the-art\nquantization methods and deep GNN approaches in both full-precision and\nquantized models. The proposed quantizer demonstrates superior performance in\nINT2 configurations across all stages of GNN, achieving a notable level of\naccuracy. In contrast, existing quantization approaches fail to generate\nsatisfactory accuracy levels. Finally, the inference with INT2 and INT4\nrepresentations exhibits a speedup of 5.11 $\\times$ and 4.70 $\\times$ compared\nto full precision counterparts, respectively.\n","authors":["Shuang Wang","Bahaeddin Eravci","Rustam Guliyev","Hakan Ferhatosmanoglu"],"pdf_url":"https://arxiv.org/pdf/2308.14949v1.pdf","comment":"To appear in CIKM2023"},{"id":"http://arxiv.org/abs/2112.01694v4","updated":"2023-08-29T00:20:32Z","published":"2021-12-03T03:31:08Z","title":"On the Existence of the Adversarial Bayes Classifier (Extended Version)","summary":" Adversarial robustness is a critical property in a variety of modern machine\nlearning applications. While it has been the subject of several recent\ntheoretical studies, many important questions related to adversarial robustness\nare still open. In this work, we study a fundamental question regarding Bayes\noptimality for adversarial robustness. We provide general sufficient conditions\nunder which the existence of a Bayes optimal classifier can be guaranteed for\nadversarial robustness. Our results can provide a useful tool for a subsequent\nstudy of surrogate losses in adversarial robustness and their consistency\nproperties. This manuscript is the extended and corrected version of the paper\n\\emph{On the Existence of the Adversarial Bayes Classifier} published in\nNeurIPS 2021. There were two errors in theorem statements in the original paper\n-- one in the definition of pseudo-certifiable robustness and the other in the\nmeasurability of $A^\\e$ for arbitrary metric spaces. In this version we correct\nthe errors. Furthermore, the results of the original paper did not apply to\nsome non-strictly convex norms and here we extend our results to all possible\nnorms.\n","authors":["Pranjal Awasthi","Natalie S. Frank","Mehryar Mohri"],"pdf_url":"https://arxiv.org/pdf/2112.01694v4.pdf","comment":"27 pages, 3 figures. Version 2: Corrects 2 errors in the paper \"On\n the Existence of the Adversarial Bayes Classifier\" published in NeurIPS.\n Version 3: Update to acknowledgements"},{"id":"http://arxiv.org/abs/2103.10000v5","updated":"2023-08-29T00:09:24Z","published":"2021-03-18T03:24:38Z","title":"Human-Inspired Multi-Agent Navigation using Knowledge Distillation","summary":" Despite significant advancements in the field of multi-agent navigation,\nagents still lack the sophistication and intelligence that humans exhibit in\nmulti-agent settings. In this paper, we propose a framework for learning a\nhuman-like general collision avoidance policy for agent-agent interactions in\nfully decentralized, multi-agent environments. Our approach uses knowledge\ndistillation with reinforcement learning to shape the reward function based on\nexpert policies extracted from human trajectory demonstrations through behavior\ncloning. We show that agents trained with our approach can take human-like\ntrajectories in collision avoidance and goal-directed steering tasks not\nprovided by the demonstrations, outperforming the experts as well as\nlearning-based agents trained without knowledge distillation.\n","authors":["Pei Xu","Ioannis Karamouzas"],"pdf_url":"https://arxiv.org/pdf/2103.10000v5.pdf","comment":"IEEE/RSJ International Conference on Intelligent Robots and Systems\n (IROS), 2021"},{"id":"http://arxiv.org/abs/2202.03402v3","updated":"2023-08-29T00:03:00Z","published":"2022-02-07T18:40:38Z","title":"Preserving Privacy and Security in Federated Learning","summary":" Federated learning is known to be vulnerable to both security and privacy\nissues. Existing research has focused either on preventing poisoning attacks\nfrom users or on concealing the local model updates from the server, but not\nboth. However, integrating these two lines of research remains a crucial\nchallenge since they often conflict with one another with respect to the threat\nmodel. In this work, we develop a principle framework that offers both privacy\nguarantees for users and detection against poisoning attacks from them. With a\nnew threat model that includes both an honest-but-curious server and malicious\nusers, we first propose a secure aggregation protocol using homomorphic\nencryption for the server to combine local model updates in a private manner.\nThen, a zero-knowledge proof protocol is leveraged to shift the task of\ndetecting attacks in the local models from the server to the users. The key\nobservation here is that the server no longer needs access to the local models\nfor attack detection. Therefore, our framework enables the central server to\nidentify poisoned model updates without violating the privacy guarantees of\nsecure aggregation.\n","authors":["Truc Nguyen","My T. Thai"],"pdf_url":"https://arxiv.org/pdf/2202.03402v3.pdf","comment":"Published in IEEE/ACM Transactions on Networking"},{"id":"http://arxiv.org/abs/2308.14947v1","updated":"2023-08-29T00:00:18Z","published":"2023-08-29T00:00:18Z","title":"Improving Reinforcement Learning Training Regimes for Social Robot\n Navigation","summary":" In order for autonomous mobile robots to navigate in human spaces, they must\nabide by our social norms. Reinforcement learning (RL) has emerged as an\neffective method to train robot navigation policies that are able to respect\nthese norms. However, a large portion of existing work in the field conducts\nboth RL training and testing in simplistic environments. This limits the\ngeneralization potential of these models to unseen environments, and the\nmeaningfulness of their reported results. We propose a method to improve the\ngeneralization performance of RL social navigation methods using curriculum\nlearning. By employing multiple environment types and by modeling pedestrians\nusing multiple dynamics models, we are able to progressively diversify and\nescalate difficulty in training. Our results show that the use of curriculum\nlearning in training can be used to achieve better generalization performance\nthan previous training methods. We also show that results presented in many\nexisting state-of-the art RL social navigation works do not evaluate their\nmethods outside of their training environments, and thus do not reflect their\npolicies' failure to adequately generalize to out-of-distribution scenarios. In\nresponse, we validate our training approach on larger and more crowded testing\nenvironments than those used in training, allowing for more meaningful\nmeasurements of model performance.\n","authors":["Adam Sigal","Hsiu-Chin Lin","AJung Moon"],"pdf_url":"https://arxiv.org/pdf/2308.14947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.01255v4","updated":"2023-08-29T23:41:36Z","published":"2022-06-02T19:11:27Z","title":"Compressive Fourier collocation methods for high-dimensional diffusion\n equations with periodic boundary conditions","summary":" High-dimensional Partial Differential Equations (PDEs) are a popular\nmathematical modelling tool, with applications ranging from finance to\ncomputational chemistry. However, standard numerical techniques for solving\nthese PDEs are typically affected by the curse of dimensionality. In this work,\nwe tackle this challenge while focusing on stationary diffusion equations\ndefined over a high-dimensional domain with periodic boundary conditions.\nInspired by recent progress in sparse function approximation in high\ndimensions, we propose a new method called compressive Fourier collocation.\nCombining ideas from compressive sensing and spectral collocation, our method\nreplaces the use of structured collocation grids with Monte Carlo sampling and\nemploys sparse recovery techniques, such as orthogonal matching pursuit and\n$\\ell^1$ minimization, to approximate the Fourier coefficients of the PDE\nsolution. We conduct a rigorous theoretical analysis showing that the\napproximation error of the proposed method is comparable with the best $s$-term\napproximation (with respect to the Fourier basis) to the solution. Using the\nrecently introduced framework of random sampling in bounded Riesz systems, our\nanalysis shows that the compressive Fourier collocation method mitigates the\ncurse of dimensionality with respect to the number of collocation points under\nsufficient conditions on the regularity of the diffusion coefficient. We also\npresent numerical experiments that illustrate the accuracy and stability of the\nmethod for the approximation of sparse and compressible solutions.\n","authors":["Weiqi Wang","Simone Brugiapaglia"],"pdf_url":"https://arxiv.org/pdf/2206.01255v4.pdf","comment":"33 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.15667v1","updated":"2023-08-29T23:35:36Z","published":"2023-08-29T23:35:36Z","title":"Bridging Distribution Learning and Image Clustering in High-dimensional\n Space","summary":" Distribution learning focuses on learning the probability density function\nfrom a set of data samples. In contrast, clustering aims to group similar\nobjects together in an unsupervised manner. Usually, these two tasks are\nconsidered unrelated. However, the relationship between the two may be\nindirectly correlated, with Gaussian Mixture Models (GMM) acting as a bridge.\nIn this paper, we focus on exploring the correlation between distribution\nlearning and clustering, with the motivation to fill the gap between these two\nfields, utilizing an autoencoder (AE) to encode images into a high-dimensional\nlatent space. Then, Monte-Carlo Marginalization (MCMarg) and Kullback-Leibler\n(KL) divergence loss are used to fit the Gaussian components of the GMM and\nlearn the data distribution. Finally, image clustering is achieved through each\nGaussian component of GMM. Yet, the \"curse of dimensionality\" poses severe\nchallenges for most clustering algorithms. Compared with the classic\nExpectation-Maximization (EM) Algorithm, experimental results show that MCMarg\nand KL divergence can greatly alleviate the difficulty. Based on the\nexperimental results, we believe distribution learning can exploit the\npotential of GMM in image clustering within high-dimensional space.\n","authors":["Guanfang Dong","Chenqiu Zhao","Anup Basu"],"pdf_url":"https://arxiv.org/pdf/2308.15667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12315v2","updated":"2023-08-29T23:19:53Z","published":"2023-08-23T08:38:54Z","title":"Trustworthy Representation Learning Across Domains","summary":" As AI systems have obtained significant performance to be deployed widely in\nour daily live and human society, people both enjoy the benefits brought by\nthese technologies and suffer many social issues induced by these systems. To\nmake AI systems good enough and trustworthy, plenty of researches have been\ndone to build guidelines for trustworthy AI systems. Machine learning is one of\nthe most important parts for AI systems and representation learning is the\nfundamental technology in machine learning. How to make the representation\nlearning trustworthy in real-world application, e.g., cross domain scenarios,\nis very valuable and necessary for both machine learning and AI system fields.\nInspired by the concepts in trustworthy AI, we proposed the first trustworthy\nrepresentation learning across domains framework which includes four concepts,\ni.e, robustness, privacy, fairness, and explainability, to give a comprehensive\nliterature review on this research direction. Specifically, we first introduce\nthe details of the proposed trustworthy framework for representation learning\nacross domains. Second, we provide basic notions and comprehensively summarize\nexisting methods for the trustworthy framework from four concepts. Finally, we\nconclude this survey with insights and discussions on future research\ndirections.\n","authors":["Ronghang Zhu","Dongliang Guo","Daiqing Qi","Zhixuan Chu","Xiang Yu","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.12315v2.pdf","comment":"38 pages, 15 figures"},{"id":"http://arxiv.org/abs/2303.08112v3","updated":"2023-08-29T22:55:27Z","published":"2023-03-14T17:47:09Z","title":"Eliciting Latent Predictions from Transformers with the Tuned Lens","summary":" We analyze transformers from the perspective of iterative inference, seeking\nto understand how model predictions are refined layer by layer. To do so, we\ntrain an affine probe for each block in a frozen pretrained model, making it\npossible to decode every hidden state into a distribution over the vocabulary.\nOur method, the tuned lens, is a refinement of the earlier \"logit lens\"\ntechnique, which yielded useful insights but is often brittle.\n We test our method on various autoregressive language models with up to 20B\nparameters, showing it to be more predictive, reliable and unbiased than the\nlogit lens. With causal experiments, we show the tuned lens uses similar\nfeatures to the model itself. We also find the trajectory of latent predictions\ncan be used to detect malicious inputs with high accuracy. All code needed to\nreproduce our results can be found at\nhttps://github.com/AlignmentResearch/tuned-lens.\n","authors":["Nora Belrose","Zach Furman","Logan Smith","Danny Halawi","Igor Ostrovsky","Lev McKinney","Stella Biderman","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2303.08112v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09929v3","updated":"2023-08-29T22:52:28Z","published":"2022-11-17T23:01:47Z","title":"Contrastive Credibility Propagation for Reliable Semi-Supervised\n Learning","summary":" Producing labels for unlabeled data is error-prone, making semi-supervised\nlearning (SSL) troublesome. Often, little is known about when and why an\nalgorithm fails to outperform a supervised baseline. Using benchmark datasets,\nwe craft five common real-world SSL data scenarios: few-label, open-set,\nnoisy-label, and class distribution imbalance/misalignment in the labeled and\nunlabeled sets. We propose a novel algorithm called Contrastive Credibility\nPropagation (CCP) for deep SSL via iterative transductive pseudo-label\nrefinement. CCP unifies semi-supervised learning and noisy label learning for\nthe goal of reliably outperforming a supervised baseline in any data scenario.\nCompared to prior methods which focus on a subset of scenarios, CCP uniquely\noutperforms the supervised baseline in all scenarios, supporting practitioners\nwhen the qualities of labeled or unlabeled data are unknown.\n","authors":["Brody Kutt","Pralay Ramteke","Xavier Mignot","Pamela Toman","Nandini Ramanan","Sujit Rokka Chhetri","Shan Huang","Min Du","William Hewlett"],"pdf_url":"https://arxiv.org/pdf/2211.09929v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06074v2","updated":"2023-08-29T22:30:15Z","published":"2022-12-12T17:41:32Z","title":"Regression with Label Differential Privacy","summary":" We study the task of training regression models with the guarantee of label\ndifferential privacy (DP). Based on a global prior distribution on label\nvalues, which could be obtained privately, we derive a label DP randomization\nmechanism that is optimal under a given regression loss function. We prove that\nthe optimal mechanism takes the form of a \"randomized response on bins\", and\npropose an efficient algorithm for finding the optimal bin values. We carry out\na thorough experimental evaluation on several datasets demonstrating the\nefficacy of our algorithm.\n","authors":["Badih Ghazi","Pritish Kamath","Ravi Kumar","Ethan Leeman","Pasin Manurangsi","Avinash Varadarajan","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2212.06074v2.pdf","comment":"Appeared at ICLR '23, 28 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.15656v1","updated":"2023-08-29T22:23:52Z","published":"2023-08-29T22:23:52Z","title":"Deep Reinforcement Learning Based Framework for Mobile Energy\n Disseminator Dispatching to Charge On-the-Road Electric Vehicles","summary":" The exponential growth of electric vehicles (EVs) presents novel challenges\nin preserving battery health and in addressing the persistent problem of\nvehicle range anxiety. To address these concerns, wireless charging,\nparticularly, Mobile Energy Disseminators (MEDs) have emerged as a promising\nsolution. The MED is mounted behind a large vehicle and charges all\nparticipating EVs within a radius upstream of it. Unfortuantely, during such\nV2V charging, the MED and EVs inadvertently form platoons, thereby occupying\nmultiple lanes and impairing overall corridor travel efficiency. In addition,\nconstrained budgets for MED deployment necessitate the development of an\neffective dispatching strategy to determine optimal timing and locations for\nintroducing the MEDs into traffic. This paper proposes a deep reinforcement\nlearning (DRL) based methodology to develop a vehicle dispatching framework. In\nthe first component of the framework, we develop a realistic reinforcement\nlearning environment termed \"ChargingEnv\" which incorporates a reliable\ncharging simulation system that accounts for common practical issues in\nwireless charging deployment, specifically, the charging panel misalignment.\nThe second component, the Proximal-Policy Optimization (PPO) agent, is trained\nto control MED dispatching through continuous interactions with ChargingEnv.\nNumerical experiments were carried out to demonstrate the demonstrate the\nefficacy of the proposed MED deployment decision processor. The experiment\nresults suggest that the proposed model can significantly enhance EV travel\nrange while efficiently deploying a optimal number of MEDs. The proposed model\nis found to be not only practical in its applicability but also has promises of\nreal-world effectiveness. The proposed model can help travelers to maximize EV\nrange and help road agencies or private-sector vendors to manage the deployment\nof MEDs efficiently.\n","authors":["Jiaming Wang","Jiqian Dong","Sikai Chen","Shreyas Sundaram","Samuel Labi"],"pdf_url":"https://arxiv.org/pdf/2308.15656v1.pdf","comment":"Submitted for presentation only at the 2024 Annual Meeting of the\n Transportation Research Board"},{"id":"http://arxiv.org/abs/1904.08576v5","updated":"2023-08-29T22:17:05Z","published":"2019-04-18T02:56:00Z","title":"On Low-rank Trace Regression under General Sampling Distribution","summary":" In this paper, we study the trace regression when a matrix of parameters B*\nis estimated via the convex relaxation of a rank-regularized regression or via\nregularized non-convex optimization. It is known that these estimators satisfy\nnear-optimal error bounds under assumptions on the rank, coherence, and\nspikiness of B*. We start by introducing a general notion of spikiness for B*\nthat provides a generic recipe to prove the restricted strong convexity of the\nsampling operator of the trace regression and obtain near-optimal and\nnon-asymptotic error bounds for the estimation error. Similar to the existing\nliterature, these results require the regularization parameter to be above a\ncertain theory-inspired threshold that depends on observation noise that may be\nunknown in practice. Next, we extend the error bounds to cases where the\nregularization parameter is chosen via cross-validation. This result is\nsignificant in that existing theoretical results on cross-validated estimators\n(Kale et al., 2011; Kumar et al., 2013; Abou-Moustafa and Szepesvari, 2017) do\nnot apply to our setting since the estimators we study are not known to satisfy\ntheir required notion of stability. Finally, using simulations on synthetic and\nreal data, we show that the cross-validated estimator selects a near-optimal\npenalty parameter and outperforms the theory-inspired approach of selecting the\nparameter.\n","authors":["Nima Hamidi","Mohsen Bayati"],"pdf_url":"https://arxiv.org/pdf/1904.08576v5.pdf","comment":"49 pages, 6 figure2"},{"id":"http://arxiv.org/abs/2305.02422v3","updated":"2023-08-29T22:12:04Z","published":"2023-05-03T20:29:04Z","title":"GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content","summary":" The mobile cloud gaming industry has been rapidly growing over the last\ndecade. When streaming gaming videos are transmitted to customers' client\ndevices from cloud servers, algorithms that can monitor distorted video quality\nwithout having any reference video available are desirable tools. However,\ncreating No-Reference Video Quality Assessment (NR VQA) models that can\naccurately predict the quality of streaming gaming videos rendered by computer\ngraphics engines is a challenging problem, since gaming content generally\ndiffers statistically from naturalistic videos, often lacks detail, and\ncontains many smooth regions. Until recently, the problem has been further\ncomplicated by the lack of adequate subjective quality databases of mobile\ngaming content. We have created a new gaming-specific NR VQA model called the\nGaming Video Quality Evaluator (GAMIVAL), which combines and leverages the\nadvantages of spatial and temporal gaming distorted scene statistics models, a\nneural noise model, and deep semantic features. Using a support vector\nregression (SVR) as a regressor, GAMIVAL achieves superior performance on the\nnew LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database.\n","authors":["Yu-Chih Chen","Avinab Saha","Chase Davis","Bo Qiu","Xiaoming Wang","Rahul Gowda","Ioannis Katsavounidis","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2305.02422v3.pdf","comment":"Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been\n made available online: https://github.com/lskdream/GAMIVAL"},{"id":"http://arxiv.org/abs/2308.15651v1","updated":"2023-08-29T22:03:17Z","published":"2023-08-29T22:03:17Z","title":"Ensuring User-side Fairness in Dynamic Recommender Systems","summary":" User-side group fairness is crucial for modern recommender systems, as it\naims to alleviate performance disparity between groups of users defined by\nsensitive attributes such as gender, race, or age. We find that the disparity\ntends to persist or even increase over time. This calls for effective ways to\naddress user-side fairness in a dynamic environment, which has been\ninfrequently explored in the literature. However, fairness-constrained\nre-ranking, a typical method to ensure user-side fairness (i.e., reducing\nperformance disparity), faces two fundamental challenges in the dynamic\nsetting: (1) non-differentiability of the ranking-based fairness constraint,\nwhich hinders the end-to-end training paradigm, and (2) time-inefficiency,\nwhich impedes quick adaptation to changes in user preferences. In this paper,\nwe propose FAir Dynamic rEcommender (FADE), an end-to-end framework with\nfine-tuning strategy to dynamically alleviate performance disparity. To tackle\nthe above challenges, FADE uses a novel fairness loss designed to be\ndifferentiable and lightweight to fine-tune model parameters to ensure both\nuser-side fairness and high-quality recommendations. Via extensive experiments\non the real-world dataset, we empirically demonstrate that FADE effectively and\nefficiently reduces performance disparity, and furthermore, FADE improves\noverall recommendation quality over time compared to not using any new data.\n","authors":["Hyunsik Yoo","Zhichen Zeng","Jian Kang","Zhining Liu","David Zhou","Fei Wang","Eunice Chan","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2308.15651v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.08303v3","updated":"2023-08-29T21:52:58Z","published":"2023-07-17T07:55:47Z","title":"Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language\n Models","summary":" Dense retrieval (DR) converts queries and documents into dense embeddings and\nmeasures the similarity between queries and documents in vector space. One of\nthe challenges in DR is the lack of domain-specific training data. While DR\nmodels can learn from large-scale public datasets like MS MARCO through\ntransfer learning, evidence shows that not all DR models and domains can\nbenefit from transfer learning equally. Recently, some researchers have\nresorted to large language models (LLMs) to improve the zero-shot and few-shot\nDR models. However, the hard prompts or human-written prompts utilized in these\nworks cannot guarantee the good quality of generated weak queries. To tackle\nthis, we propose soft prompt tuning for augmenting DR (SPTAR): For each task,\nwe leverage soft prompt-tuning to optimize a task-specific soft prompt on\nlimited ground truth data and then prompt the LLMs to tag unlabeled documents\nwith weak queries, yielding enough weak document-query pairs to train\ntask-specific dense retrievers. We design a filter to select high-quality\nexample document-query pairs in the prompt to further improve the quality of\nweak tagged queries. To the best of our knowledge, there is no prior work\nutilizing soft prompt tuning to augment DR models. The experiments demonstrate\nthat SPTAR outperforms the unsupervised baselines BM25 and the recently\nproposed LLMs-based augmentation method for DR.\n","authors":["Zhiyuan Peng","Xuyang Wu","Yi Fang"],"pdf_url":"https://arxiv.org/pdf/2307.08303v3.pdf","comment":"fix typos"},{"id":"http://arxiv.org/abs/2308.15647v1","updated":"2023-08-29T21:49:28Z","published":"2023-08-29T21:49:28Z","title":"A General Recipe for Automated Machine Learning in Practice","summary":" Automated Machine Learning (AutoML) is an area of research that focuses on\ndeveloping methods to generate machine learning models automatically. The idea\nof being able to build machine learning models with very little human\nintervention represents a great opportunity for the practice of applied machine\nlearning. However, there is very little information on how to design an AutoML\nsystem in practice. Most of the research focuses on the problems facing\noptimization algorithms and leaves out the details of how that would be done in\npractice. In this paper, we propose a frame of reference for building general\nAutoML systems. Through a narrative review of the main approaches in the area,\nour main idea is to distill the fundamental concepts in order to support them\nin a single design. Finally, we discuss some open problems related to the\napplication of AutoML for future research.\n","authors":["Hernan Ceferino Vazquez"],"pdf_url":"https://arxiv.org/pdf/2308.15647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15642v1","updated":"2023-08-29T21:27:21Z","published":"2023-08-29T21:27:21Z","title":"Clustering Without an Eigengap","summary":" We study graph clustering in the Stochastic Block Model (SBM) in the presence\nof both large clusters and small, unrecoverable clusters. Previous approaches\nachieving exact recovery do not allow any small clusters of size $o(\\sqrt{n})$,\nor require a size gap between the smallest recovered cluster and the largest\nnon-recovered cluster. We provide an algorithm based on semidefinite\nprogramming (SDP) which removes these requirements and provably recovers large\nclusters regardless of the remaining cluster sizes. Mid-sized clusters pose\nunique challenges to the analysis, since their proximity to the recovery\nthreshold makes them highly sensitive to small noise perturbations and\nprecludes a closed-form candidate solution. We develop novel techniques,\nincluding a leave-one-out-style argument which controls the correlation between\nSDP solutions and noise vectors even when the removal of one row of noise can\ndrastically change the SDP solution. We also develop improved eigenvalue\nperturbation bounds of potential independent interest. Using our gap-free\nclustering procedure, we obtain efficient algorithms for the problem of\nclustering with a faulty oracle with superior query complexities, notably\nachieving $o(n^2)$ sample complexity even in the presence of a large number of\nsmall clusters. Our gap-free clustering procedure also leads to improved\nalgorithms for recursive clustering. Our results extend to certain\nheterogeneous probability settings that are challenging for alternative\nalgorithms.\n","authors":["Matthew Zurek","Yudong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15642v1.pdf","comment":"68 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.15640v1","updated":"2023-08-29T21:25:24Z","published":"2023-08-29T21:25:24Z","title":"Identifying Constitutive Parameters for Complex Hyperelastic Solids\n using Physics-Informed Neural Networks","summary":" Identifying constitutive parameters in engineering and biological materials,\nparticularly those with intricate geometries and mechanical behaviors, remains\na longstanding challenge. The recent advent of Physics-Informed Neural Networks\n(PINNs) offers promising solutions, but current frameworks are often limited to\nbasic constitutive laws and encounter practical constraints when combined with\nexperimental data. In this paper, we introduce a new PINN-based framework\ndesigned to identify material parameters for soft materials, specifically those\nexhibiting complex constitutive behaviors, under large deformation in plane\nstress conditions. Distinctively, our model emphasizes training PINNs with\nmulti-modal time-dependent experimental datasets consisting of full-field\ndeformation and loading history, ensuring algorithm robustness even amidst\nnoisy data. Our results reveal that our framework can accurately identify\nconstitutive parameters of the incompressible Arruda-Boyce model for samples\nwith intricate geometries, maintaining an error below 5%, even with an\nexperimental noise level of 5%. We believe our framework sets the stage for a\ntransformative approach in modulus identification for complex solids,\nespecially for those with geometrical and constitutive intricate.\n","authors":["Siyuan Song","Hanxun Jin"],"pdf_url":"https://arxiv.org/pdf/2308.15640v1.pdf","comment":"31 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.09884v2","updated":"2023-08-29T21:23:03Z","published":"2023-08-19T02:30:35Z","title":"A Transformer-based Framework For Multi-variate Time Series: A Remaining\n Useful Life Prediction Use Case","summary":" In recent times, Large Language Models (LLMs) have captured a global\nspotlight and revolutionized the field of Natural Language Processing. One of\nthe factors attributed to the effectiveness of LLMs is the model architecture\nused for training, transformers. Transformer models excel at capturing\ncontextual features in sequential data since time series data are sequential,\ntransformer models can be leveraged for more efficient time series data\nprediction. The field of prognostics is vital to system health management and\nproper maintenance planning. A reliable estimation of the remaining useful life\n(RUL) of machines holds the potential for substantial cost savings. This\nincludes avoiding abrupt machine failures, maximizing equipment usage, and\nserving as a decision support system (DSS). This work proposed an\nencoder-transformer architecture-based framework for multivariate time series\nprediction for a prognostics use case. We validated the effectiveness of the\nproposed framework on all four sets of the C-MAPPS benchmark dataset for the\nremaining useful life prediction task. To effectively transfer the knowledge\nand application of transformers from the natural language domain to time\nseries, three model-specific experiments were conducted. Also, to enable the\nmodel awareness of the initial stages of the machine life and its degradation\npath, a novel expanding window method was proposed for the first time in this\nwork, it was compared with the sliding window method, and it led to a large\nimprovement in the performance of the encoder transformer model. Finally, the\nperformance of the proposed encoder-transformer model was evaluated on the test\ndataset and compared with the results from 13 other state-of-the-art (SOTA)\nmodels in the literature and it outperformed them all with an average\nperformance increase of 137.65% over the next best model across all the\ndatasets.\n","authors":["Oluwaseyi Ogunfowora","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2308.09884v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15639v1","updated":"2023-08-29T21:20:16Z","published":"2023-08-29T21:20:16Z","title":"Hyperbolic Convolutional Neural Networks","summary":" Deep Learning is mostly responsible for the surge of interest in Artificial\nIntelligence in the last decade. So far, deep learning researchers have been\nparticularly successful in the domain of image processing, where Convolutional\nNeural Networks are used. Although excelling at image classification,\nConvolutional Neural Networks are quite naive in that no inductive bias is set\non the embedding space for images. Similar flaws are also exhibited by another\ntype of Convolutional Networks - Graph Convolutional Neural Networks. However,\nusing non-Euclidean space for embedding data might result in more robust and\nexplainable models. One example of such a non-Euclidean space is hyperbolic\nspace. Hyperbolic spaces are particularly useful due to their ability to fit\nmore data in a low-dimensional space and tree-likeliness properties. These\nattractive properties have been previously used in multiple papers which\nindicated that they are beneficial for building hierarchical embeddings using\nshallow models and, recently, using MLPs and RNNs.\n However, no papers have yet suggested a general approach to using Hyperbolic\nConvolutional Neural Networks for structured data processing, although these\nare the most common examples of data used. Therefore, the goal of this work is\nto devise a general recipe for building Hyperbolic Convolutional Neural\nNetworks. We hypothesize that ability of hyperbolic space to capture hierarchy\nin the data would lead to better performance. This ability should be\nparticularly useful in cases where data has a tree-like structure. Since this\nis the case for many existing datasets \\citep{wordnet, imagenet, fb15k}, we\nargue that such a model would be advantageous both in terms of applications and\nfuture research prospects.\n","authors":["Andrii Skliar","Maurice Weiler"],"pdf_url":"https://arxiv.org/pdf/2308.15639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09175v2","updated":"2023-08-29T20:33:12Z","published":"2023-08-17T20:27:33Z","title":"Diversifying AI: Towards Creative Chess with AlphaZero","summary":" In recent years, Artificial Intelligence (AI) systems have surpassed human\nintelligence in a variety of computational tasks. However, AI systems, like\nhumans, make mistakes, have blind spots, hallucinate, and struggle to\ngeneralize to new situations. This work explores whether AI can benefit from\ncreative decision-making mechanisms when pushed to the limits of its\ncomputational rationality. In particular, we investigate whether a team of\ndiverse AI systems can outperform a single AI in challenging tasks by\ngenerating more ideas as a group and then selecting the best ones. We study\nthis question in the game of chess, the so-called drosophila of AI. We build on\nAlphaZero (AZ) and extend it to represent a league of agents via a\nlatent-conditioned architecture, which we call AZ_db. We train AZ_db to\ngenerate a wider range of ideas using behavioral diversity techniques and\nselect the most promising ones with sub-additive planning. Our experiments\nsuggest that AZ_db plays chess in diverse ways, solves more puzzles as a group\nand outperforms a more homogeneous team. Notably, AZ_db solves twice as many\nchallenging puzzles as AZ, including the challenging Penrose positions. When\nplaying chess from different openings, we notice that players in AZ_db\nspecialize in different openings, and that selecting a player for each opening\nusing sub-additive planning results in a 50 Elo improvement over AZ. Our\nfindings suggest that diversity bonuses emerge in teams of AI agents, just as\nthey do in teams of humans and that diversity is a valuable asset in solving\ncomputationally hard problems.\n","authors":["Tom Zahavy","Vivek Veeriah","Shaobo Hou","Kevin Waugh","Matthew Lai","Edouard Leurent","Nenad Tomasev","Lisa Schut","Demis Hassabis","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2308.09175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15618v1","updated":"2023-08-29T20:25:49Z","published":"2023-08-29T20:25:49Z","title":"RACR-MIL: Weakly Supervised Skin Cancer Grading using Rank-Aware\n Contextual Reasoning on Whole Slide Images","summary":" Cutaneous squamous cell cancer (cSCC) is the second most common skin cancer\nin the US. It is diagnosed by manual multi-class tumor grading using a tissue\nwhole slide image (WSI), which is subjective and suffers from inter-pathologist\nvariability. We propose an automated weakly-supervised grading approach for\ncSCC WSIs that is trained using WSI-level grade and does not require\nfine-grained tumor annotations. The proposed model, RACR-MIL, transforms each\nWSI into a bag of tiled patches and leverages attention-based multiple-instance\nlearning to assign a WSI-level grade. We propose three key innovations to\naddress general as well as cSCC-specific challenges in tumor grading. First, we\nleverage spatial and semantic proximity to define a WSI graph that encodes both\nlocal and non-local dependencies between tumor regions and leverage graph\nattention convolution to derive contextual patch features. Second, we introduce\na novel ordinal ranking constraint on the patch attention network to ensure\nthat higher-grade tumor regions are assigned higher attention. Third, we use\ntumor depth as an auxiliary task to improve grade classification in a multitask\nlearning framework. RACR-MIL achieves 2-9% improvement in grade classification\nover existing weakly-supervised approaches on a dataset of 718 cSCC tissue\nimages and localizes the tumor better. The model achieves 5-20% higher accuracy\nin difficult-to-classify high-risk grade classes and is robust to class\nimbalance.\n","authors":["Anirudh Choudhary","Angelina Hwang","Jacob Kechter","Krishnakant Saboo","Blake Bordeaux","Puneet Bhullar","Nneka Comfere","David DiCaudo","Steven Nelson","Emma Johnson","Leah Swanson","Dennis Murphree","Aaron Mangold","Ravishankar K. Iyer"],"pdf_url":"https://arxiv.org/pdf/2308.15618v1.pdf","comment":"7 pages main text, 2 page references, 3 page appendix; submitted to\n AAAI"},{"id":"http://arxiv.org/abs/2308.15614v1","updated":"2023-08-29T20:14:42Z","published":"2023-08-29T20:14:42Z","title":"Everything Perturbed All at Once: Enabling Differentiable Graph Attacks","summary":" As powerful tools for representation learning on graphs, graph neural\nnetworks (GNNs) have played an important role in applications including social\nnetworks, recommendation systems, and online web services. However, GNNs have\nbeen shown to be vulnerable to adversarial attacks, which can significantly\ndegrade their effectiveness. Recent state-of-the-art approaches in adversarial\nattacks rely on gradient-based meta-learning to selectively perturb a single\nedge with the highest attack score until they reach the budget constraint.\nWhile effective in identifying vulnerable links, these methods are plagued by\nhigh computational costs. By leveraging continuous relaxation and\nparameterization of the graph structure, we propose a novel attack method\ncalled Differentiable Graph Attack (DGA) to efficiently generate effective\nattacks and meanwhile eliminate the need for costly retraining. Compared to the\nstate-of-the-art, DGA achieves nearly equivalent attack performance with 6\ntimes less training time and 11 times smaller GPU memory footprint on different\nbenchmark datasets. Additionally, we provide extensive experimental analyses of\nthe transferability of the DGA among different graph models, as well as its\nrobustness against widely-used defense mechanisms.\n","authors":["Haoran Liu","Bokun Wang","Jianling Wang","Xiangjue Dong","Tianbao Yang","James Caverlee"],"pdf_url":"https://arxiv.org/pdf/2308.15614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15613v1","updated":"2023-08-29T20:13:37Z","published":"2023-08-29T20:13:37Z","title":"Mixed Variational Flows for Discrete Variables","summary":" Variational flows allow practitioners to learn complex continuous\ndistributions, but approximating discrete distributions remains a challenge.\nCurrent methodologies typically embed the discrete target in a continuous space\n- usually via continuous relaxation or dequantization - and then apply a\ncontinuous flow. These approaches involve a surrogate target that may not\ncapture the original discrete target, might have biased or unstable gradients,\nand can create a difficult optimization problem. In this work, we develop a\nvariational flow family for discrete distributions without any continuous\nembedding. First, we develop a measure-preserving and discrete (MAD) invertible\nmap that leaves the discrete target invariant, and then create a mixed\nvariational flow (MAD Mix) based on that map. We also develop an extension to\nMAD Mix that handles joint discrete and continuous models. Our experiments\nsuggest that MAD Mix produces more reliable approximations than\ncontinuous-embedding flows while being significantly faster to train.\n","authors":["Gian Carlo Diluvi","Benjamin Bloem-Reddy","Trevor Campbell"],"pdf_url":"https://arxiv.org/pdf/2308.15613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15609v1","updated":"2023-08-29T20:02:24Z","published":"2023-08-29T20:02:24Z","title":"InstaTune: Instantaneous Neural Architecture Search During Fine-Tuning","summary":" One-Shot Neural Architecture Search (NAS) algorithms often rely on training a\nhardware agnostic super-network for a domain specific task. Optimal\nsub-networks are then extracted from the trained super-network for different\nhardware platforms. However, training super-networks from scratch can be\nextremely time consuming and compute intensive especially for large models that\nrely on a two-stage training process of pre-training and fine-tuning. State of\nthe art pre-trained models are available for a wide range of tasks, but their\nlarge sizes significantly limits their applicability on various hardware\nplatforms. We propose InstaTune, a method that leverages off-the-shelf\npre-trained weights for large models and generates a super-network during the\nfine-tuning stage. InstaTune has multiple benefits. Firstly, since the process\nhappens during fine-tuning, it minimizes the overall time and compute resources\nrequired for NAS. Secondly, the sub-networks extracted are optimized for the\ntarget task, unlike prior work that optimizes on the pre-training objective.\nFinally, InstaTune is easy to \"plug and play\" in existing frameworks. By using\nmulti-objective evolutionary search algorithms along with lightly trained\npredictors, we find Pareto-optimal sub-networks that outperform their\nrespective baselines across different performance objectives such as accuracy\nand MACs. Specifically, we demonstrate that our approach performs well across\nboth unimodal (ViT and BERT) and multi-modal (BEiT-3) transformer based\narchitectures.\n","authors":["Sharath Nittur Sridhar","Souvik Kundu","Sairam Sundaresan","Maciej Szankin","Anthony Sarah"],"pdf_url":"https://arxiv.org/pdf/2308.15609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15605v1","updated":"2023-08-29T19:54:37Z","published":"2023-08-29T19:54:37Z","title":"Measurement Tampering Detection Benchmark","summary":" When training powerful AI systems to perform complex tasks, it may be\nchallenging to provide training signals which are robust to optimization. One\nconcern is measurement tampering, where the AI system manipulates multiple\nmeasurements to create the illusion of good results instead of achieving the\ndesired outcome. In this work, we build four new text-based datasets to\nevaluate measurement tampering detection techniques on large language models.\nConcretely, given sets of text inputs and measurements aimed at determining if\nsome outcome occurred, as well as a base model able to accurately predict\nmeasurements, the goal is to determine if examples where all measurements\nindicate the outcome actually had the outcome occur, or if this was caused by\nmeasurement tampering. We demonstrate techniques that outperform simple\nbaselines on most datasets, but don't achieve maximum performance. We believe\nthere is significant room for improvement for both techniques and datasets, and\nwe are excited for future work tackling measurement tampering.\n","authors":["Fabien Roger","Ryan Greenblatt","Max Nadeau","Buck Shlegeris","Nate Thomas"],"pdf_url":"https://arxiv.org/pdf/2308.15605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14935v2","updated":"2023-08-29T19:52:02Z","published":"2022-11-27T21:00:31Z","title":"RecXplainer: Amortized Attribute-based Personalized Explanations for\n Recommender Systems","summary":" Recommender systems influence many of our interactions in the digital world\n-- impacting how we shop for clothes, sorting what we see when browsing YouTube\nor TikTok, and determining which restaurants and hotels we are shown when using\nhospitality platforms. Modern recommender systems are large, opaque models\ntrained on a mixture of proprietary and open-source datasets. Naturally, issues\nof trust arise on both the developer and user side: is the system working\ncorrectly, and why did a user receive (or not receive) a particular\nrecommendation? Providing an explanation alongside a recommendation alleviates\nsome of these concerns. The status quo for auxiliary recommender system\nfeedback is either user-specific explanations (e.g., \"users who bought item B\nalso bought item A\") or item-specific explanations (e.g., \"we are recommending\nitem A because you watched/bought item B\"). However, users bring personalized\ncontext into their search experience, valuing an item as a function of that\nitem's attributes and their own personal preferences. In this work, we propose\nRecXplainer, a novel method for generating fine-grained explanations based on a\nuser's preferences over the attributes of recommended items. We evaluate\nRecXplainer on five real-world and large-scale recommendation datasets using\nfive different kinds of recommender systems to demonstrate the efficacy of\nRecXplainer in capturing users' preferences over item attributes and using them\nto explain recommendations. We also compare RecXplainer to five baselines and\nshow RecXplainer's exceptional performance on ten metrics.\n","authors":["Sahil Verma","Chirag Shah","John P. Dickerson","Anurag Beniwal","Narayanan Sadagopan","Arjun Seshadri"],"pdf_url":"https://arxiv.org/pdf/2211.14935v2.pdf","comment":"Awarded the Best Student Paper at TEA Workshop at NeurIPS 2022"},{"id":"http://arxiv.org/abs/2308.15602v1","updated":"2023-08-29T19:47:31Z","published":"2023-08-29T19:47:31Z","title":"An Experimental Comparison of Partitioning Strategies for Distributed\n Graph Neural Network Training","summary":" Recently, graph neural networks (GNNs) have gained much attention as a\ngrowing area of deep learning capable of learning on graph-structured data.\nHowever, the computational and memory requirements for training GNNs on\nlarge-scale graphs can exceed the capabilities of single machines or GPUs,\nmaking distributed GNN training a promising direction for large-scale GNN\ntraining. A prerequisite for distributed GNN training is to partition the input\ngraph into smaller parts that are distributed among multiple machines of a\ncompute cluster. Although graph partitioning has been extensively studied with\nregard to graph analytics and graph databases, its effect on GNN training\nperformance is largely unexplored.\n In this paper, we study the effectiveness of graph partitioning for\ndistributed GNN training. Our study aims to understand how different factors\nsuch as GNN parameters, mini-batch size, graph type, features size, and\nscale-out factor influence the effectiveness of graph partitioning. We conduct\nexperiments with two different GNN systems using vertex and edge partitioning.\nWe found that graph partitioning is a crucial pre-processing step that can\nheavily reduce the training time and memory footprint. Furthermore, our results\nshow that invested partitioning time can be amortized by reduced GNN training,\nmaking it a relevant optimization.\n","authors":["Nikolai Merkel","Daniel Stoll","Ruben Mayer","Hans-Arno Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2308.15602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15594v1","updated":"2023-08-29T19:38:41Z","published":"2023-08-29T19:38:41Z","title":"Can transformers learn the greatest common divisor?","summary":" I investigate the capability of small transformers to compute the greatest\ncommon divisor (GCD) of two positive integers. When the training distribution\nand the representation base are carefully chosen, models achieve 98% accuracy\nand correctly predict 91 of the 100 first GCD. Model predictions are\ndeterministic and fully interpretable. During training, the models learn to\ncluster input pairs with the same GCD, and classify them by their divisors.\nBasic models, trained from uniform operands encoded on small bases, only\ncompute a handful of GCD (up to 38 out of 100): the products of divisors of the\nbase. Longer training and larger bases allow some models to \"grok\" small prime\nGCD. Training from log-uniform operands boosts performance to 73 correct GCD,\nand balancing the training distribution of GCD, from inverse square to\nlog-uniform, to 91 GCD. Training models from a uniform distribution of GCD\nbreaks the deterministic model behavior.\n","authors":["François Charton"],"pdf_url":"https://arxiv.org/pdf/2308.15594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.12871v3","updated":"2023-08-29T19:21:50Z","published":"2022-09-26T17:39:53Z","title":"Variationally Mimetic Operator Networks","summary":" In recent years operator networks have emerged as promising deep learning\ntools for approximating the solution to partial differential equations (PDEs).\nThese networks map input functions that describe material properties, forcing\nfunctions and boundary data to the solution of a PDE. This work describes a new\narchitecture for operator networks that mimics the form of the numerical\nsolution obtained from an approximate variational or weak formulation of the\nproblem. The application of these ideas to a generic elliptic PDE leads to a\nvariationally mimetic operator network (VarMiON). Like the conventional Deep\nOperator Network (DeepONet) the VarMiON is also composed of a sub-network that\nconstructs the basis functions for the output and another that constructs the\ncoefficients for these basis functions. However, in contrast to the DeepONet,\nthe architecture of these sub-networks in the VarMiON is precisely determined.\nAn analysis of the error in the VarMiON solution reveals that it contains\ncontributions from the error in the training data, the training error, the\nquadrature error in sampling input and output functions, and a \"covering error\"\nthat measures the distance between the test input functions and the nearest\nfunctions in the training dataset. It also depends on the stability constants\nfor the exact solution operator and its VarMiON approximation. The application\nof the VarMiON to a canonical elliptic PDE and a nonlinear PDE reveals that for\napproximately the same number of network parameters, on average the VarMiON\nincurs smaller errors than a standard DeepONet and a recently proposed\nmultiple-input operator network (MIONet). Further, its performance is more\nrobust to variations in input functions, the techniques used to sample the\ninput and output functions, the techniques used to construct the basis\nfunctions, and the number of input functions.\n","authors":["Dhruv Patel","Deep Ray","Michael R. A. Abdelmalik","Thomas J. R. Hughes","Assad A. Oberai"],"pdf_url":"https://arxiv.org/pdf/2209.12871v3.pdf","comment":"49 pages, 18 figures, 1 Appendix"},{"id":"http://arxiv.org/abs/2308.15575v1","updated":"2023-08-29T19:04:42Z","published":"2023-08-29T19:04:42Z","title":"Prototype Fission: Closing Set for Robust Open-set Semi-supervised\n Learning","summary":" Semi-supervised Learning (SSL) has been proven vulnerable to\nout-of-distribution (OOD) samples in realistic large-scale unsupervised\ndatasets due to over-confident pseudo-labeling OODs as in-distribution (ID). A\nkey underlying problem is class-wise latent space spreading from closed seen\nspace to open unseen space, and the bias is further magnified in SSL's\nself-training loops. To close the ID distribution set so that OODs are better\nrejected for safe SSL, we propose Prototype Fission(PF) to divide class-wise\nlatent spaces into compact sub-spaces by automatic fine-grained latent space\nmining, driven by coarse-grained labels only. Specifically, we form multiple\nunique learnable sub-class prototypes for each class, optimized towards both\ndiversity and consistency. The Diversity Modeling term encourages samples to be\nclustered by one of the multiple sub-class prototypes, while the Consistency\nModeling term clusters all samples of the same class to a global prototype.\nInstead of \"opening set\", i.e., modeling OOD distribution, Prototype Fission\n\"closes set\" and makes it hard for OOD samples to fit in sub-class latent\nspace. Therefore, PF is compatible with existing methods for further\nperformance gains. Extensive experiments validate the effectiveness of our\nmethod in open-set SSL settings in terms of successfully forming sub-classes,\ndiscriminating OODs from IDs and improving overall accuracy. Codes will be\nreleased.\n","authors":["Xuwei Tan","Yi-Jie Huang","Yaqian Li"],"pdf_url":"https://arxiv.org/pdf/2308.15575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15564v1","updated":"2023-08-29T18:36:21Z","published":"2023-08-29T18:36:21Z","title":"Learning Sequential Information in Task-based fMRI for Synthetic Data\n Augmentation","summary":" Insufficiency of training data is a persistent issue in medical image\nanalysis, especially for task-based functional magnetic resonance images (fMRI)\nwith spatio-temporal imaging data acquired using specific cognitive tasks. In\nthis paper, we propose an approach for generating synthetic fMRI sequences that\ncan then be used to create augmented training datasets in downstream learning\ntasks. To synthesize high-resolution task-specific fMRI, we adapt the\n$\\alpha$-GAN structure, leveraging advantages of both GAN and variational\nautoencoder models, and propose different alternatives in aggregating temporal\ninformation. The synthetic images are evaluated from multiple perspectives\nincluding visualizations and an autism spectrum disorder (ASD) classification\ntask. The results show that the synthetic task-based fMRI can provide effective\ndata augmentation in learning the ASD classification task.\n","authors":["Jiyao Wang","Nicha C. Dvornek","Lawrence H. Staib","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2308.15564v1.pdf","comment":"Accepted by Machine Learning in Clinical Neuroimaging 2023 (MICCAI\n workshop), preprint version"},{"id":"http://arxiv.org/abs/2308.15559v1","updated":"2023-08-29T18:29:56Z","published":"2023-08-29T18:29:56Z","title":"Glocal Explanations of Expected Goal Models in Soccer","summary":" The expected goal models have gained popularity, but their interpretability\nis often limited, especially when trained using black-box methods. Explainable\nartificial intelligence tools have emerged to enhance model transparency and\nextract descriptive knowledge for a single observation or for all observations.\nHowever, explaining black-box models for a specific group of observations may\nbe more useful in some domains. This paper introduces the glocal explanations\n(between local and global levels) of the expected goal models to enable\nperformance analysis at the team and player levels by proposing the use of\naggregated versions of the SHAP values and partial dependence profiles. This\nallows knowledge to be extracted from the expected goal model for a player or\nteam rather than just a single shot. In addition, we conducted real-data\napplications to illustrate the usefulness of aggregated SHAP and aggregated\nprofiles. The paper concludes with remarks on the potential of these\nexplanations for performance analysis in soccer analytics.\n","authors":["Mustafa Cavus","Adrian Stando","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2308.15559v1.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.13399v2","updated":"2023-08-29T18:28:13Z","published":"2023-08-25T14:23:40Z","title":"EntropyRank: Unsupervised Keyphrase Extraction via Side-Information\n Optimization for Language Model-based Text Compression","summary":" We propose an unsupervised method to extract keywords and keyphrases from\ntexts based on a pre-trained language model (LM) and Shannon's information\nmaximization. Specifically, our method extracts phrases having the highest\nconditional entropy under the LM. The resulting set of keyphrases turns out to\nsolve a relevant information-theoretic problem: if provided as side\ninformation, it leads to the expected minimal binary code length in compressing\nthe text using the LM and an entropy encoder. Alternately, the resulting set is\nan approximation via a causal LM to the set of phrases that minimize the\nentropy of the text when conditioned upon it. Empirically, the method provides\nresults comparable to the most commonly used methods in various keyphrase\nextraction benchmark challenges.\n","authors":["Alexander Tsvetkov","Alon Kipnis"],"pdf_url":"https://arxiv.org/pdf/2308.13399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07522v3","updated":"2023-08-29T18:24:44Z","published":"2023-07-09T21:16:56Z","title":"The Future of Fundamental Science Led by Generative Closed-Loop\n Artificial Intelligence","summary":" Recent advances in machine learning and AI, including Generative AI and LLMs,\nare disrupting technological innovation, product development, and society as a\nwhole. AI's contribution to technology can come from multiple approaches that\nrequire access to large training data sets and clear performance evaluation\ncriteria, ranging from pattern recognition and classification to generative\nmodels. Yet, AI has contributed less to fundamental science in part because\nlarge data sets of high-quality data for scientific practice and model\ndiscovery are more difficult to access. Generative AI, in general, and Large\nLanguage Models in particular, may represent an opportunity to augment and\naccelerate the scientific discovery of fundamental deep science with\nquantitative models. Here we explore and investigate aspects of an AI-driven,\nautomated, closed-loop approach to scientific discovery, including self-driven\nhypothesis generation and open-ended autonomous exploration of the hypothesis\nspace. Integrating AI-driven automation into the practice of science would\nmitigate current problems, including the replication of findings, systematic\nproduction of data, and ultimately democratisation of the scientific process.\nRealising these possibilities requires a vision for augmented AI coupled with a\ndiversity of AI approaches able to deal with fundamental aspects of causality\nanalysis and model discovery while enabling unbiased search across the space of\nputative explanations. These advances hold the promise to unleash AI's\npotential for searching and discovering the fundamental structure of our world\nbeyond what human scientists have been able to achieve. Such a vision would\npush the boundaries of new fundamental science rather than automatize current\nworkflows and instead open doors for technological innovation to tackle some of\nthe greatest challenges facing humanity today.\n","authors":["Hector Zenil","Jesper Tegnér","Felipe S. Abrahão","Alexander Lavin","Vipin Kumar","Jeremy G. Frey","Adrian Weller","Larisa Soldatova","Alan R. Bundy","Nicholas R. Jennings","Koichi Takahashi","Lawrence Hunter","Saso Dzeroski","Andrew Briggs","Frederick D. Gregory","Carla P. Gomes","Jon Rowe","James Evans","Hiroaki Kitano","Ross King"],"pdf_url":"https://arxiv.org/pdf/2307.07522v3.pdf","comment":"35 pages, first draft of the final report from the Alan Turing\n Institute on AI for Scientific Discovery"},{"id":"http://arxiv.org/abs/2308.15553v1","updated":"2023-08-29T18:19:36Z","published":"2023-08-29T18:19:36Z","title":"Dimensionality Reduction Using pseudo-Boolean polynomials For Cluster\n Analysis","summary":" We introduce usage of a reduction property of penalty-based formulation of\npseudo-Boolean polynomials as a mechanism for invariant dimensionality\nreduction in cluster analysis processes. In our experiments, we show that\nmultidimensional data, like 4-dimensional Iris Flower dataset can be reduced to\n2-dimensional space while the 30-dimensional Wisconsin Diagnostic Breast Cancer\n(WDBC) dataset can be reduced to 3-dimensional space, and by searching lines or\nplanes that lie between reduced samples we can extract clusters in a linear and\nunbiased manner with competitive accuracies, reproducibility and clear\ninterpretation.\n","authors":["Tendai Mapungwana Chikake","Boris Goldengorin"],"pdf_url":"https://arxiv.org/pdf/2308.15553v1.pdf","comment":"14 pages, 4 figures, submitted to the International Conference Data\n Analysis, Optimization and Their Applications on the Occasion of Boris\n Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region,\n Moscow Institute of Physics and Technology\n https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php"},{"id":"http://arxiv.org/abs/2308.15552v1","updated":"2023-08-29T18:18:21Z","published":"2023-08-29T18:18:21Z","title":"Pure Exploration under Mediators' Feedback","summary":" Stochastic multi-armed bandits are a sequential-decision-making framework,\nwhere, at each interaction step, the learner selects an arm and observes a\nstochastic reward. Within the context of best-arm identification (BAI)\nproblems, the goal of the agent lies in finding the optimal arm, i.e., the one\nwith highest expected reward, as accurately and efficiently as possible.\nNevertheless, the sequential interaction protocol of classical BAI problems,\nwhere the agent has complete control over the arm being pulled at each round,\ndoes not effectively model several decision-making problems of interest (e.g.,\noff-policy learning, partially controllable environments, and human feedback).\nFor this reason, in this work, we propose a novel strict generalization of the\nclassical BAI problem that we refer to as best-arm identification under\nmediators' feedback (BAI-MF). More specifically, we consider the scenario in\nwhich the learner has access to a set of mediators, each of which selects the\narms on the agent's behalf according to a stochastic and possibly unknown\npolicy. The mediator, then, communicates back to the agent the pulled arm\ntogether with the observed reward. In this setting, the agent's goal lies in\nsequentially choosing which mediator to query to identify with high probability\nthe optimal arm while minimizing the identification time, i.e., the sample\ncomplexity. To this end, we first derive and analyze a statistical lower bound\non the sample complexity specific to our general mediator feedback scenario.\nThen, we propose a sequential decision-making strategy for discovering the best\narm under the assumption that the mediators' policies are known to the learner.\nAs our theory verifies, this algorithm matches the lower bound both almost\nsurely and in expectation. Finally, we extend these results to cases where the\nmediators' policies are unknown to the learner obtaining comparable results.\n","authors":["Riccardo Poiani","Alberto Maria Metelli","Marcello Restelli"],"pdf_url":"https://arxiv.org/pdf/2308.15552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15550v1","updated":"2023-08-29T18:17:35Z","published":"2023-08-29T18:17:35Z","title":"Adversarial Style Transfer for Robust Policy Optimization in Deep\n Reinforcement Learning","summary":" This paper proposes an algorithm that aims to improve generalization for\nreinforcement learning agents by removing overfitting to confounding features.\nOur approach consists of a max-min game theoretic objective. A generator\ntransfers the style of observation during reinforcement learning. An additional\ngoal of the generator is to perturb the observation, which maximizes the\nagent's probability of taking a different action. In contrast, a policy network\nupdates its parameters to minimize the effect of such perturbations, thus\nstaying robust while maximizing the expected future reward. Based on this\nsetup, we propose a practical deep reinforcement learning algorithm,\nAdversarial Robust Policy Optimization (ARPO), to find a robust policy that\ngeneralizes to unseen environments. We evaluate our approach on Procgen and\nDistracting Control Suite for generalization and sample efficiency.\nEmpirically, ARPO shows improved performance compared to a few baseline\nalgorithms, including data augmentation.\n","authors":["Md Masudur Rahman","Yexiang Xue"],"pdf_url":"https://arxiv.org/pdf/2308.15550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15513v1","updated":"2023-08-29T16:24:11Z","published":"2023-08-29T16:24:11Z","title":"Tuning the perplexity for and computing sampling-based t-SNE embeddings","summary":" Widely used pipelines for the analysis of high-dimensional data utilize\ntwo-dimensional visualizations. These are created, e.g., via t-distributed\nstochastic neighbor embedding (t-SNE). When it comes to large data sets,\napplying these visualization techniques creates suboptimal embeddings, as the\nhyperparameters are not suitable for large data. Cranking up these parameters\nusually does not work as the computations become too expensive for practical\nworkflows. In this paper, we argue that a sampling-based embedding approach can\ncircumvent these problems. We show that hyperparameters must be chosen\ncarefully, depending on the sampling rate and the intended final embedding.\nFurther, we show how this approach speeds up the computation and increases the\nquality of the embeddings.\n","authors":["Martin Skrodzki","Nicolas Chaves-de-Plaza","Klaus Hildebrandt","Thomas Höllt","Elmar Eisemann"],"pdf_url":"https://arxiv.org/pdf/2308.15513v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2305.02422v3","updated":"2023-08-29T22:12:04Z","published":"2023-05-03T20:29:04Z","title":"GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content","summary":" The mobile cloud gaming industry has been rapidly growing over the last\ndecade. When streaming gaming videos are transmitted to customers' client\ndevices from cloud servers, algorithms that can monitor distorted video quality\nwithout having any reference video available are desirable tools. However,\ncreating No-Reference Video Quality Assessment (NR VQA) models that can\naccurately predict the quality of streaming gaming videos rendered by computer\ngraphics engines is a challenging problem, since gaming content generally\ndiffers statistically from naturalistic videos, often lacks detail, and\ncontains many smooth regions. Until recently, the problem has been further\ncomplicated by the lack of adequate subjective quality databases of mobile\ngaming content. We have created a new gaming-specific NR VQA model called the\nGaming Video Quality Evaluator (GAMIVAL), which combines and leverages the\nadvantages of spatial and temporal gaming distorted scene statistics models, a\nneural noise model, and deep semantic features. Using a support vector\nregression (SVR) as a regressor, GAMIVAL achieves superior performance on the\nnew LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database.\n","authors":["Yu-Chih Chen","Avinab Saha","Chase Davis","Bo Qiu","Xiaoming Wang","Rahul Gowda","Ioannis Katsavounidis","Alan C. Bovik"],"pdf_url":"https://arxiv.org/pdf/2305.02422v3.pdf","comment":"Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been\n made available online: https://github.com/lskdream/GAMIVAL"},{"id":"http://arxiv.org/abs/2308.15502v1","updated":"2023-08-29T10:41:34Z","published":"2023-08-29T10:41:34Z","title":"On the Steganographic Capacity of Selected Learning Models","summary":" Machine learning and deep learning models are potential vectors for various\nattack scenarios. For example, previous research has shown that malware can be\nhidden in deep learning models. Hiding information in a learning model can be\nviewed as a form of steganography. In this research, we consider the general\nquestion of the steganographic capacity of learning models. Specifically, for a\nwide range of models, we determine the number of low-order bits of the trained\nparameters that can be overwritten, without adversely affecting model\nperformance. For each model considered, we graph the accuracy as a function of\nthe number of low-order bits that have been overwritten, and for selected\nmodels, we also analyze the steganographic capacity of individual layers. The\nmodels that we test include the classic machine learning techniques of Linear\nRegression (LR) and Support Vector Machine (SVM); the popular general deep\nlearning models of Multilayer Perceptron (MLP) and Convolutional Neural Network\n(CNN); the highly-successful Recurrent Neural Network (RNN) architecture of\nLong Short-Term Memory (LSTM); the pre-trained transfer learning-based models\nVGG16, DenseNet121, InceptionV3, and Xception; and, finally, an Auxiliary\nClassifier Generative Adversarial Network (ACGAN). In all cases, we find that a\nmajority of the bits of each trained parameter can be overwritten before the\naccuracy degrades. Of the models tested, the steganographic capacity ranges\nfrom 7.04 KB for our LR experiments, to 44.74 MB for InceptionV3. We discuss\nthe implications of our results and consider possible avenues for further\nresearch.\n","authors":["Rishit Agrawal","Kelvin Jou","Tanush Obili","Daksh Parikh","Samarth Prajapati","Yash Seth","Charan Sridhar","Nathan Zhang","Mark Stamp"],"pdf_url":"https://arxiv.org/pdf/2308.15502v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2306.17189"}]},"2023-08-30T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.06032v2","updated":"2023-08-30T17:57:52Z","published":"2023-08-11T09:23:11Z","title":"Large Language Models in Cryptocurrency Securities Cases: Can ChatGPT\n Replace Lawyers?","summary":" Large Language Models (LLMs) could enhance access to the legal system.\nHowever, empirical research on their effectiveness in conducting legal tasks is\nscant. We study securities cases involving cryptocurrencies as one of numerous\ncontexts where AI could support the legal process, studying LLMs' legal\nreasoning and drafting capabilities. We examine whether a) an LLM can\naccurately determine which laws are potentially being violated from a fact\npattern, and b) whether there is a difference in juror decision-making based on\ncomplaints written by a lawyer compared to an LLM. We feed fact patterns from\nreal-life cases to GPT-3.5 and evaluate its ability to determine correct\npotential violations from the scenario and exclude spurious violations. Second,\nwe had mock jurors assess complaints written by the LLM and lawyers. GPT-3.5's\nlegal reasoning skills proved weak, though we expect improvement in future\nmodels, particularly given the violations it suggested tended to be correct (it\nmerely missed additional, correct violations). GPT-3.5 performed better at\nlegal drafting, and jurors' decisions were not statistically significantly\nassociated with the author of the document upon which they based their\ndecisions. Because LLMs cannot satisfactorily conduct legal reasoning tasks,\nthey would be unable to replace lawyers at this stage. However, their drafting\nskills (though, perhaps, still inferior to lawyers), could provide access to\njustice for more individuals by reducing the cost of legal services. Our\nresearch is the first to systematically study LLMs' legal drafting and\nreasoning capabilities in litigation, as well as in securities law and\ncryptocurrency-related misconduct.\n","authors":["Arianna Trozze","Toby Davies","Bennett Kleinberg"],"pdf_url":"https://arxiv.org/pdf/2308.06032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16175v1","updated":"2023-08-30T17:53:25Z","published":"2023-08-30T17:53:25Z","title":"Quantifying Uncertainty in Answers from any Language Model via Intrinsic\n and Extrinsic Confidence Assessment","summary":" We introduce BSDetector, a method for detecting bad and speculative answers\nfrom a pretrained Large Language Model by estimating a numeric confidence score\nfor any output it generated. Our uncertainty quantification technique works for\nany LLM accessible only via a black-box API, and combines intrinsic and\nextrinsic assessments of confidence into a single trustworthiness estimate for\nany LLM response to a given prompt. Our method is extremely general and can\napplied to all of the best LLMs available today (whose training data remains\nunknown). By expending a bit of extra computation, users of any LLM API can now\nget the same response as they would ordinarily, as well as a confidence\nestimate that caution when not to trust this response. Experiments on both\nclosed and open-form Question-Answer benchmarks reveal that BSDetector more\naccurately identifies incorrect LLM responses than alternative uncertainty\nestimation procedures (for both GPT-3 and ChatGPT). By sampling multiple\nresponses from the LLM and considering the one with the highest confidence\nscore, we can additionally obtain more accurate responses from the same LLM,\nwithout any extra training steps.\n","authors":["Jiuhai Chen","Jonas Mueller"],"pdf_url":"https://arxiv.org/pdf/2308.16175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17590v2","updated":"2023-08-30T17:46:17Z","published":"2023-03-30T17:57:43Z","title":"Going Beyond Nouns With Vision & Language Models Using Synthetic Data","summary":" Large-scale pre-trained Vision & Language (VL) models have shown remarkable\nperformance in many applications, enabling replacing a fixed set of supported\nclasses with zero-shot open vocabulary reasoning over (almost arbitrary)\nnatural language prompts. However, recent works have uncovered a fundamental\nweakness of these models. For example, their difficulty to understand Visual\nLanguage Concepts (VLC) that go 'beyond nouns' such as the meaning of\nnon-object words (e.g., attributes, actions, relations, states, etc.), or\ndifficulty in performing compositional reasoning such as understanding the\nsignificance of the order of the words in a sentence. In this work, we\ninvestigate to which extent purely synthetic data could be leveraged to teach\nthese models to overcome such shortcomings without compromising their zero-shot\ncapabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale\nsynthetic dataset and data generation codebase allowing to generate additional\nsuitable data to improve VLC understanding and compositional reasoning of VL\nmodels. Additionally, we propose a general VL finetuning strategy for\neffectively leveraging SyViC towards achieving these improvements. Our\nextensive experiments and ablations on VL-Checklist, Winoground, and ARO\nbenchmarks demonstrate that it is possible to adapt strong pre-trained VL\nmodels with synthetic data significantly enhancing their VLC understanding\n(e.g. by 9.9% on ARO and 4.3% on VL-Checklist) with under 1% drop in their\nzero-shot accuracy.\n","authors":["Paola Cascante-Bonilla","Khaled Shehada","James Seale Smith","Sivan Doveh","Donghyun Kim","Rameswar Panda","Gül Varol","Aude Oliva","Vicente Ordonez","Rogerio Feris","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2303.17590v2.pdf","comment":"Accepted to ICCV 2023. Project page: https://synthetic-vic.github.io/"},{"id":"http://arxiv.org/abs/2308.16149v1","updated":"2023-08-30T17:07:17Z","published":"2023-08-30T17:07:17Z","title":"Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open\n Generative Large Language Models","summary":" We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric\nfoundation and instruction-tuned open generative large language models (LLMs).\nThe models are based on the GPT-3 decoder-only architecture and are pretrained\non a mixture of Arabic and English texts, including source code in various\nprogramming languages. With 13 billion parameters, they demonstrate better\nknowledge and reasoning capabilities in Arabic than any existing open Arabic\nand multilingual models by a sizable margin, based on extensive evaluation.\nMoreover, the models are competitive in English compared to English-centric\nopen models of similar size, despite being trained on much less English data.\nWe provide a detailed description of the training, the tuning, the safety\nalignment, and the evaluation of the models. We release two open versions of\nthe model -- the foundation Jais model, and an instruction-tuned Jais-chat\nvariant -- with the aim of promoting research on Arabic LLMs. Available at\nhttps://huggingface.co/inception-mbzuai/jais-13b-chat\n","authors":["Neha Sengupta","Sunil Kumar Sahu","Bokang Jia","Satheesh Katipomu","Haonan Li","Fajri Koto","Osama Mohammed Afzal","Samta Kamboj","Onkar Pandit","Rahul Pal","Lalit Pradhan","Zain Muhammad Mujahid","Massa Baali","Alham Fikri Aji","Zhengzhong Liu","Andy Hock","Andrew Feldman","Jonathan Lee","Andrew Jackson","Preslav Nakov","Timothy Baldwin","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2308.16149v1.pdf","comment":"Arabic-centric, foundation model, large-language model, LLM,\n generative model, instruction-tuned, Jais, Jais-chat"},{"id":"http://arxiv.org/abs/2308.16137v1","updated":"2023-08-30T16:47:51Z","published":"2023-08-30T16:47:51Z","title":"LM-Infinite: Simple On-the-Fly Length Generalization for Large Language\n Models","summary":" In recent years, there have been remarkable advancements in the performance\nof Transformer-based Large Language Models (LLMs) across various domains. As\nthese LLMs are deployed for increasingly complex tasks, they often face the\nneeds to conduct longer reasoning processes or understanding larger contexts.\nIn these situations, the length generalization failure of LLMs on long\nsequences become more prominent. Most pre-training schemes truncate training\nsequences to a fixed length (such as 2048 for LLaMa). LLMs often struggle to\ngenerate fluent texts, let alone carry out downstream tasks, after longer\ncontexts, even with relative positional encoding which is designed to cope with\nthis problem. Common solutions such as finetuning on longer corpora often\ninvolves daunting hardware and time costs and requires careful training process\ndesign. To more efficiently leverage the generation capacity of existing LLMs,\nwe theoretically and empirically investigate the main out-of-distribution (OOD)\nfactors contributing to this problem. Inspired by this diagnosis, we propose a\nsimple yet effective solution for on-the-fly length generalization,\nLM-Infinite, which involves only a $\\Lambda$-shaped attention mask and a\ndistance limit while requiring no parameter updates or learning. We find it\napplicable to a variety of LLMs using relative-position encoding methods.\nLM-Infinite is computational efficient with $O(n)$ time and space, and\ndemonstrates consistent fluency and generation quality to as long as 32k tokens\non ArXiv and OpenWebText2 datasets, with 2.72x decoding speedup. On downstream\ntask such as passkey retrieval, it continues to work on inputs much longer than\ntraining lengths where vanilla models fail immediately.\n","authors":["Chi Han","Qifan Wang","Wenhan Xiong","Yu Chen","Heng Ji","Sinong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16137v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.17680v4","updated":"2023-08-30T16:17:27Z","published":"2023-05-28T10:05:13Z","title":"Evaluating GPT-3 Generated Explanations for Hateful Content Moderation","summary":" Recent research has focused on using large language models (LLMs) to generate\nexplanations for hate speech through fine-tuning or prompting. Despite the\ngrowing interest in this area, these generated explanations' effectiveness and\npotential limitations remain poorly understood. A key concern is that these\nexplanations, generated by LLMs, may lead to erroneous judgments about the\nnature of flagged content by both users and content moderators. For instance,\nan LLM-generated explanation might inaccurately convince a content moderator\nthat a benign piece of content is hateful. In light of this, we propose an\nanalytical framework for examining hate speech explanations and conducted an\nextensive survey on evaluating such explanations. Specifically, we prompted\nGPT-3 to generate explanations for both hateful and non-hateful content, and a\nsurvey was conducted with 2,400 unique respondents to evaluate the generated\nexplanations. Our findings reveal that (1) human evaluators rated the\nGPT-generated explanations as high quality in terms of linguistic fluency,\ninformativeness, persuasiveness, and logical soundness, (2) the persuasive\nnature of these explanations, however, varied depending on the prompting\nstrategy employed, and (3) this persuasiveness may result in incorrect\njudgments about the hatefulness of the content. Our study underscores the need\nfor caution in applying LLM-generated explanations for content moderation. Code\nand results are available at https://github.com/Social-AI-Studio/GPT3-HateEval.\n","authors":["Han Wang","Ming Shan Hee","Md Rabiul Awal","Kenny Tsu Wei Choo","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2305.17680v4.pdf","comment":"9 pages, 2 figures, Accepted by International Joint Conference on\n Artificial Intelligence(IJCAI)"},{"id":"http://arxiv.org/abs/2308.16118v1","updated":"2023-08-30T16:17:26Z","published":"2023-08-30T16:17:26Z","title":"Response: Emergent analogical reasoning in large language models","summary":" In their recent Nature Human Behaviour paper, \"Emergent analogical reasoning\nin large language models,\" (Webb, Holyoak, and Lu, 2023) the authors argue that\n\"large language models such as GPT-3 have acquired an emergent ability to find\nzero-shot solutions to a broad range of analogy problems.\" In this response, we\nprovide counterexamples of the letter string analogies. In our tests, GPT-3\nfails to solve even the easiest variants of the problems presented in the\noriginal paper. Zero-shot reasoning is an extraordinary claim that requires\nextraordinary evidence. We do not see that evidence in our experiments. To\nstrengthen claims of humanlike reasoning such as zero-shot reasoning, it is\nimportant that the field develop approaches that rule out data memorization.\n","authors":["Damian Hodel","Jevin West"],"pdf_url":"https://arxiv.org/pdf/2308.16118v1.pdf","comment":"Response to publication in Nature Human Behaviour titled \"Emergent\n analogical reasoning in large language models,\" (Webb, Holyoak, and Lu, 2023,\n arXiv:2212.09196). 9 pages"},{"id":"http://arxiv.org/abs/2308.14359v2","updated":"2023-08-30T16:08:28Z","published":"2023-08-28T07:11:27Z","title":"Effect of Attention and Self-Supervised Speech Embeddings on\n Non-Semantic Speech Tasks","summary":" Human emotion understanding is pivotal in making conversational technology\nmainstream. We view speech emotion understanding as a perception task which is\na more realistic setting. With varying contexts (languages, demographics, etc.)\ndifferent share of people perceive the same speech segment as a non-unanimous\nemotion. As part of the ACM Multimedia 2023 Computational Paralinguistics\nChallengE (ComParE) in the EMotion Share track, we leverage their rich dataset\nof multilingual speakers and multi-label regression target of 'emotion share'\nor perception of that emotion. We demonstrate that the training scheme of\ndifferent foundation models dictates their effectiveness for tasks beyond\nspeech recognition, especially for non-semantic speech tasks like emotion\nunderstanding. This is a very complex task due to multilingual speakers,\nvariability in the target labels, and inherent imbalance in the regression\ndataset. Our results show that HuBERT-Large with a self-attention-based\nlight-weight sequence model provides 4.6% improvement over the reported\nbaseline.\n","authors":["Payal Mohapatra","Akash Pandey","Yueyuan Sui","Qi Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.14359v2.pdf","comment":"Accepted to appear at ACM Multimedia 2023 Multimedia Grand Challenges\n Track"},{"id":"http://arxiv.org/abs/2308.16109v1","updated":"2023-08-30T16:04:54Z","published":"2023-08-30T16:04:54Z","title":"Grandma Karl is 27 years old -- research agenda for pseudonymization of\n research data","summary":" Accessibility of research data is critical for advances in many research\nfields, but textual data often cannot be shared due to the personal and\nsensitive information which it contains, e.g names or political opinions.\nGeneral Data Protection Regulation (GDPR) suggests pseudonymization as a\nsolution to secure open access to research data, but we need to learn more\nabout pseudonymization as an approach before adopting it for manipulation of\nresearch data. This paper outlines a research agenda within pseudonymization,\nnamely need of studies into the effects of pseudonymization on unstructured\ndata in relation to e.g. readability and language assessment, as well as the\neffectiveness of pseudonymization as a way of protecting writer identity, while\nalso exploring different ways of developing context-sensitive algorithms for\ndetection, labelling and replacement of personal information in unstructured\ndata. The recently granted project on pseudonymization Grandma Karl is 27 years\nold addresses exactly those challenges.\n","authors":["Elena Volodina","Simon Dobnik","Therese Lindström Tiedemann","Xuan-Son Vu"],"pdf_url":"https://arxiv.org/pdf/2308.16109v1.pdf","comment":"Big DataService 2023 conference, 2023 Workshop on Big Data and\n Machine Learning with Privacy Enhancing Tech, IEEE Catalog Number:\n CFP23A91-ART, ISBN: 979-8-3503-3379-4"},{"id":"http://arxiv.org/abs/2307.15745v2","updated":"2023-08-30T15:58:56Z","published":"2023-07-28T18:01:08Z","title":"Context-VQA: Towards Context-Aware and Purposeful Visual Question\n Answering","summary":" Visual question answering (VQA) has the potential to make the Internet more\naccessible in an interactive way, allowing people who cannot see images to ask\nquestions about them. However, multiple studies have shown that people who are\nblind or have low-vision prefer image explanations that incorporate the context\nin which an image appears, yet current VQA datasets focus on images in\nisolation. We argue that VQA models will not fully succeed at meeting people's\nneeds unless they take context into account. To further motivate and analyze\nthe distinction between different contexts, we introduce Context-VQA, a VQA\ndataset that pairs images with contexts, specifically types of websites (e.g.,\na shopping website). We find that the types of questions vary systematically\nacross contexts. For example, images presented in a travel context garner 2\ntimes more \"Where?\" questions, and images on social media and news garner 2.8\nand 1.8 times more \"Who?\" questions than the average. We also find that context\neffects are especially important when participants can't see the image. These\nresults demonstrate that context affects the types of questions asked and that\nVQA models should be context-sensitive to better meet people's needs,\nespecially in accessibility settings.\n","authors":["Nandita Naik","Christopher Potts","Elisa Kreiss"],"pdf_url":"https://arxiv.org/pdf/2307.15745v2.pdf","comment":"Proceedings of ICCV 2023 Workshop on Closing the Loop Between Vision\n and Language"},{"id":"http://arxiv.org/abs/2305.09438v3","updated":"2023-08-30T14:56:16Z","published":"2023-05-16T13:50:24Z","title":"MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with\n Transformers","summary":" Message Passing Interface (MPI) plays a crucial role in distributed memory\nparallelization across multiple nodes. However, parallelizing MPI code\nmanually, and specifically, performing domain decomposition, is a challenging,\nerror-prone task. In this paper, we address this problem by developing\nMPI-RICAL, a novel data-driven, programming-assistance tool that assists\nprogrammers in writing domain decomposition based distributed memory\nparallelization code. Specifically, we train a supervised language model to\nsuggest MPI functions and their proper locations in the code on the fly. We\nalso introduce MPICodeCorpus, the first publicly available corpus of MPI-based\nparallel programs that is created by mining more than 15,000 open-source\nrepositories on GitHub. Experimental results have been done on MPICodeCorpus\nand more importantly, on a compiled benchmark of MPI-based parallel programs\nfor numerical computations that represent real-world scientific applications.\nMPI-RICAL achieves F1 scores between 0.87-0.91 on these programs, demonstrating\nits accuracy in suggesting correct MPI functions at appropriate code\nlocations.. The source code used in this work, as well as other relevant\nsources, are available at:\nhttps://github.com/Scientific-Computing-Lab-NRCN/MPI-rical\n","authors":["Nadav Schneider","Tal Kadosh","Niranjan Hasabnis","Timothy Mattson","Yuval Pinter","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2305.09438v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10390v3","updated":"2023-08-30T14:55:01Z","published":"2023-08-20T23:47:23Z","title":"LibriSQA: Advancing Free-form and Open-ended Spoken Question Answering\n with a Novel Dataset and Framework","summary":" While Large Language Models (LLMs) have demonstrated commendable performance\nacross a myriad of domains and tasks, existing LLMs still exhibit a palpable\ndeficit in handling multimodal functionalities, especially for the Spoken\nQuestion Answering (SQA) task which necessitates precise alignment and deep\ninteraction between speech and text features. To address the SQA challenge on\nLLMs, we initially curated the free-form and open-ended LibriSQA dataset from\nLibrispeech, comprising Part I with natural conversational formats and Part II\nencompassing multiple-choice questions followed by answers and analytical\nsegments. Both parts collectively include 107k SQA pairs that cover various\ntopics. Given the evident paucity of existing speech-text LLMs, we propose a\nlightweight, end-to-end framework to execute the SQA task on the LibriSQA,\nwitnessing significant results. By reforming ASR into the SQA format, we\nfurther substantiate our framework's capability in handling ASR tasks. Our\nempirical findings bolster the LLMs' aptitude for aligning and comprehending\nmultimodal information, paving the way for the development of universal\nmultimodal LLMs. The dataset and demo can be found at\nhttps://github.com/ZihanZhaoSJTU/LibriSQA.\n","authors":["Zihan Zhao","Yiyang Jiang","Heyang Liu","Yanfeng Wang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10390v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16075v1","updated":"2023-08-30T14:52:14Z","published":"2023-08-30T14:52:14Z","title":"Impact of Visual Context on Noisy Multimodal NMT: An Empirical Study for\n English to Indian Languages","summary":" The study investigates the effectiveness of utilizing multimodal information\nin Neural Machine Translation (NMT). While prior research focused on using\nmultimodal data in low-resource scenarios, this study examines how image\nfeatures impact translation when added to a large-scale, pre-trained unimodal\nNMT system. Surprisingly, the study finds that images might be redundant in\nthis context. Additionally, the research introduces synthetic noise to assess\nwhether images help the model deal with textual noise. Multimodal models\nslightly outperform text-only models in noisy settings, even with random\nimages. The study's experiments translate from English to Hindi, Bengali, and\nMalayalam, outperforming state-of-the-art benchmarks significantly.\nInterestingly, the effect of visual context varies with source text noise: no\nvisual context works best for non-noisy translations, cropped image features\nare optimal for low noise, and full image features work better in high-noise\nscenarios. This sheds light on the role of visual context, especially in noisy\nsettings, opening up a new research direction for Noisy Neural Machine\nTranslation in multimodal setups. The research emphasizes the importance of\ncombining visual and textual information for improved translation in various\nenvironments.\n","authors":["Baban Gain","Dibyanayan Bandyopadhyay","Samrat Mukherjee","Chandranath Adak","Asif Ekbal"],"pdf_url":"https://arxiv.org/pdf/2308.16075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16061v1","updated":"2023-08-30T14:36:25Z","published":"2023-08-30T14:36:25Z","title":"Conti Inc.: Understanding the Internal Discussions of a large\n Ransomware-as-a-Service Operator with Machine Learning","summary":" Ransomware-as-a-service (RaaS) is increasing the scale and complexity of\nransomware attacks. Understanding the internal operations behind RaaS has been\na challenge due to the illegality of such activities. The recent chat leak of\nthe Conti RaaS operator, one of the most infamous ransomware operators on the\ninternational scene, offers a key opportunity to better understand the inner\nworkings of such organizations. This paper analyzes the main topic discussions\nin the Conti chat leak using machine learning techniques such as Natural\nLanguage Processing (NLP) and Latent Dirichlet Allocation (LDA), as well as\nvisualization strategies. Five discussion topics are found: 1) Business, 2)\nTechnical, 3) Internal tasking/Management, 4) Malware, and 5) Customer\nService/Problem Solving. Moreover, the distribution of topics among Conti\nmembers shows that only 4% of individuals have specialized discussions while\nalmost all individuals (96%) are all-rounders, meaning that their discussions\nrevolve around the five topics. The results also indicate that a significant\nproportion of Conti discussions are non-tech related. This study thus\nhighlights that running such large RaaS operations requires a workforce skilled\nbeyond technical abilities, with individuals involved in various tasks, from\nmanagement to customer service or problem solving. The discussion topics also\nshow that the organization behind the Conti RaaS oper5086933ator shares\nsimilarities with a large firm. We conclude that, although RaaS represents an\nexample of specialization in the cybercrime industry, only a few members are\nspecialized in one topic, while the rest runs and coordinates the RaaS\noperation.\n","authors":["Estelle Ruellan","Masarah Paquet-Clouston","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2308.16061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16060v1","updated":"2023-08-30T14:33:25Z","published":"2023-08-30T14:33:25Z","title":"Text-to-OverpassQL: A Natural Language Interface for Complex Geodata\n Querying of OpenStreetMap","summary":" We present Text-to-OverpassQL, a task designed to facilitate a natural\nlanguage interface for querying geodata from OpenStreetMap (OSM). The Overpass\nQuery Language (OverpassQL) allows users to formulate complex database queries\nand is widely adopted in the OSM ecosystem. Generating Overpass queries from\nnatural language input serves multiple use-cases. It enables novice users to\nutilize OverpassQL without prior knowledge, assists experienced users with\ncrafting advanced queries, and enables tool-augmented large language models to\naccess information stored in the OSM database. In order to assess the\nperformance of current sequence generation models on this task, we propose\nOverpassNL, a dataset of 8,352 queries with corresponding natural language\ninputs. We further introduce task specific evaluation metrics and ground the\nevaluation of the Text-to-OverpassQL task by executing the queries against the\nOSM database. We establish strong baselines by finetuning sequence-to-sequence\nmodels and adapting large language models with in-context examples. The\ndetailed evaluation reveals strengths and weaknesses of the considered learning\nstrategies, laying the foundations for further research into the\nText-to-OverpassQL task.\n","authors":["Michael Staniek","Raphael Schumann","Maike Züfle","Stefan Riezler"],"pdf_url":"https://arxiv.org/pdf/2308.16060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16055v1","updated":"2023-08-30T14:24:16Z","published":"2023-08-30T14:24:16Z","title":"AsyncET: Asynchronous Learning for Knowledge Graph Entity Typing with\n Auxiliary Relations","summary":" Knowledge graph entity typing (KGET) is a task to predict the missing entity\ntypes in knowledge graphs (KG). Previously, KG embedding (KGE) methods tried to\nsolve the KGET task by introducing an auxiliary relation, 'hasType', to model\nthe relationship between entities and their types. However, a single auxiliary\nrelation has limited expressiveness for diverse entity-type patterns. We\nimprove the expressiveness of KGE methods by introducing multiple auxiliary\nrelations in this work. Similar entity types are grouped to reduce the number\nof auxiliary relations and improve their capability to model entity-type\npatterns with different granularities. With the presence of multiple auxiliary\nrelations, we propose a method adopting an Asynchronous learning scheme for\nEntity Typing, named AsyncET, which updates the entity and type embeddings\nalternatively to keep the learned entity embedding up-to-date and informative\nfor entity type prediction. Experiments are conducted on two commonly used KGET\ndatasets to show that the performance of KGE methods on the KGET task can be\nsubstantially improved by the proposed multiple auxiliary relations and\nasynchronous embedding learning. Furthermore, our method has a significant\nadvantage over state-of-the-art methods in model sizes and time complexity.\n","authors":["Yun-Cheng Wang","Xiou Ge","Bin Wang","C. -C. Jay Kuo"],"pdf_url":"https://arxiv.org/pdf/2308.16055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17926v2","updated":"2023-08-30T13:22:35Z","published":"2023-05-29T07:41:03Z","title":"Large Language Models are not Fair Evaluators","summary":" In this paper, we uncover a systematic bias in the evaluation paradigm of\nadopting large language models~(LLMs), e.g., GPT-4, as a referee to score and\ncompare the quality of responses generated by candidate models. We find that\nthe quality ranking of candidate responses can be easily hacked by simply\naltering their order of appearance in the context. This manipulation allows us\nto skew the evaluation result, making one model appear considerably superior to\nthe other, e.g., Vicuna-13B could beat ChatGPT on 66 over 80 tested queries\nwith ChatGPT as an evaluator. To address this issue, we propose a calibration\nframework with three simple yet effective strategies: 1) Multiple Evidence\nCalibration, which requires the evaluator model to generate multiple evaluation\nevidence before assigning ratings; 2) Balanced Position Calibration, which\naggregates results across various orders to determine the final score; 3)\nHuman-in-the-Loop Calibration, which introduces a balanced position diversity\nentropy to measure the difficulty of each example and seeks human assistance\nwhen needed. We also manually annotate the \"win/tie/lose\" outcomes of responses\nfrom ChatGPT and Vicuna-13B in the Vicuna Benchmark's question prompt, and\nextensive experiments demonstrate that our approach successfully mitigates\nevaluation bias, resulting in closer alignment with human judgments. We release\nour code and human annotation at \\url{https://github.com/i-Eval/FairEval} to\nfacilitate future research.\n","authors":["Peiyi Wang","Lei Li","Liang Chen","Zefan Cai","Dawei Zhu","Binghuai Lin","Yunbo Cao","Qi Liu","Tianyu Liu","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2305.17926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13081v2","updated":"2023-08-30T13:16:46Z","published":"2023-08-24T20:57:07Z","title":"Formal specification terminology for demographic agent-based models of\n fixed-step single-clocked simulations","summary":" This document presents adequate formal terminology for the mathematical\nspecification of a subset of Agent Based Models (ABMs) in the field of\nDemography. The simulation of the targeted ABMs follows a fixed-step\nsingle-clocked pattern. The proposed terminology further improves the model\nunderstanding and can act as a stand-alone methodology for the specification\nand optionally the documentation of a significant set of (demographic) ABMs.\nNevertheless, it is imaginable the this terminology probably with further\nextensions can be merged with the largely-informal widely-used model\ndocumentation and communication O.D.D. protocol [Grimm and et al., 2020,\nAmouroux et al., 2010] to reduce many sources of ambiguity, hindering model\nreplications by other modelers. A published demographic model documentation,\nlargely simplified version of the Lone Parent Model [Gostoli and Silverman,\n2020] is separately published in [Elsheikh, 2023b] as illustration for the\nformal terminology. The model was implemented in the Julia language [Elsheikh,\n2023a] based on the Agents.jl julia package [Datseris et al., 2022].\n","authors":["Atiyah Elsheikh"],"pdf_url":"https://arxiv.org/pdf/2308.13081v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2307.16548"},{"id":"http://arxiv.org/abs/2308.15214v2","updated":"2023-08-30T13:13:19Z","published":"2023-08-29T11:08:40Z","title":"FurChat: An Embodied Conversational Agent using LLMs, Combining Open and\n Closed-Domain Dialogue with Facial Expressions","summary":" We demonstrate an embodied conversational agent that can function as a\nreceptionist and generate a mixture of open and closed-domain dialogue along\nwith facial expressions, by using a large language model (LLM) to develop an\nengaging conversation. We deployed the system onto a Furhat robot, which is\nhighly expressive and capable of using both verbal and nonverbal cues during\ninteraction. The system was designed specifically for the National Robotarium\nto interact with visitors through natural conversations, providing them with\ninformation about the facilities, research, news, upcoming events, etc. The\nsystem utilises the state-of-the-art GPT-3.5 model to generate such information\nalong with domain-general conversations and facial expressions based on prompt\nengineering.\n","authors":["Neeraj Cherakara","Finny Varghese","Sheena Shabana","Nivan Nelson","Abhiram Karukayil","Rohith Kulothungan","Mohammed Afil Farhan","Birthe Nesset","Meriam Moujahid","Tanvi Dinkar","Verena Rieser","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2308.15214v2.pdf","comment":"5 pages, 2 figures, Accepted at SIGDIAL 2023 (24th Meeting of the\n Special Interest Group on Discourse and Dialogue), for the demo video, see\n https://youtu.be/fwtUl1kl22s"},{"id":"http://arxiv.org/abs/2211.02423v2","updated":"2023-08-30T12:30:33Z","published":"2022-11-04T12:56:12Z","title":"CLSE: Corpus of Linguistically Significant Entities","summary":" One of the biggest challenges of natural language generation (NLG) is the\nproper handling of named entities. Named entities are a common source of\ngrammar mistakes such as wrong prepositions, wrong article handling, or\nincorrect entity inflection. Without factoring linguistic representation, such\nerrors are often underrepresented when evaluating on a small set of arbitrarily\npicked argument values, or when translating a dataset from a linguistically\nsimpler language, like English, to a linguistically complex language, like\nRussian. However, for some applications, broadly precise grammatical\ncorrectness is critical -- native speakers may find entity-related grammar\nerrors silly, jarring, or even offensive.\n To enable the creation of more linguistically diverse NLG datasets, we\nrelease a Corpus of Linguistically Significant Entities (CLSE) annotated by\nlinguist experts. The corpus includes 34 languages and covers 74 different\nsemantic types to support various applications from airline ticketing to video\ngames. To demonstrate one possible use of CLSE, we produce an augmented version\nof the Schema-Guided Dialog Dataset, SGD-CLSE. Using the CLSE's entities and a\nsmall number of human translations, we create a linguistically representative\nNLG evaluation benchmark in three languages: French (high-resource), Marathi\n(low-resource), and Russian (highly inflected language). We establish quality\nbaselines for neural, template-based, and hybrid NLG systems and discuss the\nstrengths and weaknesses of each approach.\n","authors":["Aleksandr Chuklin","Justin Zhao","Mihir Kale"],"pdf_url":"https://arxiv.org/pdf/2211.02423v2.pdf","comment":"Proceedings of the 2nd Workshop on Natural Language Generation,\n Evaluation, and Metrics (GEM 2022) at EMNLP 2022"},{"id":"http://arxiv.org/abs/2308.15987v1","updated":"2023-08-30T12:18:18Z","published":"2023-08-30T12:18:18Z","title":"FPTQ: Fine-grained Post-Training Quantization for Large Language Models","summary":" In the era of large-scale language models, the substantial parameter size\nposes significant challenges for deployment. Being a prevalent compression\ntechnique, quantization has emerged as the mainstream practice to tackle this\nissue, which is mainly centered on two recipes W8A8 and W4A16 (i.e. weights and\nactivations in such bit widths). In this study, we propose a novel W4A8\npost-training quantization method for the available open-sourced LLMs, which\ncombines the advantages of both two recipes. Therefore, we can leverage the\nbenefit in the I/O utilization of 4-bit weight quantization and the\nacceleration due to 8-bit matrix computation. Nevertheless, the W4A8 faces\nnotorious performance degradation. As a remedy, we involve layerwise activation\nquantization strategies which feature a novel logarithmic equalization for most\nintractable layers, and we combine them with fine-grained weight quantization.\nWithout whistles and bells, we eliminate the necessity for further fine-tuning\nand obtain the state-of-the-art W4A8 quantized performance on BLOOM, LLaMA, and\nLLaMA-2 on standard benchmarks. We confirm that the W4A8 quantization is\nachievable for the deployment of large language models, fostering their\nwide-spreading real-world applications.\n","authors":["Qingyuan Li","Yifan Zhang","Liang Li","Peng Yao","Bo Zhang","Xiangxiang Chu","Yerui Sun","Li Du","Yuchen Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15982v1","updated":"2023-08-30T12:10:17Z","published":"2023-08-30T12:10:17Z","title":"MerA: Merging Pretrained Adapters For Few-Shot Learning","summary":" Adapter tuning, which updates only a few parameters, has become a mainstream\nmethod for fine-tuning pretrained language models to downstream tasks. However,\nit often yields subpar results in few-shot learning. AdapterFusion, which\nassembles pretrained adapters using composition layers tailored to specific\ntasks, is a possible solution but significantly increases trainable parameters\nand deployment costs. Despite this, our preliminary study reveals that even\nsingle adapters can outperform Adapterfusion in few-shot learning, urging us to\npropose \\textbf{\\texttt{Merging Pretrained Adapters}} (MerA) that efficiently\nincorporates pretrained adapters to a single model through model fusion.\nExtensive experiments on two PLMs demonstrate that MerA achieves substantial\nimprovements compared to both single adapters and AdapterFusion. To further\nenhance the capacity of MerA, we also introduce a simple yet effective\ntechnique, referred to as the \"\\textit{same-track}\" setting, that merges\nadapters from the same track of pretraining tasks. With the implementation of\nthe \"\\textit{same-track}\" setting, we observe even more impressive gains,\nsurpassing the performance of both full fine-tuning and adapter tuning by a\nsubstantial margin, e.g., 3.5\\% in MRPC and 5.0\\% in MNLI.\n","authors":["Shwai He","Run-Ze Fan","Liang Ding","Li Shen","Tianyi Zhou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2308.15982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15961v1","updated":"2023-08-30T11:35:21Z","published":"2023-08-30T11:35:21Z","title":"Finding-Aware Anatomical Tokens for Chest X-Ray Automated Reporting","summary":" The task of radiology reporting comprises describing and interpreting the\nmedical findings in radiographic images, including description of their\nlocation and appearance. Automated approaches to radiology reporting require\nthe image to be encoded into a suitable token representation for input to the\nlanguage model. Previous methods commonly use convolutional neural networks to\nencode an image into a series of image-level feature map representations.\nHowever, the generated reports often exhibit realistic style but imperfect\naccuracy. Inspired by recent works for image captioning in the general domain\nin which each visual token corresponds to an object detected in an image, we\ninvestigate whether using local tokens corresponding to anatomical structures\ncan improve the quality of the generated reports. We introduce a novel\nadaptation of Faster R-CNN in which finding detection is performed for the\ncandidate bounding boxes extracted during anatomical structure localisation. We\nuse the resulting bounding box feature representations as our set of\nfinding-aware anatomical tokens. This encourages the extracted anatomical\ntokens to be informative about the findings they contain (required for the\nfinal task of radiology reporting). Evaluating on the MIMIC-CXR dataset of\nchest X-Ray images, we show that task-aware anatomical tokens give\nstate-of-the-art performance when integrated into an automated reporting\npipeline, yielding generated reports with improved clinical accuracy.\n","authors":["Francesco Dalla Serra","Chaoyang Wang","Fani Deligianni","Jeffrey Dalton","Alison Q. O'Neil"],"pdf_url":"https://arxiv.org/pdf/2308.15961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15952v1","updated":"2023-08-30T11:02:26Z","published":"2023-08-30T11:02:26Z","title":"Benchmarking Multilabel Topic Classification in the Kyrgyz Language","summary":" Kyrgyz is a very underrepresented language in terms of modern natural\nlanguage processing resources. In this work, we present a new public benchmark\nfor topic classification in Kyrgyz, introducing a dataset based on collected\nand annotated data from the news site 24.KG and presenting several baseline\nmodels for news classification in the multilabel setting. We train and evaluate\nboth classical statistical and neural models, reporting the scores, discussing\nthe results, and proposing directions for future work.\n","authors":["Anton Alekseev","Sergey I. Nikolenko","Gulnara Kabaeva"],"pdf_url":"https://arxiv.org/pdf/2308.15952v1.pdf","comment":"Accepted to AIST 2023"},{"id":"http://arxiv.org/abs/2308.09662v3","updated":"2023-08-30T10:21:00Z","published":"2023-08-18T16:27:04Z","title":"Red-Teaming Large Language Models using Chain of Utterances for\n Safety-Alignment","summary":" Larger language models (LLMs) have taken the world by storm with their\nmassive multi-tasking capabilities simply by optimizing over a next-word\nprediction objective. With the emergence of their properties and encoded\nknowledge, the risk of LLMs producing harmful outputs increases, making them\nunfit for scalable deployment for the public. In this work, we propose a new\nsafety evaluation benchmark RED-EVAL that carries out red-teaming. We show that\neven widely deployed models are susceptible to the Chain of Utterances-based\n(CoU) prompting, jailbreaking closed source LLM-based systems such as GPT-4 and\nChatGPT to unethically respond to more than 65% and 73% of harmful queries. We\nalso demonstrate the consistency of the RED-EVAL across 8 open-source LLMs in\ngenerating harmful responses in more than 86% of the red-teaming attempts.\nNext, we propose RED-INSTRUCT--An approach for the safety alignment of LLMs. It\nconstitutes two phases: 1) HARMFULQA data collection: Leveraging CoU prompting,\nwe collect a dataset that consists of 1.9K harmful questions covering a wide\nrange of topics, 9.5K safe and 7.3K harmful conversations from ChatGPT; 2)\nSAFE-ALIGN: We demonstrate how the conversational dataset can be used for the\nsafety alignment of LLMs by minimizing the negative log-likelihood over helpful\nresponses and penalizing over harmful responses by gradient accent over sample\nloss. Our model STARLING, a fine-tuned Vicuna-7B, is observed to be more safely\naligned when evaluated on RED-EVAL and HHH benchmarks while preserving the\nutility of the baseline models (TruthfulQA, MMLU, and BBH).\n","authors":["Rishabh Bhardwaj","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2308.09662v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15930v1","updated":"2023-08-30T10:12:39Z","published":"2023-08-30T10:12:39Z","title":"LLaSM: Large Language and Speech Model","summary":" Multi-modal large language models have garnered significant interest\nrecently. Though, most of the works focus on vision-language multi-modal models\nproviding strong capabilities in following vision-and-language instructions.\nHowever, we claim that speech is also an important modality through which\nhumans interact with the world. Hence, it is crucial for a general-purpose\nassistant to be able to follow multi-modal speech-and-language instructions. In\nthis work, we propose Large Language and Speech Model (LLaSM). LLaSM is an\nend-to-end trained large multi-modal speech-language model with cross-modal\nconversational abilities, capable of following speech-and-language\ninstructions. Our early experiments show that LLaSM demonstrates a more\nconvenient and natural way for humans to interact with artificial intelligence.\nSpecifically, we also release a large Speech Instruction Following dataset\nLLaSM-Audio-Instructions. Code and demo are available at\nhttps://github.com/LinkSoul-AI/LLaSM and\nhttps://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions\ndataset is available at\nhttps://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.\n","authors":["Yu Shu","Siwei Dong","Guangyao Chen","Wenhao Huang","Ruihua Zhang","Daochen Shi","Qiqi Xiang","Yemin Shi"],"pdf_url":"https://arxiv.org/pdf/2308.15930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15906v1","updated":"2023-08-30T09:19:06Z","published":"2023-08-30T09:19:06Z","title":"Is the U.S. Legal System Ready for AI's Challenges to Human Values?","summary":" Our interdisciplinary study investigates how effectively U.S. laws confront\nthe challenges posed by Generative AI to human values. Through an analysis of\ndiverse hypothetical scenarios crafted during an expert workshop, we have\nidentified notable gaps and uncertainties within the existing legal framework\nregarding the protection of fundamental values, such as autonomy, privacy,\ndignity, diversity, equality, and physical/mental well-being. Constitutional\nand civil rights, it appears, may not provide sufficient protection against\nAI-generated discriminatory outputs. Furthermore, even if we exclude the\nliability shield provided by Section 230, proving causation for defamation and\nproduct liability claims is a challenging endeavor due to the intricate and\nopaque nature of AI systems. To address the unique and unforeseeable threats\nposed by Generative AI, we advocate for legal frameworks that evolve to\nrecognize new threat and provide proactive, auditable guidelines to industry\nstakeholders. Addressing these issues requires deep interdisciplinary\ncollaborations to identify harms, values, and mitigation strategies.\n","authors":["Inyoung Cheong","Aylin Caliskan","Tadayoshi Kohno"],"pdf_url":"https://arxiv.org/pdf/2308.15906v1.pdf","comment":"26 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.15885v1","updated":"2023-08-30T09:04:06Z","published":"2023-08-30T09:04:06Z","title":"Towards One-Shot Learning for Text Classification using Inductive Logic\n Programming","summary":" With the ever-increasing potential of AI to perform personalised tasks, it is\nbecoming essential to develop new machine learning techniques which are\ndata-efficient and do not require hundreds or thousands of training data. In\nthis paper, we explore an Inductive Logic Programming approach for one-shot\ntext classification. In particular, we explore the framework of\nMeta-Interpretive Learning (MIL), along with using common-sense background\nknowledge extracted from ConceptNet. Results indicate that MIL can learn text\nclassification rules from a small number of training examples. Moreover, the\nhigher complexity of chosen examples, the higher accuracy of the outcome.\n","authors":["Ghazal Afroozi Milani","Daniel Cyrus","Alireza Tamaddoni-Nezhad"],"pdf_url":"https://arxiv.org/pdf/2308.15885v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15122v2","updated":"2023-08-30T09:03:23Z","published":"2023-08-29T08:41:16Z","title":"SpikeBERT: A Language Spikformer Trained with Two-Stage Knowledge\n Distillation from BERT","summary":" Spiking neural networks (SNNs) offer a promising avenue to implement deep\nneural networks in a more energy-efficient way. However, the network\narchitectures of existing SNNs for language tasks are too simplistic, and deep\narchitectures have not been fully explored, resulting in a significant\nperformance gap compared to mainstream transformer-based networks such as BERT.\nTo this end, we improve a recently-proposed spiking transformer (i.e.,\nSpikformer) to make it possible to process language tasks and propose a\ntwo-stage knowledge distillation method for training it, which combines\npre-training by distilling knowledge from BERT with a large collection of\nunlabelled texts and fine-tuning with task-specific instances via knowledge\ndistillation again from the BERT fine-tuned on the same training examples.\nThrough extensive experimentation, we show that the models trained with our\nmethod, named SpikeBERT, outperform state-of-the-art SNNs and even achieve\ncomparable results to BERTs on text classification tasks for both English and\nChinese with much less energy consumption.\n","authors":["Changze Lv","Tianlong Li","Jianhan Xu","Chenxi Gu","Zixuan Ling","Cenyuan Zhang","Xiaoqing Zheng","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.03780v3","updated":"2023-08-30T08:56:37Z","published":"2023-02-07T22:37:21Z","title":"Reliable Natural Language Understanding with Large Language Models and\n Answer Set Programming","summary":" Humans understand language by extracting information (meaning) from\nsentences, combining it with existing commonsense knowledge, and then\nperforming reasoning to draw conclusions. While large language models (LLMs)\nsuch as GPT-3 and ChatGPT are able to leverage patterns in the text to solve a\nvariety of NLP tasks, they fall short in problems that require reasoning. They\nalso cannot reliably explain the answers generated for a given question. In\norder to emulate humans better, we propose STAR, a framework that combines LLMs\nwith Answer Set Programming (ASP). We show how LLMs can be used to effectively\nextract knowledge -- represented as predicates -- from language. Goal-directed\nASP is then employed to reliably reason over this knowledge. We apply the STAR\nframework to three different NLU tasks requiring reasoning: qualitative\nreasoning, mathematical reasoning, and goal-directed conversation. Our\nexperiments reveal that STAR is able to bridge the gap of reasoning in NLU\ntasks, leading to significant performance improvements, especially for smaller\nLLMs, i.e., LLMs with a smaller number of parameters. NLU applications\ndeveloped using the STAR framework are also explainable: along with the\npredicates generated, a justification in the form of a proof tree can be\nproduced for a given output.\n","authors":["Abhiramon Rajasekharan","Yankai Zeng","Parth Padalkar","Gopal Gupta"],"pdf_url":"https://arxiv.org/pdf/2302.03780v3.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2211.05994v4","updated":"2023-08-30T08:02:56Z","published":"2022-11-11T04:29:02Z","title":"A Survey of Knowledge Enhanced Pre-trained Language Models","summary":" Pre-trained Language Models (PLMs) which are trained on large text corpus via\nself-supervised learning method, have yielded promising performance on various\ntasks in Natural Language Processing (NLP). However, though PLMs with huge\nparameters can effectively possess rich knowledge learned from massive training\ntext and benefit downstream tasks at the fine-tuning stage, they still have\nsome limitations such as poor reasoning ability due to the lack of external\nknowledge. Research has been dedicated to incorporating knowledge into PLMs to\ntackle these issues. In this paper, we present a comprehensive review of\nKnowledge Enhanced Pre-trained Language Models (KE-PLMs) to provide a clear\ninsight into this thriving field. We introduce appropriate taxonomies\nrespectively for Natural Language Understanding (NLU) and Natural Language\nGeneration (NLG) to highlight these two main tasks of NLP. For NLU, we divide\nthe types of knowledge into four categories: linguistic knowledge, text\nknowledge, knowledge graph (KG), and rule knowledge. The KE-PLMs for NLG are\ncategorized into KG-based and retrieval-based methods. Finally, we point out\nsome promising future directions of KE-PLMs.\n","authors":["Linmei Hu","Zeyi Liu","Ziwang Zhao","Lei Hou","Liqiang Nie","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2211.05994v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15813v1","updated":"2023-08-30T07:36:12Z","published":"2023-08-30T07:36:12Z","title":"Knowledge-grounded Natural Language Recommendation Explanation","summary":" Explanations accompanied by a recommendation can assist users in\nunderstanding the decision made by recommendation systems, which in turn\nincreases a user's confidence and trust in the system. Recently, research has\nfocused on generating natural language explanations in a human-readable format.\nThus far, the proposed approaches leverage item reviews written by users, which\nare often subjective, sparse in language, and unable to account for new items\nthat have not been purchased or reviewed before. Instead, we aim to generate\nfact-grounded recommendation explanations that are objectively described with\nitem features while implicitly considering a user's preferences, based on the\nuser's purchase history. To achieve this, we propose a knowledge graph (KG)\napproach to natural language explainable recommendation. Our approach draws on\nuser-item features through a novel collaborative filtering-based KG\nrepresentation to produce fact-grounded, personalized explanations, while\njointly learning user-item representations for recommendation scoring.\nExperimental results show that our approach consistently outperforms previous\nstate-of-the-art models on natural language explainable recommendation.\n","authors":["Anthony Colas","Jun Araki","Zhengyu Zhou","Bingqing Wang","Zhe Feng"],"pdf_url":"https://arxiv.org/pdf/2308.15813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15812v1","updated":"2023-08-30T07:35:32Z","published":"2023-08-30T07:35:32Z","title":"Peering Through Preferences: Unraveling Feedback Acquisition for\n Aligning Large Language Models","summary":" Aligning large language models (LLMs) with human values and intents\ncritically involves the use of human or AI feedback. While dense feedback\nannotations are expensive to acquire and integrate, sparse feedback presents a\nstructural design choice between ratings (e.g., score Response A on a scale of\n1-7) and rankings (e.g., is Response A better than Response B?). In this work,\nwe analyze the effect of this design choice for the alignment and evaluation of\nLLMs. We uncover an inconsistency problem wherein the preferences inferred from\nratings and rankings significantly disagree 60% for both human and AI\nannotators. Our subsequent analysis identifies various facets of annotator\nbiases that explain this phenomena, such as human annotators would rate denser\nresponses higher while preferring accuracy during pairwise judgments. To our\nsurprise, we also observe that the choice of feedback protocol also has a\nsignificant effect on the evaluation of aligned LLMs. In particular, we find\nthat LLMs that leverage rankings data for alignment (say model X) are preferred\nover those that leverage ratings data (say model Y), with a rank-based\nevaluation protocol (is X/Y's response better than reference response?) but not\nwith a rating-based evaluation protocol (score Rank X/Y's response on a scale\nof 1-7). Our findings thus shed light on critical gaps in methods for\nevaluating the real-world utility of language models and their strong\ndependence on the feedback protocol used for alignment. Our code and data are\navailable at https://github.com/Hritikbansal/sparse_feedback.\n","authors":["Hritik Bansal","John Dang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2308.15812v1.pdf","comment":"24 pages, 12 Tables, 3 Figures"},{"id":"http://arxiv.org/abs/2308.15793v1","updated":"2023-08-30T06:53:24Z","published":"2023-08-30T06:53:24Z","title":"HAlf-MAsked Model for Named Entity Sentiment analysis","summary":" Named Entity Sentiment analysis (NESA) is one of the most actively developing\napplication domains in Natural Language Processing (NLP). Social media NESA is\na significant field of opinion analysis since detecting and tracking sentiment\ntrends in the news flow is crucial for building various analytical systems and\nmonitoring the media image of specific people or companies. In this paper, we\nstudy different transformers-based solutions NESA in RuSentNE-23 evaluation.\nDespite the effectiveness of the BERT-like models, they can still struggle with\ncertain challenges, such as overfitting, which appeared to be the main obstacle\nin achieving high accuracy on the RuSentNE-23 data. We present several\napproaches to overcome this problem, among which there is a novel technique of\nadditional pass over given data with masked entity before making the final\nprediction so that we can combine logits from the model when it knows the exact\nentity it predicts sentiment for and when it does not. Utilizing this\ntechnique, we ensemble multiple BERT- like models trained on different subsets\nof data to improve overall performance. Our proposed model achieves the best\nresult on RuSentNE-23 evaluation data and demonstrates improved consistency in\nentity-level sentiment analysis.\n","authors":["Anton Kabaev","Pavel Podberezko","Andrey Kaznacheev","Sabina Abdullayeva"],"pdf_url":"https://arxiv.org/pdf/2308.15793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06566v3","updated":"2023-08-30T06:46:43Z","published":"2023-05-11T04:51:21Z","title":"ONCE: Boosting Content-based Recommendation with Both Open- and\n Closed-source Large Language Models","summary":" Personalized content-based recommender systems have become indispensable\ntools for users to navigate through the vast amount of content available on\nplatforms like daily news websites and book recommendation services. However,\nexisting recommenders face significant challenges in understanding the content\nof items. Large language models (LLMs), which possess deep semantic\ncomprehension and extensive knowledge from pretraining, have proven to be\neffective in various natural language processing tasks. In this study, we\nexplore the potential of leveraging both open- and closed-source LLMs to\nenhance content-based recommendation. With open-source LLMs, we utilize their\ndeep layers as content encoders, enriching the representation of content at the\nembedding level. For closed-source LLMs, we employ prompting techniques to\nenrich the training data at the token level. Through comprehensive experiments,\nwe demonstrate the high effectiveness of both types of LLMs and show the\nsynergistic relationship between them. Notably, we observed a significant\nrelative improvement of up to 19.32% compared to existing state-of-the-art\nrecommendation models. These findings highlight the immense potential of both\nopen- and closed-source of LLMs in enhancing content-based recommendation\nsystems. We will make our code and LLM-generated data available for other\nresearchers to reproduce our results.\n","authors":["Qijiong Liu","Nuo Chen","Tetsuya Sakai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2305.06566v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15772v1","updated":"2023-08-30T05:41:29Z","published":"2023-08-30T05:41:29Z","title":"Task-Based MoE for Multitask Multilingual Machine Translation","summary":" Mixture-of-experts (MoE) architecture has been proven a powerful method for\ndiverse tasks in training deep models in many applications. However, current\nMoE implementations are task agnostic, treating all tokens from different tasks\nin the same manner. In this work, we instead design a novel method that\nincorporates task information into MoE models at different granular levels with\nshared dynamic task-based adapters. Our experiments and analysis show the\nadvantages of our approaches over the dense and canonical MoE models on\nmulti-task multilingual machine translations. With task-specific adapters, our\nmodels can additionally generalize to new tasks efficiently.\n","authors":["Hai Pham","Young Jin Kim","Subhabrata Mukherjee","David P. Woodruff","Barnabas Poczos","Hany Hassan Awadalla"],"pdf_url":"https://arxiv.org/pdf/2308.15772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15053v2","updated":"2023-08-30T04:46:19Z","published":"2023-08-29T06:27:58Z","title":"Adapting Text-based Dialogue State Tracker for Spoken Dialogues","summary":" Although there have been remarkable advances in dialogue systems through the\ndialogue systems technology competition (DSTC), it remains one of the key\nchallenges to building a robust task-oriented dialogue system with a speech\ninterface. Most of the progress has been made for text-based dialogue systems\nsince there are abundant datasets with written corpora while those with spoken\ndialogues are very scarce. However, as can be seen from voice assistant systems\nsuch as Siri and Alexa, it is of practical importance to transfer the success\nto spoken dialogues. In this paper, we describe our engineering effort in\nbuilding a highly successful model that participated in the speech-aware\ndialogue systems technology challenge track in DSTC11. Our model consists of\nthree major modules: (1) automatic speech recognition error correction to\nbridge the gap between the spoken and the text utterances, (2) text-based\ndialogue system (D3ST) for estimating the slots and values using slot\ndescriptions, and (3) post-processing for recovering the error of the estimated\nslot value. Our experiments show that it is important to use an explicit\nautomatic speech recognition error correction module, post-processing, and data\naugmentation to adapt a text-based dialogue state tracker for spoken dialogue\ncorpora.\n","authors":["Jaeseok Yoon","Seunghyun Hwang","Ran Han","Jeonguk Bang","Kee-Eung Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15053v2.pdf","comment":"8 pages, 5 figures, Accepted at the DSTC 11 Workshop to be located at\n SIGDIAL 2023"},{"id":"http://arxiv.org/abs/1811.03325v5","updated":"2023-08-30T04:39:22Z","published":"2018-11-08T09:16:19Z","title":"Marshall-Olkin Power-Law Distributions in Length-Frequency of Entities","summary":" Entities involve important concepts with concrete meanings and play important\nroles in numerous linguistic tasks. Entities have different forms in different\nlinguistic tasks and researchers treat those different forms as different\nconcepts. In this paper, we are curious to know whether there are some common\ncharacteristics that connect those different forms of entities. Specifically,\nwe investigate the underlying distributions of entities from different types\nand different languages, trying to figure out some common characteristics\nbehind those diverse entities. After analyzing twelve datasets about different\ntypes of entities and eighteen datasets about entities in different languages,\nwe find that while these entities are dramatically diverse from each other in\nmany aspects, their length-frequencies can be well characterized by a family of\nMarshall-Olkin power-law (MOPL) distributions. We conduct experiments on those\nthirty datasets about entities in different types and different languages, and\nexperimental results demonstrate that MOPL models characterize the\nlength-frequencies of entities much better than two state-of-the-art power-law\nmodels and an alternative log-normal model. Experimental results also\ndemonstrate that MOPL models are scalable to the length-frequency of entities\nin large-scale real-world datasets.\n","authors":["Xiaoshi Zhong","Xiang Yu","Erik Cambria","Jagath C. Rajapakse"],"pdf_url":"https://arxiv.org/pdf/1811.03325v5.pdf","comment":"33 pages, 3 figures (30 subfigures), 8 tables. To appear in\n Knowledge-Based Systems"},{"id":"http://arxiv.org/abs/2308.15745v1","updated":"2023-08-30T03:52:28Z","published":"2023-08-30T03:52:28Z","title":"Cyberbullying Detection for Low-resource Languages and Dialects: Review\n of the State of the Art","summary":" The struggle of social media platforms to moderate content in a timely\nmanner, encourages users to abuse such platforms to spread vulgar or abusive\nlanguage, which, when performed repeatedly becomes cyberbullying a social\nproblem taking place in virtual environments, yet with real-world consequences,\nsuch as depression, withdrawal, or even suicide attempts of its victims.\nSystems for the automatic detection and mitigation of cyberbullying have been\ndeveloped but, unfortunately, the vast majority of them are for the English\nlanguage, with only a handful available for low-resource languages. To estimate\nthe present state of research and recognize the needs for further development,\nin this paper we present a comprehensive systematic survey of studies done so\nfar for automatic cyberbullying detection in low-resource languages. We\nanalyzed all studies on this topic that were available. We investigated more\nthan seventy published studies on automatic detection of cyberbullying or\nrelated language in low-resource languages and dialects that were published\nbetween around 2017 and January 2023. There are 23 low-resource languages and\ndialects covered by this paper, including Bangla, Hindi, Dravidian languages\nand others. In the survey, we identify some of the research gaps of previous\nstudies, which include the lack of reliable definitions of cyberbullying and\nits relevant subcategories, biases in the acquisition, and annotation of data.\nBased on recognizing those research gaps, we provide some suggestions for\nimproving the general research conduct in cyberbullying detection, with a\nprimary focus on low-resource languages. Based on those proposed suggestions,\nwe collect and release a cyberbullying dataset in the Chittagonian dialect of\nBangla and propose a number of initial ML solutions trained on that dataset. In\naddition, pre-trained transformer-based the BanglaBERT model was also\nattempted.\n","authors":["Tanjim Mahmud","Michal Ptaszynski","Juuso Eronen","Fumito Masui"],"pdf_url":"https://arxiv.org/pdf/2308.15745v1.pdf","comment":"52 Pages"},{"id":"http://arxiv.org/abs/2308.03188v2","updated":"2023-08-30T03:47:34Z","published":"2023-08-06T18:38:52Z","title":"Automatically Correcting Large Language Models: Surveying the landscape\n of diverse self-correction strategies","summary":" Large language models (LLMs) have demonstrated remarkable performance across\na wide array of NLP tasks. However, their efficacy is undermined by undesired\nand inconsistent behaviors, including hallucination, unfaithful reasoning, and\ntoxic content. A promising approach to rectify these flaws is self-correction,\nwhere the LLM itself is prompted or guided to fix problems in its own output.\nTechniques leveraging automated feedback -- either produced by the LLM itself\nor some external system -- are of particular interest as they are a promising\nway to make LLM-based solutions more practical and deployable with minimal\nhuman feedback. This paper presents a comprehensive review of this emerging\nclass of techniques. We analyze and taxonomize a wide array of recent work\nutilizing these strategies, including training-time, generation-time, and\npost-hoc correction. We also summarize the major applications of this strategy\nand conclude by discussing future directions and challenges.\n","authors":["Liangming Pan","Michael Saxon","Wenda Xu","Deepak Nathani","Xinyi Wang","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03188v2.pdf","comment":"Work in Progress. Version 2"},{"id":"http://arxiv.org/abs/2308.05361v2","updated":"2023-08-30T03:11:56Z","published":"2023-08-10T06:08:20Z","title":"WeaverBird: Empowering Financial Decision-Making with Large Language\n Model, Knowledge Base, and Search Engine","summary":" We present WeaverBird, an intelligent dialogue system designed specifically\nfor the finance domain. Our system harnesses a large language model of GPT\narchitecture that has been tuned using extensive corpora of finance-related\ntext. As a result, our system possesses the capability to understand complex\nfinancial queries, such as \"How should I manage my investments during\ninflation?\", and provide informed responses. Furthermore, our system\nincorporates a local knowledge base and a search engine to retrieve relevant\ninformation. The final responses are conditioned on the search results and\ninclude proper citations to the sources, thus enjoying an enhanced credibility.\nThrough a range of finance-related questions, we have demonstrated the superior\nperformance of our system compared to other models. To experience our system\nfirsthand, users can interact with our live demo at\nhttps://weaverbird.ttic.edu, as well as watch our 2-min video illustration at\nhttps://www.youtube.com/watch?v=fyV2qQkX6Tc.\n","authors":["Siqiao Xue","Fan Zhou","Yi Xu","Hongyu Zhao","Shuo Xie","Qingyang Dai","Caigao Jiang","James Zhang","Jun Zhou","Dacheng Xiu","Hongyuan Mei"],"pdf_url":"https://arxiv.org/pdf/2308.05361v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15727v1","updated":"2023-08-30T03:06:47Z","published":"2023-08-30T03:06:47Z","title":"Quantifying and Analyzing Entity-level Memorization in Large Language\n Models","summary":" Large language models (LLMs) have been proven capable of memorizing their\ntraining data, which can be extracted through specifically designed prompts. As\nthe scale of datasets continues to grow, privacy risks arising from\nmemorization have attracted increasing attention. Quantifying language model\nmemorization helps evaluate potential privacy risks. However, prior works on\nquantifying memorization require access to the precise original data or incur\nsubstantial computational overhead, making it difficult for applications in\nreal-world language models. To this end, we propose a fine-grained,\nentity-level definition to quantify memorization with conditions and metrics\ncloser to real-world scenarios. In addition, we also present an approach for\nefficiently extracting sensitive entities from autoregressive language models.\nWe conduct extensive experiments based on the proposed, probing language\nmodels' ability to reconstruct sensitive entities under different settings. We\nfind that language models have strong memorization at the entity level and are\nable to reproduce the training data even with partial leakages. The results\ndemonstrate that LLMs not only memorize their training data but also understand\nassociations between entities. These findings necessitate that trainers of LLMs\nexercise greater prudence regarding model memorization, adopting memorization\nmitigation techniques to preclude privacy violations.\n","authors":["Zhenhong Zhou","Jiuyang Xiang","Chaomeng Chen","Sen Su"],"pdf_url":"https://arxiv.org/pdf/2308.15727v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.15711v1","updated":"2023-08-30T02:22:40Z","published":"2023-08-30T02:22:40Z","title":"Optimizing Factual Accuracy in Text Generation through Dynamic Knowledge\n Selection","summary":" Language models (LMs) have revolutionized the way we interact with\ninformation, but they often generate nonfactual text, raising concerns about\ntheir reliability. Previous methods use external knowledge as references for\ntext generation to enhance factuality but often struggle with the knowledge\nmix-up(e.g., entity mismatch) of irrelevant references. Besides,as the length\nof the output text grows, the randomness of sampling can escalate,\ndetrimentally impacting the factual accuracy of the generated text. In this\npaper, we present DKGen, which divide the text generation process into an\niterative process. In each iteration, DKGen takes the input query, the\npreviously generated text and a subset of the reference passages as input to\ngenerate short text. During the process, the subset is dynamically selected\nfrom the full passage set based on their relevance to the previously generated\ntext and the query, largely eliminating the irrelevant references from input.\nTo further enhance DKGen's ability to correctly use these external knowledge,\nDKGen distills the relevance order of reference passages to the cross-attention\ndistribution of decoder. We train and evaluate DKGen on a large-scale benchmark\ndataset. Experiment results show that DKGen outperforms all baseline models.\n","authors":["Hongjin Qian","Zhicheng Dou","Jiejun Tan","Haonan Chen","Haoqi Gu","Ruofei Lai","Xinyu Zhang","Zhao Cao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.15711v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2304.06767v3","updated":"2023-08-30T01:25:29Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to the successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects the high-quality samples, discarding those that exhibit\nundesired behavior, and subsequently enhancing the model by fine-tuning on\nthese filtered samples. Our studies show that RAFT can effectively improve the\nmodel performance in both reward learning and other automated metrics in both\nlarge language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v3.pdf","comment":"26 pages, 8 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.16187v1","updated":"2023-08-30T17:59:11Z","published":"2023-08-30T17:59:11Z","title":"Boosting Detection in Crowd Analysis via Underutilized Output Features","summary":" Detection-based methods have been viewed unfavorably in crowd analysis due to\ntheir poor performance in dense crowds. However, we argue that the potential of\nthese methods has been underestimated, as they offer crucial information for\ncrowd analysis that is often ignored. Specifically, the area size and\nconfidence score of output proposals and bounding boxes provide insight into\nthe scale and density of the crowd. To leverage these underutilized features,\nwe propose Crowd Hat, a plug-and-play module that can be easily integrated with\nexisting detection models. This module uses a mixed 2D-1D compression technique\nto refine the output features and obtain the spatial and numerical distribution\nof crowd-specific information. Based on these features, we further propose\nregion-adaptive NMS thresholds and a decouple-then-align paradigm that address\nthe major limitations of detection-based methods. Our extensive evaluations on\nvarious crowd analysis tasks, including crowd counting, localization, and\ndetection, demonstrate the effectiveness of utilizing output features and the\npotential of detection-based methods in crowd analysis.\n","authors":["Shaokai Wu","Fengyu Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16187v1.pdf","comment":"project page: https://fredfyyang.github.io/Crowd-Hat/"},{"id":"http://arxiv.org/abs/2308.16184v1","updated":"2023-08-30T17:59:02Z","published":"2023-08-30T17:59:02Z","title":"SAM-Med2D","summary":" The Segment Anything Model (SAM) represents a state-of-the-art research\nadvancement in natural image segmentation, achieving impressive results with\ninput prompts such as points and bounding boxes. However, our evaluation and\nrecent research indicate that directly applying the pretrained SAM to medical\nimage segmentation does not yield satisfactory performance. This limitation\nprimarily arises from significant domain gap between natural images and medical\nimages. To bridge this gap, we introduce SAM-Med2D, the most comprehensive\nstudies on applying SAM to medical 2D images. Specifically, we first collect\nand curate approximately 4.6M images and 19.7M masks from public and private\ndatasets, constructing a large-scale medical image segmentation dataset\nencompassing various modalities and objects. Then, we comprehensively fine-tune\nSAM on this dataset and turn it into SAM-Med2D. Unlike previous methods that\nonly adopt bounding box or point prompts as interactive segmentation approach,\nwe adapt SAM to medical image segmentation through more comprehensive prompts\ninvolving bounding boxes, points, and masks. We additionally fine-tune the\nencoder and decoder of the original SAM to obtain a well-performed SAM-Med2D,\nleading to the most comprehensive fine-tuning strategies to date. Finally, we\nconducted a comprehensive evaluation and analysis to investigate the\nperformance of SAM-Med2D in medical image segmentation across various\nmodalities, anatomical structures, and organs. Concurrently, we validated the\ngeneralization capability of SAM-Med2D on 9 datasets from MICCAI 2023\nchallenge. Overall, our approach demonstrated significantly superior\nperformance and generalization capability compared to SAM.\n","authors":["Junlong Cheng","Jin Ye","Zhongying Deng","Jianpin Chen","Tianbin Li","Haoyu Wang","Yanzhou Su","Ziyan Huang","Jilong Chen","Lei Jiang","Hui Sun","Junjun He","Shaoting Zhang","Min Zhu","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2308.16184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16182v1","updated":"2023-08-30T17:58:50Z","published":"2023-08-30T17:58:50Z","title":"GREC: Generalized Referring Expression Comprehension","summary":" The objective of Classic Referring Expression Comprehension (REC) is to\nproduce a bounding box corresponding to the object mentioned in a given textual\ndescription. Commonly, existing datasets and techniques in classic REC are\ntailored for expressions that pertain to a single target, meaning a sole\nexpression is linked to one specific object. Expressions that refer to multiple\ntargets or involve no specific target have not been taken into account. This\nconstraint hinders the practical applicability of REC. This study introduces a\nnew benchmark termed as Generalized Referring Expression Comprehension (GREC).\nThis benchmark extends the classic REC by permitting expressions to describe\nany number of target objects. To achieve this goal, we have built the first\nlarge-scale GREC dataset named gRefCOCO. This dataset encompasses a range of\nexpressions: those referring to multiple targets, expressions with no specific\ntarget, and the single-target expressions. The design of GREC and gRefCOCO\nensures smooth compatibility with classic REC. The proposed gRefCOCO dataset, a\nGREC method implementation code, and GREC evaluation code are available at\nhttps://github.com/henghuiding/gRefCOCO.\n","authors":["Shuting He","Henghui Ding","Chang Liu","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.16182v1.pdf","comment":"GREC Technical Report, Project Page:\n https://henghuiding.github.io/GRES"},{"id":"http://arxiv.org/abs/2303.17590v2","updated":"2023-08-30T17:46:17Z","published":"2023-03-30T17:57:43Z","title":"Going Beyond Nouns With Vision & Language Models Using Synthetic Data","summary":" Large-scale pre-trained Vision & Language (VL) models have shown remarkable\nperformance in many applications, enabling replacing a fixed set of supported\nclasses with zero-shot open vocabulary reasoning over (almost arbitrary)\nnatural language prompts. However, recent works have uncovered a fundamental\nweakness of these models. For example, their difficulty to understand Visual\nLanguage Concepts (VLC) that go 'beyond nouns' such as the meaning of\nnon-object words (e.g., attributes, actions, relations, states, etc.), or\ndifficulty in performing compositional reasoning such as understanding the\nsignificance of the order of the words in a sentence. In this work, we\ninvestigate to which extent purely synthetic data could be leveraged to teach\nthese models to overcome such shortcomings without compromising their zero-shot\ncapabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale\nsynthetic dataset and data generation codebase allowing to generate additional\nsuitable data to improve VLC understanding and compositional reasoning of VL\nmodels. Additionally, we propose a general VL finetuning strategy for\neffectively leveraging SyViC towards achieving these improvements. Our\nextensive experiments and ablations on VL-Checklist, Winoground, and ARO\nbenchmarks demonstrate that it is possible to adapt strong pre-trained VL\nmodels with synthetic data significantly enhancing their VLC understanding\n(e.g. by 9.9% on ARO and 4.3% on VL-Checklist) with under 1% drop in their\nzero-shot accuracy.\n","authors":["Paola Cascante-Bonilla","Khaled Shehada","James Seale Smith","Sivan Doveh","Donghyun Kim","Rameswar Panda","Gül Varol","Aude Oliva","Vicente Ordonez","Rogerio Feris","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2303.17590v2.pdf","comment":"Accepted to ICCV 2023. Project page: https://synthetic-vic.github.io/"},{"id":"http://arxiv.org/abs/2308.16154v1","updated":"2023-08-30T17:20:46Z","published":"2023-08-30T17:20:46Z","title":"MMVP: Motion-Matrix-based Video Prediction","summary":" A central challenge of video prediction lies where the system has to reason\nthe objects' future motions from image frames while simultaneously maintaining\nthe consistency of their appearances across frames. This work introduces an\nend-to-end trainable two-stream video prediction framework, Motion-Matrix-based\nVideo Prediction (MMVP), to tackle this challenge. Unlike previous methods that\nusually handle motion prediction and appearance maintenance within the same set\nof modules, MMVP decouples motion and appearance information by constructing\nappearance-agnostic motion matrices. The motion matrices represent the temporal\nsimilarity of each and every pair of feature patches in the input frames, and\nare the sole input of the motion prediction module in MMVP. This design\nimproves video prediction in both accuracy and efficiency, and reduces the\nmodel size. Results of extensive experiments demonstrate that MMVP outperforms\nstate-of-the-art systems on public data sets by non-negligible large margins\n(about 1 db in PSNR, UCF Sports) in significantly smaller model sizes (84% the\nsize or smaller). Please refer to\nhttps://github.com/Kay1794/MMVP-motion-matrix-based-video-prediction for the\nofficial code and the datasets used in this paper.\n","authors":["Yiqi Zhong","Luming Liang","Ilya Zharkov","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.16154v1.pdf","comment":"ICCV 2023 (Oral)"},{"id":"http://arxiv.org/abs/2308.16150v1","updated":"2023-08-30T17:16:02Z","published":"2023-08-30T17:16:02Z","title":"Modality Cycles with Masked Conditional Diffusion for Unsupervised\n Anomaly Segmentation in MRI","summary":" Unsupervised anomaly segmentation aims to detect patterns that are distinct\nfrom any patterns processed during training, commonly called abnormal or\nout-of-distribution patterns, without providing any associated manual\nsegmentations. Since anomalies during deployment can lead to model failure,\ndetecting the anomaly can enhance the reliability of models, which is valuable\nin high-risk domains like medical imaging. This paper introduces Masked\nModality Cycles with Conditional Diffusion (MMCCD), a method that enables\nsegmentation of anomalies across diverse patterns in multimodal MRI. The method\nis based on two fundamental ideas. First, we propose the use of cyclic modality\ntranslation as a mechanism for enabling abnormality detection.\nImage-translation models learn tissue-specific modality mappings, which are\ncharacteristic of tissue physiology. Thus, these learned mappings fail to\ntranslate tissues or image patterns that have never been encountered during\ntraining, and the error enables their segmentation. Furthermore, we combine\nimage translation with a masked conditional diffusion model, which attempts to\n`imagine' what tissue exists under a masked area, further exposing unknown\npatterns as the generative model fails to recreate them. We evaluate our method\non a proxy task by training on healthy-looking slices of BraTS2021\nmulti-modality MRIs and testing on slices with tumors. We show that our method\ncompares favorably to previous unsupervised approaches based on image\nreconstruction and denoising with autoencoders and diffusion models.\n","authors":["Ziyun Liang","Harry Anthony","Felix Wagner","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2308.16150v1.pdf","comment":"Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI\n 2023"},{"id":"http://arxiv.org/abs/2308.01981v2","updated":"2023-08-30T17:02:55Z","published":"2023-08-03T18:28:50Z","title":"CartiMorph: a framework for automated knee articular cartilage\n morphometrics","summary":" We introduce CartiMorph, a framework for automated knee articular cartilage\nmorphometrics. It takes an image as input and generates quantitative metrics\nfor cartilage subregions, including the percentage of full-thickness cartilage\nloss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the\npower of deep learning models for hierarchical image feature representation.\nDeep learning models were trained and validated for tissue segmentation,\ntemplate construction, and template-to-image registration. We established\nmethods for surface-normal-based cartilage thickness mapping, FCL estimation,\nand rule-based cartilage parcellation. Our cartilage thickness map showed less\nerror in thin and peripheral regions. We evaluated the effectiveness of the\nadopted segmentation model by comparing the quantitative metrics obtained from\nmodel segmentation and those from manual segmentation. The root-mean-squared\ndeviation of the FCL measurements was less than 8%, and strong correlations\nwere observed for the mean thickness (Pearson's correlation coefficient $\\rho\n\\in [0.82,0.97]$), surface area ($\\rho \\in [0.82,0.98]$) and volume ($\\rho \\in\n[0.89,0.98]$) measurements. We compared our FCL measurements with those from a\nprevious study and found that our measurements deviated less from the ground\ntruths. We observed superior performance of the proposed rule-based cartilage\nparcellation method compared with the atlas-based approach. CartiMorph has the\npotential to promote imaging biomarkers discovery for knee osteoarthritis.\n","authors":["Yongcheng Yao","Junru Zhong","Liping Zhang","Sheheryar Khan","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01981v2.pdf","comment":"To be published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.16145v1","updated":"2023-08-30T17:01:01Z","published":"2023-08-30T17:01:01Z","title":"CircleFormer: Circular Nuclei Detection in Whole Slide Images with\n Circle Queries and Attention","summary":" Both CNN-based and Transformer-based object detection with bounding box\nrepresentation have been extensively studied in computer vision and medical\nimage analysis, but circular object detection in medical images is still\nunderexplored. Inspired by the recent anchor free CNN-based circular object\ndetection method (CircleNet) for ball-shape glomeruli detection in renal\npathology, in this paper, we present CircleFormer, a Transformer-based circular\nmedical object detection with dynamic anchor circles. Specifically, queries\nwith circle representation in Transformer decoder iteratively refine the\ncircular object detection results, and a circle cross attention module is\nintroduced to compute the similarity between circular queries and image\nfeatures. A generalized circle IoU (gCIoU) is proposed to serve as a new\nregression loss of circular object detection as well. Moreover, our approach is\neasy to generalize to the segmentation task by adding a simple segmentation\nbranch to CircleFormer. We evaluate our method in circular nuclei detection and\nsegmentation on the public MoNuSeg dataset, and the experimental results show\nthat our method achieves promising performance compared with the\nstate-of-the-art approaches. The effectiveness of each component is validated\nvia ablation studies as well. Our code is released at:\n\\url{https://github.com/zhanghx-iim-ahu/CircleFormer}.\n","authors":["Hengxu Zhang","Pengpeng Liang","Zhiyong Sun","Bo Song","Erkang Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.16145v1.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.16139v1","updated":"2023-08-30T16:52:20Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including, voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising of computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrich Ferndinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Kuestner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Loeffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Eduardo A. Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Andreas Nuernberger","Joao Pedrosa","Carlos Ferreira","Guilherme Aresta","Antonio Cunha","Aurelio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Roehrig","Frank Hoelzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","Andre Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hoerst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dzenan Zukic","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2303.17783v3","updated":"2023-08-30T16:25:02Z","published":"2023-03-31T03:14:44Z","title":"Uncertainty-Aware Source-Free Adaptive Image Super-Resolution with\n Wavelet Augmentation Transformer","summary":" Unsupervised Domain Adaptation (UDA) can effectively address domain gap\nissues in real-world image Super-Resolution (SR) by accessing both the source\nand target data. Considering privacy policies or transmission restrictions of\nsource data in practical scenarios, we propose a SOurce-free Domain Adaptation\nframework for image SR (SODA-SR) to address this issue, i.e., adapt a\nsource-trained model to a target domain with only unlabeled target data.\nSODA-SR leverages the source-trained model to generate refined pseudo-labels\nfor teacher-student learning. To better utilize pseudo-labels, we propose a\nnovel wavelet-based augmentation method, named Wavelet Augmentation Transformer\n(WAT), which can be flexibly incorporated with existing networks, to implicitly\nproduce useful augmented data. WAT learns low-frequency information of varying\nlevels across diverse samples, which is aggregated efficiently via deformable\nattention. Furthermore, an uncertainty-aware self-training mechanism is\nproposed to improve the accuracy of pseudo-labels, with inaccurate predictions\nbeing rectified by uncertainty estimation. To acquire better SR results and\navoid overfitting pseudo-labels, several regularization losses are proposed to\nconstrain target LR and SR images in the frequency domain. Experiments show\nthat without accessing source data, SODA-SR outperforms state-of-the-art UDA\nmethods in both synthetic$\\rightarrow$real and real$\\rightarrow$real adaptation\nsettings, and is not constrained by specific network architectures.\n","authors":["Yuang Ai","Xiaoqiang Zhou","Huaibo Huang","Lei Zhang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2303.17783v3.pdf","comment":"9 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2306.00914v2","updated":"2023-08-30T16:24:15Z","published":"2023-06-01T17:16:37Z","title":"Conditioning Diffusion Models via Attributes and Semantic Masks for Face\n Generation","summary":" Deep generative models have shown impressive results in generating realistic\nimages of faces. GANs managed to generate high-quality, high-fidelity images\nwhen conditioned on semantic masks, but they still lack the ability to\ndiversify their output. Diffusion models partially solve this problem and are\nable to generate diverse samples given the same condition. In this paper, we\npropose a multi-conditioning approach for diffusion models via cross-attention\nexploiting both attributes and semantic masks to generate high-quality and\ncontrollable face images. We also studied the impact of applying\nperceptual-focused loss weighting into the latent space instead of the pixel\nspace. Our method extends the previous approaches by introducing conditioning\non more than one set of features, guaranteeing a more fine-grained control over\nthe generated face images. We evaluate our approach on the CelebA-HQ dataset,\nand we show that it can generate realistic and diverse samples while allowing\nfor fine-grained control over multiple attributes and semantic regions.\nAdditionally, we perform an ablation study to evaluate the impact of different\nconditioning strategies on the quality and diversity of the generated images.\n","authors":["Nico Giambi","Giuseppe Lisanti"],"pdf_url":"https://arxiv.org/pdf/2306.00914v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16126v1","updated":"2023-08-30T16:23:07Z","published":"2023-08-30T16:23:07Z","title":"CorrEmbed: Evaluating Pre-trained Model Image Similarity Efficacy with a\n Novel Metric","summary":" Detecting visually similar images is a particularly useful attribute to look\nto when calculating product recommendations. Embedding similarity, which\nutilizes pre-trained computer vision models to extract high-level image\nfeatures, has demonstrated remarkable efficacy in identifying images with\nsimilar compositions. However, there is a lack of methods for evaluating the\nembeddings generated by these models, as conventional loss and performance\nmetrics do not adequately capture their performance in image similarity search\ntasks.\n In this paper, we evaluate the viability of the image embeddings from\nnumerous pre-trained computer vision models using a novel approach named\nCorrEmbed. Our approach computes the correlation between distances in image\nembeddings and distances in human-generated tag vectors. We extensively\nevaluate numerous pre-trained Torchvision models using this metric, revealing\nan intuitive relationship of linear scaling between ImageNet1k accuracy scores\nand tag-correlation scores. Importantly, our method also identifies deviations\nfrom this pattern, providing insights into how different models capture\nhigh-level image features.\n By offering a robust performance evaluation of these pre-trained models,\nCorrEmbed serves as a valuable tool for researchers and practitioners seeking\nto develop effective, data-driven approaches to similar item recommendations in\nfashion retail.\n","authors":["Karl Audun Kagnes Borgersen","Morten Goodwin","Jivitesh Sharma","Tobias Aasmoe","Mari Leonhardsen","Gro Herredsvela Rørvik"],"pdf_url":"https://arxiv.org/pdf/2308.16126v1.pdf","comment":"Accepted to AI-2023 Forty-third SGAI International Conference on\n Artificial Intelligence"},{"id":"http://arxiv.org/abs/2308.16110v1","updated":"2023-08-30T16:10:21Z","published":"2023-08-30T16:10:21Z","title":"Improving Few-shot Image Generation by Structural Discrimination and\n Textural Modulation","summary":" Few-shot image generation, which aims to produce plausible and diverse images\nfor one category given a few images from this category, has drawn extensive\nattention. Existing approaches either globally interpolate different images or\nfuse local representations with pre-defined coefficients. However, such an\nintuitive combination of images/features only exploits the most relevant\ninformation for generation, leading to poor diversity and coarse-grained\nsemantic fusion. To remedy this, this paper proposes a novel textural\nmodulation (TexMod) mechanism to inject external semantic signals into internal\nlocal representations. Parameterized by the feedback from the discriminator,\nour TexMod enables more fined-grained semantic injection while maintaining the\nsynthesis fidelity. Moreover, a global structural discriminator (StructD) is\ndeveloped to explicitly guide the model to generate images with reasonable\nlayout and outline. Furthermore, the frequency awareness of the model is\nreinforced by encouraging the model to distinguish frequency signals. Together\nwith these techniques, we build a novel and effective model for few-shot image\ngeneration. The effectiveness of our model is identified by extensive\nexperiments on three popular datasets and various settings. Besides achieving\nstate-of-the-art synthesis performance on these datasets, our proposed\ntechniques could be seamlessly integrated into existing models for a further\nperformance boost.\n","authors":["Mengping Yang","Zhe Wang","Wenyi Feng","Qian Zhang","Ting Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.16110v1.pdf","comment":"To appear in ACM MM 2023, code is available at\n https://github.com/kobeshegu/SDTM-GAN-ACMMM-2023"},{"id":"http://arxiv.org/abs/2305.11582v2","updated":"2023-08-30T16:06:27Z","published":"2023-05-19T10:43:57Z","title":"What You Hear Is What You See: Audio Quality Metrics From Image Quality\n Metrics","summary":" In this study, we investigate the feasibility of utilizing state-of-the-art\nimage perceptual metrics for evaluating audio signals by representing them as\nspectrograms. The encouraging outcome of the proposed approach is based on the\nsimilarity between the neural mechanisms in the auditory and visual pathways.\nFurthermore, we customise one of the metrics which has a psychoacoustically\nplausible architecture to account for the peculiarities of sound signals. We\nevaluate the effectiveness of our proposed metric and several baseline metrics\nusing a music dataset, with promising results in terms of the correlation\nbetween the metrics and the perceived quality of audio as rated by human\nevaluators.\n","authors":["Tashi Namgyal","Alexander Hepburn","Raul Santos-Rodriguez","Valero Laparra","Jesus Malo"],"pdf_url":"https://arxiv.org/pdf/2305.11582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15745v2","updated":"2023-08-30T15:58:56Z","published":"2023-07-28T18:01:08Z","title":"Context-VQA: Towards Context-Aware and Purposeful Visual Question\n Answering","summary":" Visual question answering (VQA) has the potential to make the Internet more\naccessible in an interactive way, allowing people who cannot see images to ask\nquestions about them. However, multiple studies have shown that people who are\nblind or have low-vision prefer image explanations that incorporate the context\nin which an image appears, yet current VQA datasets focus on images in\nisolation. We argue that VQA models will not fully succeed at meeting people's\nneeds unless they take context into account. To further motivate and analyze\nthe distinction between different contexts, we introduce Context-VQA, a VQA\ndataset that pairs images with contexts, specifically types of websites (e.g.,\na shopping website). We find that the types of questions vary systematically\nacross contexts. For example, images presented in a travel context garner 2\ntimes more \"Where?\" questions, and images on social media and news garner 2.8\nand 1.8 times more \"Who?\" questions than the average. We also find that context\neffects are especially important when participants can't see the image. These\nresults demonstrate that context affects the types of questions asked and that\nVQA models should be context-sensitive to better meet people's needs,\nespecially in accessibility settings.\n","authors":["Nandita Naik","Christopher Potts","Elisa Kreiss"],"pdf_url":"https://arxiv.org/pdf/2307.15745v2.pdf","comment":"Proceedings of ICCV 2023 Workshop on Closing the Loop Between Vision\n and Language"},{"id":"http://arxiv.org/abs/2308.14480v2","updated":"2023-08-30T15:33:01Z","published":"2023-08-28T10:40:16Z","title":"Priority-Centric Human Motion Generation in Discrete Latent Space","summary":" Text-to-motion generation is a formidable task, aiming to produce human\nmotions that align with the input text while also adhering to human\ncapabilities and physical laws. While there have been advancements in diffusion\nmodels, their application in discrete spaces remains underexplored. Current\nmethods often overlook the varying significance of different motions, treating\nthem uniformly. It is essential to recognize that not all motions hold the same\nrelevance to a particular textual description. Some motions, being more salient\nand informative, should be given precedence during generation. In response, we\nintroduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which\nutilizes a Transformer-based VQ-VAE to derive a concise, discrete motion\nrepresentation, incorporating a global self-attention mechanism and a\nregularization term to counteract code collapse. We also present a motion\ndiscrete diffusion model that employs an innovative noise schedule, determined\nby the significance of each motion token within the entire motion sequence.\nThis approach retains the most salient motions during the reverse diffusion\nprocess, leading to more semantically rich and varied motions. Additionally, we\nformulate two strategies to gauge the importance of motion tokens, drawing from\nboth textual and visual indicators. Comprehensive experiments on the HumanML3D\nand KIT-ML datasets confirm that our model surpasses existing techniques in\nfidelity and diversity, particularly for intricate textual descriptions.\n","authors":["Hanyang Kong","Kehong Gong","Dongze Lian","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14480v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.16083v1","updated":"2023-08-30T15:15:31Z","published":"2023-08-30T15:15:31Z","title":"Learned Image Reasoning Prior Penetrates Deep Unfolding Network for\n Panchromatic and Multi-Spectral Image Fusion","summary":" The success of deep neural networks for pan-sharpening is commonly in a form\nof black box, lacking transparency and interpretability. To alleviate this\nissue, we propose a novel model-driven deep unfolding framework with image\nreasoning prior tailored for the pan-sharpening task. Different from existing\nunfolding solutions that deliver the proximal operator networks as the\nuncertain and vague priors, our framework is motivated by the content reasoning\nability of masked autoencoders (MAE) with insightful designs. Specifically, the\npre-trained MAE with spatial masking strategy, acting as intrinsic reasoning\nprior, is embedded into unfolding architecture. Meanwhile, the pre-trained MAE\nwith spatial-spectral masking strategy is treated as the regularization term\nwithin loss function to constrain the spatial-spectral consistency. Such\ndesigns penetrate the image reasoning prior into deep unfolding networks while\nimproving its interpretability and representation capability. The uniqueness of\nour framework is that the holistic learning process is explicitly integrated\nwith the inherent physical mechanism underlying the pan-sharpening task.\nExtensive experiments on multiple satellite datasets demonstrate the\nsuperiority of our method over the existing state-of-the-art approaches. Code\nwill be released at \\url{https://manman1995.github.io/}.\n","authors":["Man Zhou","Jie Huang","Naishan Zheng","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.16083v1.pdf","comment":"10 pages; Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16082v1","updated":"2023-08-30T15:14:56Z","published":"2023-08-30T15:14:56Z","title":"SignDiff: Learning Diffusion Models for American Sign Language\n Production","summary":" The field of Sign Language Production (SLP) lacked a large-scale, pre-trained\nmodel based on deep learning for continuous American Sign Language (ASL)\nproduction in the past decade. This limitation hampers communication for all\nindividuals with disabilities relying on ASL. To address this issue, we\nundertook the secondary development and utilization of How2Sign, one of the\nlargest publicly available ASL datasets. Despite its significance, prior\nresearchers in the field of sign language have not effectively employed this\ncorpus due to the intricacies involved in American Sign Language Production\n(ASLP).\n To conduct large-scale ASLP, we propose SignDiff based on the latest work in\nrelated fields, which is a dual-condition diffusion pre-training model that can\ngenerate human sign language speakers from a skeleton pose. SignDiff has a\nnovel Frame Reinforcement Network called FR-Net, similar to dense human pose\nestimation work, which enhances the correspondence between text lexical symbols\nand sign language dense pose frames reduce the occurrence of multiple fingers\nin the diffusion model. In addition, our ASLP method proposes two new improved\nmodules and a new loss function to improve the accuracy and quality of sign\nlanguage skeletal posture and enhance the ability of the model to train on\nlarge-scale data.\n We propose the first baseline for ASL production and report the scores of\n17.19 and 12.85 on BLEU-4 on the How2Sign dev/test sets. We also evaluated our\nmodel on the previous mainstream dataset called PHOENIX14T, and the main\nexperiments achieved the results of SOTA. In addition, our image quality far\nexceeds all previous results by 10 percentage points on the SSIM indicator.\nFinally, we conducted ablation studies and qualitative evaluations for\ndiscussion.\n","authors":["Sen Fang","Chunyu Sui","Xuedong Zhang","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2308.16082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16075v1","updated":"2023-08-30T14:52:14Z","published":"2023-08-30T14:52:14Z","title":"Impact of Visual Context on Noisy Multimodal NMT: An Empirical Study for\n English to Indian Languages","summary":" The study investigates the effectiveness of utilizing multimodal information\nin Neural Machine Translation (NMT). While prior research focused on using\nmultimodal data in low-resource scenarios, this study examines how image\nfeatures impact translation when added to a large-scale, pre-trained unimodal\nNMT system. Surprisingly, the study finds that images might be redundant in\nthis context. Additionally, the research introduces synthetic noise to assess\nwhether images help the model deal with textual noise. Multimodal models\nslightly outperform text-only models in noisy settings, even with random\nimages. The study's experiments translate from English to Hindi, Bengali, and\nMalayalam, outperforming state-of-the-art benchmarks significantly.\nInterestingly, the effect of visual context varies with source text noise: no\nvisual context works best for non-noisy translations, cropped image features\nare optimal for low noise, and full image features work better in high-noise\nscenarios. This sheds light on the role of visual context, especially in noisy\nsettings, opening up a new research direction for Noisy Neural Machine\nTranslation in multimodal setups. The research emphasizes the importance of\ncombining visual and textual information for improved translation in various\nenvironments.\n","authors":["Baban Gain","Dibyanayan Bandyopadhyay","Samrat Mukherjee","Chandranath Adak","Asif Ekbal"],"pdf_url":"https://arxiv.org/pdf/2308.16075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16071v1","updated":"2023-08-30T14:49:34Z","published":"2023-08-30T14:49:34Z","title":"Semantic Image Synthesis via Class-Adaptive Cross-Attention","summary":" In semantic image synthesis, the state of the art is dominated by methods\nthat use spatially-adaptive normalization layers, which allow for excellent\nvisual generation quality and editing versatility. Granted their efficacy,\nrecent research efforts have focused toward finer-grained local style control\nand multi-modal generation. By construction though, such layers tend to\noverlook global image statistics leading to unconvincing local style editing\nand causing global inconsistencies such as color or illumination distribution\nshifts. Also, the semantic layout is required for mapping styles in the\ngenerator, putting a strict alignment constraint over the features. In\nresponse, we designed a novel architecture where cross-attention layers are\nused in place of de-normalization ones for conditioning the image generation.\nOur model inherits the advantages of both solutions, retaining state-of-the-art\nreconstruction quality, as well as improved global and local style transfer.\nCode and models available at https://github.com/TFonta/CA2SIS.\n","authors":["Tomaso Fontanini","Claudio Ferrari","Giuseppe Lisanti","Massimo Bertozzi","Andrea Prati"],"pdf_url":"https://arxiv.org/pdf/2308.16071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.07752v2","updated":"2023-08-30T14:39:24Z","published":"2021-07-16T08:07:22Z","title":"NeXtQSM -- A complete deep learning pipeline for data-consistent\n quantitative susceptibility mapping trained with hybrid data","summary":" Deep learning based Quantitative Susceptibility Mapping (QSM) has shown great\npotential in recent years, obtaining similar results to established\nnon-learning approaches. Many current deep learning approaches are not data\nconsistent, require in vivo training data or solve the QSM problem in\nconsecutive steps resulting in the propagation of errors. Here we aim to\novercome these limitations and developed a framework to solve the QSM\nprocessing steps jointly. We developed a new hybrid training data generation\nmethod that enables the end-to-end training for solving background field\ncorrection and dipole inversion in a data-consistent fashion using a\nvariational network that combines the QSM model term and a learned regularizer.\nWe demonstrate that NeXtQSM overcomes the limitations of previous deep learning\nmethods. NeXtQSM offers a new deep learning based pipeline for computing\nquantitative susceptibility maps that integrates each processing step into the\ntraining and provides results that are robust and fast.\n","authors":["Francesco Cognolato","Kieran O'Brien","Jin Jin","Simon Robinson","Frederik B. Laun","Markus Barth","Steffen Bollmann"],"pdf_url":"https://arxiv.org/pdf/2107.07752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08637v2","updated":"2023-08-30T14:28:37Z","published":"2023-06-14T17:07:51Z","title":"TAPIR: Tracking Any Point with per-frame Initialization and temporal\n Refinement","summary":" We present a novel model for Tracking Any Point (TAP) that effectively tracks\nany queried point on any physical surface throughout a video sequence. Our\napproach employs two stages: (1) a matching stage, which independently locates\na suitable candidate point match for the query point on every other frame, and\n(2) a refinement stage, which updates both the trajectory and query features\nbased on local correlations. The resulting model surpasses all baseline methods\nby a significant margin on the TAP-Vid benchmark, as demonstrated by an\napproximate 20% absolute average Jaccard (AJ) improvement on DAVIS. Our model\nfacilitates fast inference on long and high-resolution video sequences. On a\nmodern GPU, our implementation has the capacity to track points faster than\nreal-time, and can be flexibly extended to higher-resolution videos. Given the\nhigh-quality trajectories extracted from a large dataset, we demonstrate a\nproof-of-concept diffusion model which generates trajectories from static\nimages, enabling plausible animations. Visualizations, source code, and\npretrained models can be found on our project webpage.\n","authors":["Carl Doersch","Yi Yang","Mel Vecerik","Dilara Gokay","Ankush Gupta","Yusuf Aytar","Joao Carreira","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2306.08637v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2210.05152v5","updated":"2023-08-30T14:24:46Z","published":"2022-10-11T05:11:41Z","title":"TriangleNet: Edge Prior Augmented Network for Semantic Segmentation\n through Cross-Task Consistency","summary":" This paper addresses the task of semantic segmentation in computer vision,\naiming to achieve precise pixel-wise classification. We investigate the joint\ntraining of models for semantic edge detection and semantic segmentation, which\nhas shown promise. However, implicit cross-task consistency learning in\nmulti-task networks is limited. To address this, we propose a novel \"decoupled\ncross-task consistency loss\" that explicitly enhances cross-task consistency.\nOur semantic segmentation network, TriangleNet, achieves a substantial 2.88\\%\nimprovement over the Baseline in mean Intersection over Union (mIoU) on the\nCityscapes test set. Notably, TriangleNet operates at 77.4\\% mIoU/46.2 FPS on\nCityscapes, showcasing real-time inference capabilities at full resolution.\nWith multi-scale inference, performance is further enhanced to 77.8\\%.\nFurthermore, TriangleNet consistently outperforms the Baseline on the FloodNet\ndataset, demonstrating its robust generalization capabilities. The proposed\nmethod underscores the significance of multi-task learning and explicit\ncross-task consistency enhancement for advancing semantic segmentation and\nhighlights the potential of multitasking in real-time semantic segmentation.\n","authors":["Dan Zhang","Rui Zheng","Luosang Gadeng","Pei Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05152v5.pdf","comment":"Accepted for publication in the journal \"International Journal of\n Intelligent Systems\""},{"id":"http://arxiv.org/abs/2302.14416v3","updated":"2023-08-30T14:22:32Z","published":"2023-02-28T08:48:45Z","title":"DREAM: Efficient Dataset Distillation by Representative Matching","summary":" Dataset distillation aims to synthesize small datasets with little\ninformation loss from original large-scale ones for reducing storage and\ntraining costs. Recent state-of-the-art methods mainly constrain the sample\nsynthesis process by matching synthetic images and the original ones regarding\ngradients, embedding distributions, or training trajectories. Although there\nare various matching objectives, currently the strategy for selecting original\nimages is limited to naive random sampling.\n We argue that random sampling overlooks the evenness of the selected sample\ndistribution, which may result in noisy or biased matching targets.\n Besides, the sample diversity is also not constrained by random sampling.\nThese factors together lead to optimization instability in the distilling\nprocess and degrade the training efficiency. Accordingly, we propose a novel\nmatching strategy named as \\textbf{D}ataset distillation by\n\\textbf{RE}present\\textbf{A}tive \\textbf{M}atching (DREAM), where only\nrepresentative original images are selected for matching. DREAM is able to be\neasily plugged into popular dataset distillation frameworks and reduce the\ndistilling iterations by more than 8 times without performance drop. Given\nsufficient training time, DREAM further provides significant improvements and\nachieves state-of-the-art performances.\n","authors":["Yanqing Liu","Jianyang Gu","Kai Wang","Zheng Zhu","Wei Jiang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2302.14416v3.pdf","comment":"Efficient matching for dataset distillation"},{"id":"http://arxiv.org/abs/2308.14500v2","updated":"2023-08-30T14:18:58Z","published":"2023-08-28T11:20:48Z","title":"LAC -- Latent Action Composition for Skeleton-based Action Segmentation","summary":" Skeleton-based action segmentation requires recognizing composable actions in\nuntrimmed videos. Current approaches decouple this problem by first extracting\nlocal visual features from skeleton sequences and then processing them by a\ntemporal model to classify frame-wise actions. However, their performances\nremain limited as the visual features cannot sufficiently express composable\nactions. In this context, we propose Latent Action Composition (LAC), a novel\nself-supervised framework aiming at learning from synthesized composable\nmotions for skeleton-based action segmentation. LAC is composed of a novel\ngeneration module towards synthesizing new sequences. Specifically, we design a\nlinear latent space in the generator to represent primitive motion. New\ncomposed motions can be synthesized by simply performing arithmetic operations\non latent representations of multiple input skeleton sequences. LAC leverages\nsuch synthesized sequences, which have large diversity and complexity, for\nlearning visual representations of skeletons in both sequence and frame spaces\nvia contrastive learning. The resulting visual encoder has a high expressive\npower and can be effectively transferred onto action segmentation tasks by\nend-to-end fine-tuning without the need for additional temporal models. We\nconduct a study focusing on transfer-learning and we show that representations\nlearned from pre-trained LAC outperform the state-of-the-art by a large margin\non TSU, Charades, PKU-MMD datasets.\n","authors":["Di Yang","Yaohui Wang","Antitza Dantcheva","Quan Kong","Lorenzo Garattoni","Gianpiero Francesca","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2308.14500v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2303.12247v2","updated":"2023-08-30T14:09:13Z","published":"2023-03-22T01:01:14Z","title":"Exploring the Benefits of Visual Prompting in Differential Privacy","summary":" Visual Prompting (VP) is an emerging and powerful technique that allows\nsample-efficient adaptation to downstream tasks by engineering a well-trained\nfrozen source model. In this work, we explore the benefits of VP in\nconstructing compelling neural network classifiers with differential privacy\n(DP). We explore and integrate VP into canonical DP training methods and\ndemonstrate its simplicity and efficiency. In particular, we discover that VP\nin tandem with PATE, a state-of-the-art DP training method that leverages the\nknowledge transfer from an ensemble of teachers, achieves the state-of-the-art\nprivacy-utility trade-off with minimum expenditure of privacy budget. Moreover,\nwe conduct additional experiments on cross-domain image classification with a\nsufficient domain gap to further unveil the advantage of VP in DP. Lastly, we\nalso conduct extensive ablation studies to validate the effectiveness and\ncontribution of VP under DP consideration. Our code is available at\n(https://github.com/EzzzLi/Prompt-PATE).\n","authors":["Yizhe Li","Yu-Lin Tsai","Xuebin Ren","Chia-Mu Yu","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12247v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2306.06208v3","updated":"2023-08-30T14:07:49Z","published":"2023-06-05T23:07:01Z","title":"DeltaNN: Assessing the Impact of Computational Environment Parameters on\n the Performance of Image Recognition Models","summary":" Image recognition tasks typically use deep learning and require enormous\nprocessing power, thus relying on hardware accelerators like GPUs and TPUs for\nfast, timely processing. Failure in real-time image recognition tasks can occur\ndue to sub-optimal mapping on hardware accelerators during model deployment,\nwhich may lead to timing uncertainty and erroneous behavior. Mapping on\nhardware accelerators is done using multiple software components like deep\nlearning frameworks, compilers, and device libraries, that we refer to as the\ncomputational environment. Owing to the increased use of image recognition\ntasks in safety-critical applications like autonomous driving and medical\nimaging, it is imperative to assess their robustness to changes in the\ncomputational environment, as the impact of parameters like deep learning\nframeworks, compiler optimizations, and hardware devices on model performance\nand correctness is not yet well understood.\n In this paper we present a differential testing framework, DeltaNN, that\nallows us to assess the impact of different computational environment\nparameters on the performance of image recognition models during deployment,\npost training. DeltaNN generates different implementations of a given image\nrecognition model for variations in environment parameters, namely, deep\nlearning frameworks, compiler optimizations and hardware devices and analyzes\ndifferences in model performance as a result. Using DeltaNN, we conduct an\nempirical study of robustness analysis of three popular image recognition\nmodels using the ImageNet dataset. We report the impact in terms of\nmisclassifications and inference time differences across different settings. In\ntotal, we observed up to 72% output label differences across deep learning\nframeworks, and up to 81% unexpected performance degradation in terms of\ninference time, when applying compiler optimizations.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06208v3.pdf","comment":"11 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2305.08854v2","updated":"2023-08-30T14:01:36Z","published":"2023-05-15T17:59:57Z","title":"Laughing Matters: Introducing Laughing-Face Generation using Diffusion\n Models","summary":" Speech-driven animation has gained significant traction in recent years, with\ncurrent methods achieving near-photorealistic results. However, the field\nremains underexplored regarding non-verbal communication despite evidence\ndemonstrating its importance in human interaction. In particular, generating\nlaughter sequences presents a unique challenge due to the intricacy and nuances\nof this behaviour. This paper aims to bridge this gap by proposing a novel\nmodel capable of generating realistic laughter sequences, given a still\nportrait and an audio clip containing laughter. We highlight the failure cases\nof traditional facial animation methods and leverage recent advances in\ndiffusion models to produce convincing laughter videos. We train our model on a\ndiverse set of laughter datasets and introduce an evaluation metric\nspecifically designed for laughter. When compared with previous speech-driven\napproaches, our model achieves state-of-the-art performance across all metrics,\neven when these are re-trained for laughter generation. Our code and project\nare publicly available\n","authors":["Antoni Bigata Casademunt","Rodrigo Mira","Nikita Drobyshev","Konstantinos Vougioukas","Stavros Petridis","Maja Pantic"],"pdf_url":"https://arxiv.org/pdf/2305.08854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16041v1","updated":"2023-08-30T14:00:48Z","published":"2023-08-30T14:00:48Z","title":"From Pixels to Portraits: A Comprehensive Survey of Talking Head\n Generation Techniques and Applications","summary":" Recent advancements in deep learning and computer vision have led to a surge\nof interest in generating realistic talking heads. This paper presents a\ncomprehensive survey of state-of-the-art methods for talking head generation.\nWe systematically categorises them into four main approaches: image-driven,\naudio-driven, video-driven and others (including neural radiance fields (NeRF),\nand 3D-based methods). We provide an in-depth analysis of each method,\nhighlighting their unique contributions, strengths, and limitations.\nFurthermore, we thoroughly compare publicly available models, evaluating them\non key aspects such as inference time and human-rated quality of the generated\noutputs. Our aim is to provide a clear and concise overview of the current\nlandscape in talking head generation, elucidating the relationships between\ndifferent approaches and identifying promising directions for future research.\nThis survey will serve as a valuable reference for researchers and\npractitioners interested in this rapidly evolving field.\n","authors":["Shreyank N Gowda","Dheeraj Pandey","Shashank Narayana Gowda"],"pdf_url":"https://arxiv.org/pdf/2308.16041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10611v2","updated":"2023-08-30T13:50:25Z","published":"2023-01-25T14:45:13Z","title":"Discriminator-free Unsupervised Domain Adaptation for Multi-label Image\n Classification","summary":" In this paper, a discriminator-free adversarial-based Unsupervised Domain\nAdaptation (UDA) for Multi-Label Image Classification (MLIC) referred to as\nDDA-MLIC is proposed. Recently, some attempts have been made for introducing\nadversarial-based UDA methods in the context of MLIC. However, these methods\nwhich rely on an additional discriminator subnet present one major shortcoming.\nThe learning of domain-invariant features may harm their task-specific\ndiscriminative power, since the classification and discrimination tasks are\ndecoupled. Herein, we propose to overcome this issue by introducing a novel\nadversarial critic that is directly deduced from the task-specific classifier.\nSpecifically, a two-component Gaussian Mixture Model (GMM) is fitted on the\nsource and target predictions in order to distinguish between two clusters.\nThis allows extracting a Gaussian distribution for each component. The\nresulting Gaussian distributions are then used for formulating an adversarial\nloss based on a Frechet distance. The proposed method is evaluated on several\nmulti-label image datasets covering three different types of domain shift. The\nobtained results demonstrate that DDA-MLIC outperforms existing\nstate-of-the-art methods in terms of precision while requiring a lower number\nof parameters. The code will be made publicly available online.\n","authors":["Indel Pal Singh","Enjie Ghorbel","Anis Kacem","Arunkumar Rathinam","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2301.10611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06157v3","updated":"2023-08-30T13:41:23Z","published":"2023-06-10T23:50:02Z","title":"Fault Localization for Buggy Deep Learning Framework Conversions in\n Image Recognition","summary":" When deploying Deep Neural Networks (DNNs), developers often convert models\nfrom one deep learning framework to another (e.g., TensorFlow to PyTorch).\nHowever, this process is error-prone and can impact target model accuracy. To\nidentify the extent of such impact, we perform and briefly present a\ndifferential analysis against three DNNs widely used for image recognition\n(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep\nlearning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which\nrevealed numerous model crashes and output label discrepancies of up to 72%. To\nmitigate such errors, we present a novel approach towards fault localization\nand repair of buggy deep learning framework conversions, focusing on\npre-trained image recognition models. Our technique consists of four stages of\nanalysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters,\nand 4) graph representation. In addition, we propose various strategies towards\nfault repair of the faults detected. We implement our technique on top of the\nApache TVM deep learning compiler, and we test it by conducting a preliminary\nfault localization analysis for the conversion of InceptionV3 from TF to\nTFLite. Our approach detected a fault in a common DNN converter tool, which\nintroduced precision errors in weights, reducing model accuracy. After our\nfault localization, we repaired the issue, reducing our conversion error to\nzero.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06157v3.pdf","comment":"5 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.14074v2","updated":"2023-08-30T13:40:21Z","published":"2023-08-27T11:37:26Z","title":"Nonrigid Object Contact Estimation With Regional Unwrapping Transformer","summary":" Acquiring contact patterns between hands and nonrigid objects is a common\nconcern in the vision and robotics community. However, existing learning-based\nmethods focus more on contact with rigid ones from monocular images. When\nadopting them for nonrigid contact, a major problem is that the existing\ncontact representation is restricted by the geometry of the object.\nConsequently, contact neighborhoods are stored in an unordered manner and\ncontact features are difficult to align with image cues. At the core of our\napproach lies a novel hand-object contact representation called RUPs (Region\nUnwrapping Profiles), which unwrap the roughly estimated hand-object surfaces\nas multiple high-resolution 2D regional profiles. The region grouping strategy\nis consistent with the hand kinematic bone division because they are the\nprimitive initiators for a composite contact pattern. Based on this\nrepresentation, our Regional Unwrapping Transformer (RUFormer) learns the\ncorrelation priors across regions from monocular inputs and predicts\ncorresponding contact and deformed transformations. Our experiments demonstrate\nthat the proposed framework can robustly estimate the deformed degrees and\ndeformed transformations, which makes it suitable for both nonrigid and rigid\ncontact.\n","authors":["Wei Xie","Zimeng Zhao","Shiying Li","Binghui Zuo","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14074v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.15016v2","updated":"2023-08-30T13:33:59Z","published":"2023-07-27T17:19:32Z","title":"How Good is Google Bard's Visual Understanding? An Empirical Study on\n Open Challenges","summary":" Google's Bard has emerged as a formidable competitor to OpenAI's ChatGPT in\nthe field of conversational AI. Notably, Bard has recently been updated to\nhandle visual inputs alongside text prompts during conversations. Given Bard's\nimpressive track record in handling textual inputs, we explore its capabilities\nin understanding and interpreting visual data (images) conditioned by text\nquestions. This exploration holds the potential to unveil new insights and\nchallenges for Bard and other forthcoming multi-modal Generative models,\nespecially in addressing complex computer vision problems that demand accurate\nvisual and language understanding. Specifically, in this study, we focus on 15\ndiverse task scenarios encompassing regular, camouflaged, medical, under-water\nand remote sensing data to comprehensively evaluate Bard's performance. Our\nprimary finding indicates that Bard still struggles in these vision scenarios,\nhighlighting the significant gap in vision-based understanding that needs to be\nbridged in future developments. We expect that this empirical study will prove\nvaluable in advancing future models, leading to enhanced capabilities in\ncomprehending and interpreting fine-grained visual data. Our project is\nreleased on https://github.com/htqin/GoogleBard-VisUnderstand\n","authors":["Haotong Qin","Ge-Peng Ji","Salman Khan","Deng-Ping Fan","Fahad Shahbaz Khan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2307.15016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14036v2","updated":"2023-08-30T13:27:35Z","published":"2023-08-27T08:10:23Z","title":"MB-TaylorFormer: Multi-branch Efficient Transformer Expanded by Taylor\n Formula for Image Dehazing","summary":" In recent years, Transformer networks are beginning to replace pure\nconvolutional neural networks (CNNs) in the field of computer vision due to\ntheir global receptive field and adaptability to input. However, the quadratic\ncomputational complexity of softmax-attention limits the wide application in\nimage dehazing task, especially for high-resolution images. To address this\nissue, we propose a new Transformer variant, which applies the Taylor expansion\nto approximate the softmax-attention and achieves linear computational\ncomplexity. A multi-scale attention refinement module is proposed as a\ncomplement to correct the error of the Taylor expansion. Furthermore, we\nintroduce a multi-branch architecture with multi-scale patch embedding to the\nproposed Transformer, which embeds features by overlapping deformable\nconvolution of different scales. The design of multi-scale patch embedding is\nbased on three key ideas: 1) various sizes of the receptive field; 2)\nmulti-level semantic information; 3) flexible shapes of the receptive field.\nOur model, named Multi-branch Transformer expanded by Taylor formula\n(MB-TaylorFormer), can embed coarse to fine features more flexibly at the patch\nembedding stage and capture long-distance pixel interactions with limited\ncomputational cost. Experimental results on several dehazing benchmarks show\nthat MB-TaylorFormer achieves state-of-the-art (SOTA) performance with a light\ncomputational burden. The source code and pre-trained models are available at\nhttps://github.com/FVL2020/ICCV-2023-MB-TaylorFormer.\n","authors":["Yuwei Qiu","Kaihao Zhang","Chenxi Wang","Wenhan Luo","Hongdong Li","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2308.14036v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16018v1","updated":"2023-08-30T13:20:54Z","published":"2023-08-30T13:20:54Z","title":"Topology-aware MLP for Skeleton-based Action Recognition","summary":" Graph convolution networks (GCNs) have achieved remarkable performance in\nskeleton-based action recognition. However, existing previous GCN-based methods\nhave relied excessively on elaborate human body priors and constructed complex\nfeature aggregation mechanisms, which limits the generalizability of networks.\nTo solve these problems, we propose a novel Spatial Topology Gating Unit\n(STGU), which is an MLP-based variant without extra priors, to capture the\nco-occurrence topology features that encode the spatial dependency across all\njoints. In STGU, to model the sample-specific and completely independent\npoint-wise topology attention, a new gate-based feature interaction mechanism\nis introduced to activate the features point-to-point by the attention map\ngenerated from the input. Based on the STGU, in this work, we propose the first\ntopology-aware MLP-based model, Ta-MLP, for skeleton-based action recognition.\nIn comparison with existing previous methods on three large-scale datasets,\nTa-MLP achieves competitive performance. In addition, Ta-MLP reduces the\nparameters by up to 62.5% with favorable results. Compared with previous\nstate-of-the-art (SOAT) approaches, Ta-MLP pushes the frontier of real-time\naction recognition. The code will be available at\nhttps://github.com/BUPTSJZhang/Ta-MLP.\n","authors":["Shaojie Zhang","Jianqin Yin","Yonghao Dang","Jiajun Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06289v2","updated":"2023-08-30T13:01:39Z","published":"2023-06-09T22:29:56Z","title":"SegViTv2: Exploring Efficient and Continual Semantic Segmentation with\n Plain Vision Transformers","summary":" This paper investigates the capability of plain Vision Transformers (ViTs)\nfor semantic segmentation using the encoder-decoder framework and introduces\n\\textbf{SegViTv2}. In this study, we introduce a novel Attention-to-Mask (\\atm)\nmodule to design a lightweight decoder effective for plain ViT. The proposed\nATM converts the global attention map into semantic masks for high-quality\nsegmentation results. Our decoder outperforms the popular decoder UPerNet using\nvarious ViT backbones while consuming only about $5\\%$ of the computational\ncost. For the encoder, we address the concern of the relatively high\ncomputational cost in the ViT-based encoders and propose a \\emph{Shrunk++}\nstructure that incorporates edge-aware query-based down-sampling (EQD) and\nquery-based upsampling (QU) modules. The Shrunk++ structure reduces the\ncomputational cost of the encoder by up to $50\\%$ while maintaining competitive\nperformance. Furthermore, we propose to adapt SegViT for continual semantic\nsegmentation, demonstrating nearly zero forgetting of previously learned\nknowledge. Experiments show that our proposed SegViTv2 surpasses recent\nsegmentation methods on three popular benchmarks including ADE20k,\nCOCO-Stuff-10k and PASCAL-Context datasets. The code is available through the\nfollowing link: \\url{https://github.com/zbwxp/SegVit}.\n","authors":["Bowen Zhang","Liyang Liu","Minh Hieu Phan","Zhi Tian","Chunhua Shen","Yifan Liu"],"pdf_url":"https://arxiv.org/pdf/2306.06289v2.pdf","comment":"IJCV 2023 accepted, 21 pages, 8 figures, 12 tables"},{"id":"http://arxiv.org/abs/2305.14730v2","updated":"2023-08-30T13:00:09Z","published":"2023-05-24T05:06:59Z","title":"BinaryViT: Towards Efficient and Accurate Binary Vision Transformers","summary":" Vision Transformers (ViTs) have emerged as the fundamental architecture for\nmost computer vision fields, but the considerable memory and computation costs\nhinders their application on resource-limited devices. As one of the most\npowerful compression methods, binarization reduces the computation of the\nneural network by quantizing the weights and activation values as $\\pm$1.\nAlthough existing binarization methods have demonstrated excellent performance\non Convolutional Neural Networks (CNNs), the full binarization of ViTs is still\nunder-studied and suffering a significant performance drop. In this paper, we\nfirst argue empirically that the severe performance degradation is mainly\ncaused by the weight oscillation in the binarization training and the\ninformation distortion in the activation of ViTs. Based on these analyses, we\npropose $\\textbf{BinaryViT}$, an accurate full binarization scheme for ViTs,\nwhich pushes the quantization of ViTs to the limit. Specifically, we propose a\nnovel gradient regularization scheme (GRS) for driving a bimodal distribution\nof the weights to reduce oscillation in binarization training. Moreover, we\ndesign an activation shift module (ASM) to adaptively tune the activation\ndistribution to reduce the information distortion caused by binarization.\nExtensive experiments on ImageNet dataset show that our BinaryViT consistently\nsurpasses the strong baseline by 2.05% and improve the accuracy of fully\nbinarized ViTs to a usable level. Furthermore, our method achieves impressive\nsavings of 16.2$\\times$ and 17.7$\\times$ in model size and OPs compared to the\nfull-precision DeiT-S.\n","authors":["Junrui Xiao","Zhikai Li","Lianwei Yang","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2305.14730v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.05593v3","updated":"2023-08-30T12:39:29Z","published":"2022-08-10T23:50:01Z","title":"Evaluating the Quality and Diversity of DCGAN-based Generatively\n Synthesized Diabetic Retinopathy Imagery","summary":" Publicly available diabetic retinopathy (DR) datasets are imbalanced,\ncontaining limited numbers of images with DR. This imbalance contributes to\noverfitting when training machine learning classifiers. The impact of this\nimbalance is exacerbated as the severity of the DR stage increases, affecting\nthe classifiers' diagnostic capacity. The imbalance can be addressed using\nGenerative Adversarial Networks (GANs) to augment the datasets with synthetic\nimages. Generating synthetic images is advantageous if high-quality and\ndiversified images are produced. To evaluate the quality and diversity of\nsynthetic images, several evaluation metrics, such as Multi-Scale Structural\nSimilarity Index (MS-SSIM), Cosine Distance (CD), and Fr\\'echet Inception\nDistance (FID) are used. Understanding the effectiveness of each metric in\nevaluating the quality and diversity of GAN-based synthetic images is critical\nto select images for augmentation. To date, there has been limited analysis of\nthe appropriateness of these metrics in the context of biomedical imagery. This\nwork contributes an empirical assessment of these evaluation metrics as applied\nto synthetic Proliferative DR imagery generated by a Deep Convolutional GAN\n(DCGAN). Furthermore, the metrics' capacity to indicate the quality and\ndiversity of synthetic images and a correlation with classifier performance is\nundertaken. This enables a quantitative selection of synthetic imagery and an\ninformed augmentation strategy. Results indicate that FID is suitable for\nevaluating the quality, while MS-SSIM and CD are suitable for evaluating the\ndiversity of synthetic imagery. Furthermore, the superior performance of\nConvolutional Neural Network (CNN) and EfficientNet classifiers, as indicated\nby the F1 and AUC scores, for the augmented datasets demonstrates the efficacy\nof synthetic imagery to augment the imbalanced dataset.\n","authors":["Cristina-Madalina Dragan","Muhammad Muneeb Saad","Mubashir Husain Rehmani","Ruairi O'Reilly"],"pdf_url":"https://arxiv.org/pdf/2208.05593v3.pdf","comment":"29 Pages, 8 Figures, submitted to MEDAL23: Advances in Deep\n Generative Models for Medical Artificial Intelligence (Springer Nature\n series)"},{"id":"http://arxiv.org/abs/2308.15996v1","updated":"2023-08-30T12:37:03Z","published":"2023-08-30T12:37:03Z","title":"DTrOCR: Decoder-only Transformer for Optical Character Recognition","summary":" Typical text recognition methods rely on an encoder-decoder structure, in\nwhich the encoder extracts features from an image, and the decoder produces\nrecognized text from these features. In this study, we propose a simpler and\nmore effective method for text recognition, known as the Decoder-only\nTransformer for Optical Character Recognition (DTrOCR). This method uses a\ndecoder-only Transformer to take advantage of a generative language model that\nis pre-trained on a large corpus. We examined whether a generative language\nmodel that has been successful in natural language processing can also be\neffective for text recognition in computer vision. Our experiments demonstrated\nthat DTrOCR outperforms current state-of-the-art methods by a large margin in\nthe recognition of printed, handwritten, and scene text in both English and\nChinese.\n","authors":["Masato Fujitake"],"pdf_url":"https://arxiv.org/pdf/2308.15996v1.pdf","comment":"Accepted to WACV2024"},{"id":"http://arxiv.org/abs/2308.15989v1","updated":"2023-08-30T12:19:35Z","published":"2023-08-30T12:19:35Z","title":"DiffuVolume: Diffusion Model for Volume based Stereo Matching","summary":" Stereo matching is a significant part in many computer vision tasks and\ndriving-based applications. Recently cost volume-based methods have achieved\ngreat success benefiting from the rich geometry information in paired images.\nHowever, the redundancy of cost volume also interferes with the model training\nand limits the performance. To construct a more precise cost volume, we\npioneeringly apply the diffusion model to stereo matching. Our method, termed\nDiffuVolume, considers the diffusion model as a cost volume filter, which will\nrecurrently remove the redundant information from the cost volume. Two main\ndesigns make our method not trivial. Firstly, to make the diffusion model more\nadaptive to stereo matching, we eschew the traditional manner of directly\nadding noise into the image but embed the diffusion model into a task-specific\nmodule. In this way, we outperform the traditional diffusion stereo matching\nmethod by 22% EPE improvement and 240 times inference acceleration. Secondly,\nDiffuVolume can be easily embedded into any volume-based stereo matching\nnetwork with boost performance but slight parameters rise (only 2%). By adding\nthe DiffuVolume into well-performed methods, we outperform all the published\nmethods on Scene Flow, KITTI2012, KITTI2015 benchmarks and zero-shot\ngeneralization setting. It is worth mentioning that the proposed model ranks\n1st on KITTI 2012 leader board, 2nd on KITTI 2015 leader board since 15, July\n2023.\n","authors":["Dian Zheng","Xiao-Ming Wu","Zuhao Liu","Jingke Meng","Wei-shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.15989v1.pdf","comment":"17 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.15984v1","updated":"2023-08-30T12:13:13Z","published":"2023-08-30T12:13:13Z","title":"Learning Structure-from-Motion with Graph Attention Networks","summary":" In this paper we tackle the problem of learning Structure-from-Motion (SfM)\nthrough the use of graph attention networks. SfM is a classic computer vision\nproblem that is solved though iterative minimization of reprojection errors,\nreferred to as Bundle Adjustment (BA), starting from a good initialization. In\norder to obtain a good enough initialization to BA, conventional methods rely\non a sequence of sub-problems (such as pairwise pose estimation, pose averaging\nor triangulation) which provides an initial solution that can then be refined\nusing BA. In this work we replace these sub-problems by learning a model that\ntakes as input the 2D keypoints detected across multiple views, and outputs the\ncorresponding camera poses and 3D keypoint coordinates. Our model takes\nadvantage of graph neural networks to learn SfM-specific primitives, and we\nshow that it can be used for fast inference of the reconstruction for new and\nunseen sequences. The experimental results show that the proposed model\noutperforms competing learning-based methods, and challenges COLMAP while\nhaving lower runtime.\n","authors":["Lucas Brynte","José Pedro Iglesias","Carl Olsson","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2308.15984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15975v1","updated":"2023-08-30T11:57:04Z","published":"2023-08-30T11:57:04Z","title":"RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation","summary":" For robots to be useful outside labs and specialized factories we need a way\nto teach them new useful behaviors quickly. Current approaches lack either the\ngenerality to onboard new tasks without task-specific engineering, or else lack\nthe data-efficiency to do so in an amount of time that enables practical use.\nIn this work we explore dense tracking as a representational vehicle to allow\nfaster and more general learning from demonstration. Our approach utilizes\nTrack-Any-Point (TAP) models to isolate the relevant motion in a demonstration,\nand parameterize a low-level controller to reproduce this motion across changes\nin the scene configuration. We show this results in robust robot policies that\ncan solve complex object-arrangement tasks such as shape-matching, stacking,\nand even full path-following tasks such as applying glue and sticking objects\ntogether, all from demonstrations that can be collected in minutes.\n","authors":["Mel Vecerik","Carl Doersch","Yi Yang","Todor Davchev","Yusuf Aytar","Guangyao Zhou","Raia Hadsell","Lourdes Agapito","Jon Scholz"],"pdf_url":"https://arxiv.org/pdf/2308.15975v1.pdf","comment":"Project website: https://robotap.github.io"},{"id":"http://arxiv.org/abs/2308.02562v2","updated":"2023-08-30T11:47:05Z","published":"2023-08-03T04:03:46Z","title":"Food Classification using Joint Representation of Visual and Textual\n Data","summary":" Food classification is an important task in health care. In this work, we\npropose a multimodal classification framework that uses the modified version of\nEfficientNet with the Mish activation function for image classification, and\nthe traditional BERT transformer-based network is used for text classification.\nThe proposed network and the other state-of-the-art methods are evaluated on a\nlarge open-source dataset, UPMC Food-101. The experimental results show that\nthe proposed network outperforms the other methods, a significant difference of\n11.57% and 6.34% in accuracy is observed for image and text classification,\nrespectively, when compared with the second-best performing method. We also\ncompared the performance in terms of accuracy, precision, and recall for text\nclassification using both machine learning and deep learning-based models. The\ncomparative analysis from the prediction results of both images and text\ndemonstrated the efficiency and robustness of the proposed approach.\n","authors":["Prateek Mittal","Puneet Goyal","Joohi Chauhan"],"pdf_url":"https://arxiv.org/pdf/2308.02562v2.pdf","comment":"Updated results and discussions to be posted and some sections needed\n to be expanded"},{"id":"http://arxiv.org/abs/2308.15966v1","updated":"2023-08-30T11:42:54Z","published":"2023-08-30T11:42:54Z","title":"SHARP Challenge 2023: Solving CAD History and pArameters Recovery from\n Point clouds and 3D scans. Overview, Datasets, Metrics, and Baselines","summary":" Recent breakthroughs in geometric Deep Learning (DL) and the availability of\nlarge Computer-Aided Design (CAD) datasets have advanced the research on\nlearning CAD modeling processes and relating them to real objects. In this\ncontext, 3D reverse engineering of CAD models from 3D scans is considered to be\none of the most sought-after goals for the CAD industry. However, recent\nefforts assume multiple simplifications limiting the applications in real-world\nsettings. The SHARP Challenge 2023 aims at pushing the research a step closer\nto the real-world scenario of CAD reverse engineering through dedicated\ndatasets and tracks. In this paper, we define the proposed SHARP 2023 tracks,\ndescribe the provided datasets, and propose a set of baseline methods along\nwith suitable evaluation metrics to assess the performance of the track\nsolutions. All proposed datasets along with useful routines and the evaluation\nmetrics are publicly available.\n","authors":["Dimitrios Mallis","Sk Aziz Ali","Elona Dupont","Kseniya Cherenkova","Ahmet Serdar Karadeniz","Mohammad Sadil Khan","Anis Kacem","Gleb Gusev","Djamila Aouada"],"pdf_url":"https://arxiv.org/pdf/2308.15966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15961v1","updated":"2023-08-30T11:35:21Z","published":"2023-08-30T11:35:21Z","title":"Finding-Aware Anatomical Tokens for Chest X-Ray Automated Reporting","summary":" The task of radiology reporting comprises describing and interpreting the\nmedical findings in radiographic images, including description of their\nlocation and appearance. Automated approaches to radiology reporting require\nthe image to be encoded into a suitable token representation for input to the\nlanguage model. Previous methods commonly use convolutional neural networks to\nencode an image into a series of image-level feature map representations.\nHowever, the generated reports often exhibit realistic style but imperfect\naccuracy. Inspired by recent works for image captioning in the general domain\nin which each visual token corresponds to an object detected in an image, we\ninvestigate whether using local tokens corresponding to anatomical structures\ncan improve the quality of the generated reports. We introduce a novel\nadaptation of Faster R-CNN in which finding detection is performed for the\ncandidate bounding boxes extracted during anatomical structure localisation. We\nuse the resulting bounding box feature representations as our set of\nfinding-aware anatomical tokens. This encourages the extracted anatomical\ntokens to be informative about the findings they contain (required for the\nfinal task of radiology reporting). Evaluating on the MIMIC-CXR dataset of\nchest X-Ray images, we show that task-aware anatomical tokens give\nstate-of-the-art performance when integrated into an automated reporting\npipeline, yielding generated reports with improved clinical accuracy.\n","authors":["Francesco Dalla Serra","Chaoyang Wang","Fani Deligianni","Jeffrey Dalton","Alison Q. O'Neil"],"pdf_url":"https://arxiv.org/pdf/2308.15961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15960v1","updated":"2023-08-30T11:33:07Z","published":"2023-08-30T11:33:07Z","title":"Fusing Pseudo Labels with Weak Supervision for Dynamic Traffic Scenarios","summary":" Advanced Driver Assistance Systems (ADAS) have made significant strides,\ncapitalizing on computer vision to enhance perception and decision-making\ncapabilities. Nonetheless, the adaptation of these systems to diverse traffic\nscenarios poses challenges due to shifts in data distribution stemming from\nfactors such as location, weather, and road infrastructure. To tackle this, we\nintroduce a weakly-supervised label unification pipeline that amalgamates\npseudo labels from a multitude of object detection models trained on\nheterogeneous datasets. Our pipeline engenders a unified label space through\nthe amalgamation of labels from disparate datasets, rectifying bias and\nenhancing generalization. We fine-tune multiple object detection models on\nindividual datasets, subsequently crafting a unified dataset featuring pseudo\nlabels, meticulously validated for precision. Following this, we retrain a\nsolitary object detection model using the merged label space, culminating in a\nresilient model proficient in dynamic traffic scenarios. We put forth a\ncomprehensive evaluation of our approach, employing diverse datasets\noriginating from varied Asian countries, effectively demonstrating its efficacy\nin challenging road conditions. Notably, our method yields substantial\nenhancements in object detection performance, culminating in a model with\nheightened resistance against domain shifts.\n","authors":["Harshith Mohan Kumar","Sean Lawrence"],"pdf_url":"https://arxiv.org/pdf/2308.15960v1.pdf","comment":"This work was accepted as an extended abstract at the International\n Conference on Computer Vision (ICCV) 2023 BRAVO Workshop, Paris, France"},{"id":"http://arxiv.org/abs/2209.15376v3","updated":"2023-08-30T11:04:14Z","published":"2022-09-30T11:09:54Z","title":"NBV-SC: Next Best View Planning based on Shape Completion for Fruit\n Mapping and Reconstruction","summary":" Active perception for fruit mapping and harvesting is a difficult task since\nocclusions occur frequently and the location as well as size of fruits change\nover time. State-of-the-art viewpoint planning approaches utilize\ncomputationally expensive ray casting operations to find good viewpoints aiming\nat maximizing information gain and covering the fruits in the scene. In this\npaper, we present a novel viewpoint planning approach that explicitly uses\ninformation about the predicted fruit shapes to compute targeted viewpoints\nthat observe as yet unobserved parts of the fruits. Furthermore, we formulate\nthe concept of viewpoint dissimilarity to reduce the sampling space for more\nefficient selection of useful, dissimilar viewpoints. Our simulation\nexperiments with a UR5e arm equipped with an RGB-D sensor provide a\nquantitative demonstration of the efficacy of our iterative next best view\nplanning method based on shape completion. In comparative experiments with a\nstate-of-the-art viewpoint planner, we demonstrate improvement not only in the\nestimation of the fruit sizes, but also in their reconstruction, while\nsignificantly reducing the planning time. Finally, we show the viability of our\napproach for mapping sweet peppers plants with a real robotic system in a\ncommercial glasshouse.\n","authors":["Rohit Menon","Tobias Zaenker","Nils Dengler","Maren Bennewitz"],"pdf_url":"https://arxiv.org/pdf/2209.15376v3.pdf","comment":"Agricultural Automation, Viewpoint Planning, Active Perception, Shape\n Completion"},{"id":"http://arxiv.org/abs/2308.15949v1","updated":"2023-08-30T10:57:41Z","published":"2023-08-30T10:57:41Z","title":"Latency-aware Unified Dynamic Networks for Efficient Image Recognition","summary":" Dynamic computation has emerged as a promising avenue to enhance the\ninference efficiency of deep networks. It allows selective activation of\ncomputational units, leading to a reduction in unnecessary computations for\neach input sample. However, the actual efficiency of these dynamic models can\ndeviate from theoretical predictions. This mismatch arises from: 1) the lack of\na unified approach due to fragmented research; 2) the focus on algorithm design\nover critical scheduling strategies, especially in CUDA-enabled GPU contexts;\nand 3) challenges in measuring practical latency, given that most libraries\ncater to static operations. Addressing these issues, we unveil the\nLatency-Aware Unified Dynamic Networks (LAUDNet), a framework that integrates\nthree primary dynamic paradigms-spatially adaptive computation, dynamic layer\nskipping, and dynamic channel skipping. To bridge the theoretical and practical\nefficiency gap, LAUDNet merges algorithmic design with scheduling optimization,\nguided by a latency predictor that accurately gauges dynamic operator latency.\nWe've tested LAUDNet across multiple vision tasks, demonstrating its capacity\nto notably reduce the latency of models like ResNet-101 by over 50% on\nplatforms such as V100, RTX3090, and TX2 GPUs. Notably, LAUDNet stands out in\nbalancing accuracy and efficiency. Code is available at:\nhttps://www.github.com/LeapLabTHU/LAUDNet.\n","authors":["Yizeng Han","Zeyu Liu","Zhihang Yuan","Yifan Pu","Chaofei Wang","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.15949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15942v1","updated":"2023-08-30T10:48:53Z","published":"2023-08-30T10:48:53Z","title":"Stage-by-stage Wavelet Optimization Refinement Diffusion Model for\n Sparse-View CT Reconstruction","summary":" Diffusion models have emerged as potential tools to tackle the challenge of\nsparse-view CT reconstruction, displaying superior performance compared to\nconventional methods. Nevertheless, these prevailing diffusion models\npredominantly focus on the sinogram or image domains, which can lead to\ninstability during model training, potentially culminating in convergence\ntowards local minimal solutions. The wavelet trans-form serves to disentangle\nimage contents and features into distinct frequency-component bands at varying\nscales, adeptly capturing diverse directional structures. Employing the Wavelet\ntransform as a guiding sparsity prior significantly enhances the robustness of\ndiffusion models. In this study, we present an innovative approach named the\nStage-by-stage Wavelet Optimization Refinement Diffusion (SWORD) model for\nsparse-view CT reconstruction. Specifically, we establish a unified\nmathematical model integrating low-frequency and high-frequency generative\nmodels, achieving the solution with optimization procedure. Furthermore, we\nperform the low-frequency and high-frequency generative models on wavelet's\ndecomposed components rather than sinogram or image domains, ensuring the\nstability of model training. Our method rooted in established optimization\ntheory, comprising three distinct stages, including low-frequency generation,\nhigh-frequency refinement and domain transform. Our experimental results\ndemonstrate that the proposed method outperforms existing state-of-the-art\nmethods both quantitatively and qualitatively.\n","authors":["Kai Xu","Shiyu Lu","Bin Huang","Weiwen Wu","Qiegen Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16098v5","updated":"2023-08-30T10:48:12Z","published":"2022-11-29T11:17:34Z","title":"Three-stage binarization of color document images based on discrete\n wavelet transform and generative adversarial networks","summary":" The efficient segmentation of foreground text information from the background\nin degraded color document images is a critical challenge in the preservation\nof ancient manuscripts. The imperfect preservation of ancient manuscripts over\ntime has led to various types of degradation, such as staining, yellowing, and\nink seepage, significantly affecting image binarization results. This work\nproposes a three-stage method using Generative Adversarial Networks (GAN) for\nenhancing and binarizing degraded color document images through Discrete\nWavelet Transform (DWT). Stage-1 involves applying DWT and retaining the\nLow-Low (LL) subband images for image enhancement. In Stage-2, the original\ninput image is divided into four single-channel images (Red, Green, Blue, and\nGray), and each is trained with independent adversarial networks to extract\ncolor foreground information. In Stage-3, the output image from Stage-2 and the\noriginal input image are used to train independent adversarial networks for\ndocument binarization, enabling the integration of global and local features.\nThe experimental results demonstrate that our proposed method outperforms other\nclassic and state-of-the-art (SOTA) methods on the Document Image Binarization\nContest (DIBCO) datasets. We have released our implementation code at\nhttps://github.com/abcpp12383/ThreeStageBinarization.\n","authors":["Rui-Yang Ju","Yu-Shian Lin","Chih-Chia Chen","Chun-Tse Chien","Jen-Shiun Chiang"],"pdf_url":"https://arxiv.org/pdf/2211.16098v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15428v2","updated":"2023-08-30T10:38:41Z","published":"2023-07-28T09:26:00Z","title":"Implicit neural representation for change detection","summary":" Identifying changes in a pair of 3D aerial LiDAR point clouds, obtained\nduring two distinct time periods over the same geographic region presents a\nsignificant challenge due to the disparities in spatial coverage and the\npresence of noise in the acquisition system. The most commonly used approaches\nto detecting changes in point clouds are based on supervised methods which\nnecessitate extensive labelled data often unavailable in real-world\napplications. To address these issues, we propose an unsupervised approach that\ncomprises two components: Implicit Neural Representation (INR) for continuous\nshape reconstruction and a Gaussian Mixture Model for categorising changes. INR\noffers a grid-agnostic representation for encoding bi-temporal point clouds,\nwith unmatched spatial support that can be regularised to enhance\nhigh-frequency details and reduce noise. The reconstructions at each timestamp\nare compared at arbitrary spatial scales, leading to a significant increase in\ndetection capabilities. We apply our method to a benchmark dataset comprising\nsimulated LiDAR point clouds for urban sprawling. This dataset encompasses\ndiverse challenging scenarios, varying in resolutions, input modalities and\nnoise levels. This enables a comprehensive multi-scenario evaluation, comparing\nour method with the current state-of-the-art approach. We outperform the\nprevious methods by a margin of 10% in the intersection over union metric. In\naddition, we put our techniques to practical use by applying them in a\nreal-world scenario to identify instances of illicit excavation of\narchaeological sites and validate our results by comparing them with findings\nfrom field experts.\n","authors":["Peter Naylor","Diego Di Carlo","Arianna Traviglia","Makoto Yamada","Marco Fiorucci"],"pdf_url":"https://arxiv.org/pdf/2307.15428v2.pdf","comment":"Main article is 10 pages + 6 pages of supplementary. Conference style\n paper"},{"id":"http://arxiv.org/abs/2308.15939v1","updated":"2023-08-30T10:35:36Z","published":"2023-08-30T10:35:36Z","title":"AnoVL: Adapting Vision-Language Models for Unified Zero-shot Anomaly\n Localization","summary":" Contrastive Language-Image Pre-training (CLIP) models have shown promising\nperformance on zero-shot visual recognition tasks by learning visual\nrepresentations under natural language supervision. Recent studies attempt the\nuse of CLIP to tackle zero-shot anomaly detection by matching images with\nnormal and abnormal state prompts. However, since CLIP focuses on building\ncorrespondence between paired text prompts and global image-level\nrepresentations, the lack of patch-level vision to text alignment limits its\ncapability on precise visual anomaly localization. In this work, we introduce a\ntraining-free adaptation (TFA) framework of CLIP for zero-shot anomaly\nlocalization. In the visual encoder, we innovate a training-free value-wise\nattention mechanism to extract intrinsic local tokens of CLIP for patch-level\nlocal description. From the perspective of text supervision, we particularly\ndesign a unified domain-aware contrastive state prompting template. On top of\nthe proposed TFA, we further introduce a test-time adaptation (TTA) mechanism\nto refine anomaly localization results, where a layer of trainable parameters\nin the adapter is optimized using TFA's pseudo-labels and synthetic\nnoise-corrupted tokens. With both TFA and TTA adaptation, we significantly\nexploit the potential of CLIP for zero-shot anomaly localization and\ndemonstrate the effectiveness of our proposed methods on various datasets.\n","authors":["Hanqiu Deng","Zhaoxiang Zhang","Jinan Bao","Xingyu Li"],"pdf_url":"https://arxiv.org/pdf/2308.15939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15932v1","updated":"2023-08-30T10:21:57Z","published":"2023-08-30T10:21:57Z","title":"Attention-based CT Scan Interpolation for Lesion Segmentation of\n Colorectal Liver Metastases","summary":" Small liver lesions common to colorectal liver metastases (CRLMs) are\nchallenging for convolutional neural network (CNN) segmentation models,\nespecially when we have a wide range of slice thicknesses in the computed\ntomography (CT) scans. Slice thickness of CT images may vary by clinical\nindication. For example, thinner slices are used for presurgical planning when\nfine anatomic details of small vessels are required. While keeping the\neffective radiation dose in patients as low as possible, various slice\nthicknesses are employed in CRLMs due to their limitations. However,\ndifferences in slice thickness across CTs lead to significant performance\ndegradation in CT segmentation models based on CNNs. This paper proposes a\nnovel unsupervised attention-based interpolation model to generate intermediate\nslices from consecutive triplet slices in CT scans. We integrate segmentation\nloss during the interpolation model's training to leverage segmentation labels\nin existing slices to generate middle ones. Unlike common interpolation\ntechniques in CT volumes, our model highlights the regions of interest (liver\nand lesions) inside the abdominal CT scans in the interpolated slice. Moreover,\nour model's outputs are consistent with the original input slices while\nincreasing the segmentation performance in two cutting-edge 3D segmentation\npipelines. We tested the proposed model on the CRLM dataset to upsample\nsubjects with thick slices and create isotropic volume for our segmentation\nmodel. The produced isotropic dataset increases the Dice score in the\nsegmentation of lesions and outperforms other interpolation approaches in terms\nof interpolation metrics.\n","authors":["Mohammad Hamghalam","Richard K. G. Do","Amber L. Simpson"],"pdf_url":"https://arxiv.org/pdf/2308.15932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09829v2","updated":"2023-08-30T10:19:02Z","published":"2023-07-19T08:34:25Z","title":"What do neural networks learn in image classification? A frequency\n shortcut perspective","summary":" Frequency analysis is useful for understanding the mechanisms of\nrepresentation learning in neural networks (NNs). Most research in this area\nfocuses on the learning dynamics of NNs for regression tasks, while little for\nclassification. This study empirically investigates the latter and expands the\nunderstanding of frequency shortcuts. First, we perform experiments on\nsynthetic datasets, designed to have a bias in different frequency bands. Our\nresults demonstrate that NNs tend to find simple solutions for classification,\nand what they learn first during training depends on the most distinctive\nfrequency characteristics, which can be either low- or high-frequencies.\nSecond, we confirm this phenomenon on natural images. We propose a metric to\nmeasure class-wise frequency characteristics and a method to identify frequency\nshortcuts. The results show that frequency shortcuts can be texture-based or\nshape-based, depending on what best simplifies the objective. Third, we\nvalidate the transferability of frequency shortcuts on out-of-distribution\n(OOD) test sets. Our results suggest that frequency shortcuts can be\ntransferred across datasets and cannot be fully avoided by larger model\ncapacity and data augmentation. We recommend that future research should focus\non effective training schemes mitigating frequency shortcut learning.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2307.09829v2.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.15918v1","updated":"2023-08-30T09:45:14Z","published":"2023-08-30T09:45:14Z","title":"Physics-Informed DeepMRI: Bridging the Gap from Heat Diffusion to\n k-Space Interpolation","summary":" In the field of parallel imaging (PI), alongside image-domain regularization\nmethods, substantial research has been dedicated to exploring $k$-space\ninterpolation. However, the interpretability of these methods remains an\nunresolved issue. Furthermore, these approaches currently face acceleration\nlimitations that are comparable to those experienced by image-domain methods.\nIn order to enhance interpretability and overcome the acceleration limitations,\nthis paper introduces an interpretable framework that unifies both $k$-space\ninterpolation techniques and image-domain methods, grounded in the physical\nprinciples of heat diffusion equations. Building upon this foundational\nframework, a novel $k$-space interpolation method is proposed. Specifically, we\nmodel the process of high-frequency information attenuation in $k$-space as a\nheat diffusion equation, while the effort to reconstruct high-frequency\ninformation from low-frequency regions can be conceptualized as a reverse heat\nequation. However, solving the reverse heat equation poses a challenging\ninverse problem. To tackle this challenge, we modify the heat equation to align\nwith the principles of magnetic resonance PI physics and employ the score-based\ngenerative method to precisely execute the modified reverse heat diffusion.\nFinally, experimental validation conducted on publicly available datasets\ndemonstrates the superiority of the proposed approach over traditional\n$k$-space interpolation methods, deep learning-based $k$-space interpolation\nmethods, and conventional diffusion models in terms of reconstruction accuracy,\nparticularly in high-frequency regions.\n","authors":["Zhuo-Xu Cui","Congcong Liu","Xiaohong Fan","Chentao Cao","Jing Cheng","Qingyong Zhu","Yuanyuan Liu","Sen Jia","Yihang Zhou","Haifeng Wang","Yanjie Zhu","Jianping Zhang","Qiegen Liu","Dong Liang"],"pdf_url":"https://arxiv.org/pdf/2308.15918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14027v2","updated":"2023-08-30T09:26:11Z","published":"2023-03-24T14:37:07Z","title":"Poincaré ResNet","summary":" This paper introduces an end-to-end residual network that operates entirely\non the Poincar\\'e ball model of hyperbolic space. Hyperbolic learning has\nrecently shown great potential for visual understanding, but is currently only\nperformed in the penultimate layer(s) of deep networks. All visual\nrepresentations are still learned through standard Euclidean networks. In this\npaper we investigate how to learn hyperbolic representations of visual data\ndirectly from the pixel-level. We propose Poincar\\'e ResNet, a hyperbolic\ncounterpart of the celebrated residual network, starting from Poincar\\'e 2D\nconvolutions up to Poincar\\'e residual connections. We identify three\nroadblocks for training convolutional networks entirely in hyperbolic space and\npropose a solution for each: (i) Current hyperbolic network initializations\ncollapse to the origin, limiting their applicability in deeper networks. We\nprovide an identity-based initialization that preserves norms over many layers.\n(ii) Residual networks rely heavily on batch normalization, which comes with\nexpensive Fr\\'echet mean calculations in hyperbolic space. We introduce\nPoincar\\'e midpoint batch normalization as a faster and equally effective\nalternative. (iii) Due to the many intermediate operations in Poincar\\'e\nlayers, we lastly find that the computation graphs of deep learning libraries\nblow up, limiting our ability to train on deep hyperbolic networks. We provide\nmanual backward derivations of core hyperbolic operations to maintain\nmanageable computation graphs.\n","authors":["Max van Spengler","Erwin Berkhout","Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2303.14027v2.pdf","comment":"International Conference on Computer Vision 2023"},{"id":"http://arxiv.org/abs/2308.15887v1","updated":"2023-08-30T09:04:24Z","published":"2023-08-30T09:04:24Z","title":"On the Potential of CLIP for Compositional Logical Reasoning","summary":" In this paper we explore the possibility of using OpenAI's CLIP to perform\nlogically coherent grounded visual reasoning. To that end, we formalize our\nterms and give a geometric analysis of how embeddings in CLIP's latent space\nwould need to be configured in order for the system to be logically coherent.\nOur main conclusion is that, as usually configured, CLIP cannot perform such\nreasoning.\n","authors":["Justin Brody"],"pdf_url":"https://arxiv.org/pdf/2308.15887v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15881v1","updated":"2023-08-30T09:03:28Z","published":"2023-08-30T09:03:28Z","title":"Interpretability-guided Data Augmentation for Robust Segmentation in\n Multi-centre Colonoscopy Data","summary":" Multi-centre colonoscopy images from various medical centres exhibit distinct\ncomplicating factors and overlays that impact the image content, contingent on\nthe specific acquisition centre. Existing Deep Segmentation networks struggle\nto achieve adequate generalizability in such data sets, and the currently\navailable data augmentation methods do not effectively address these sources of\ndata variability. As a solution, we introduce an innovative data augmentation\napproach centred on interpretability saliency maps, aimed at enhancing the\ngeneralizability of Deep Learning models within the realm of multi-centre\ncolonoscopy image segmentation. The proposed augmentation technique\ndemonstrates increased robustness across different segmentation models and\ndomains. Thorough testing on a publicly available multi-centre dataset for\npolyp detection demonstrates the effectiveness and versatility of our approach,\nwhich is observed both in quantitative and qualitative results. The code is\npublicly available at:\nhttps://github.com/nki-radiology/interpretability_augmentation\n","authors":["Valentina Corbetta","Regina Beets-Tan","Wilson Silva"],"pdf_url":"https://arxiv.org/pdf/2308.15881v1.pdf","comment":"10 pages, 4 figures, 1 table, accepted at MICCAI 2023 Workshop on\n Machine Learning in Medical Imaging (MLMI)"},{"id":"http://arxiv.org/abs/2308.15868v1","updated":"2023-08-30T08:56:36Z","published":"2023-08-30T08:56:36Z","title":"Feature Attention Network (FA-Net): A Deep-Learning Based Approach for\n Underwater Single Image Enhancement","summary":" Underwater image processing and analysis have been a hotspot of study in\nrecent years, as more emphasis has been focused to underwater monitoring and\nusage of marine resources. Compared with the open environment, underwater image\nencountered with more complicated conditions such as light abortion,\nscattering, turbulence, nonuniform illumination and color diffusion. Although\nconsiderable advances and enhancement techniques achieved in resolving these\nissues, they treat low-frequency information equally across the entire channel,\nwhich results in limiting the network's representativeness. We propose a deep\nlearning and feature-attention-based end-to-end network (FA-Net) to solve this\nproblem. In particular, we propose a Residual Feature Attention Block (RFAB),\ncontaining the channel attention, pixel attention, and residual learning\nmechanism with long and short skip connections. RFAB allows the network to\nfocus on learning high-frequency information while skipping low-frequency\ninformation on multi-hop connections. The channel and pixel attention mechanism\nconsiders each channel's different features and the uneven distribution of haze\nover different pixels in the image. The experimental results shows that the\nFA-Net propose by us provides higher accuracy, quantitatively and qualitatively\nand superiority to previous state-of-the-art methods.\n","authors":["Muhammad Hamza","Ammar Hawbani","Sami Ul Rehman","Xingfu Wang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.15868v1.pdf","comment":"Fourteenth International Conference on Digital Image Processing\n (ICDIP 2022), 2022, Wuhan, China, May 20-23, 2022.8 pages.5 Figures.doi:\n 10.1117/12.2644516"},{"id":"http://arxiv.org/abs/2308.15855v1","updated":"2023-08-30T08:44:21Z","published":"2023-08-30T08:44:21Z","title":"Semi-supervised Domain Adaptation with Inter and Intra-domain Mixing for\n Semantic Segmentation","summary":" Despite recent advances in semantic segmentation, an inevitable challenge is\nthe performance degradation caused by the domain shift in real application.\nCurrent dominant approach to solve this problem is unsupervised domain\nadaptation (UDA). However, the absence of labeled target data in UDA is overly\nrestrictive and limits performance. To overcome this limitation, a more\npractical scenario called semi-supervised domain adaptation (SSDA) has been\nproposed. Existing SSDA methods are derived from the UDA paradigm and primarily\nfocus on leveraging the unlabeled target data and source data. In this paper,\nwe highlight the significance of exploiting the intra-domain information\nbetween the limited labeled target data and unlabeled target data, as it\ngreatly benefits domain adaptation. Instead of solely using the scarce labeled\ndata for supervision, we propose a novel SSDA framework that incorporates both\ninter-domain mixing and intra-domain mixing, where inter-domain mixing\nmitigates the source-target domain gap and intra-domain mixing enriches the\navailable target domain information. By simultaneously learning from\ninter-domain mixing and intra-domain mixing, the network can capture more\ndomain-invariant features and promote its performance on the target domain. We\nalso explore different domain mixing operations to better exploit the target\ndomain information. Comprehensive experiments conducted on the GTA5toCityscapes\nand SYNTHIA2Cityscapes benchmarks demonstrate the effectiveness of our method,\nsurpassing previous methods by a large margin.\n","authors":["Weifu Fu","Qiang Nie","Jialin Li","Yuhuan Lin","Kai Wu","Yong Liu","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15855v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2211.12436v2","updated":"2023-08-30T08:40:16Z","published":"2022-11-22T17:45:06Z","title":"Dynamic Depth-Supervised NeRF for Multi-View RGB-D Operating Room Images","summary":" The operating room (OR) is an environment of interest for the development of\nsensing systems, enabling the detection of people, objects, and their semantic\nrelations. Due to frequent occlusions in the OR, these systems often rely on\ninput from multiple cameras. While increasing the number of cameras generally\nincreases algorithm performance, there are hard limitations to the number and\nlocations of cameras in the OR. Neural Radiance Fields (NeRF) can be used to\nrender synthetic views from arbitrary camera positions, virtually enlarging the\nnumber of cameras in the dataset. In this work, we explore the use of NeRF for\nview synthesis of dynamic scenes in the OR, and we show that regularisation\nwith depth supervision from RGB-D sensor data results in higher image quality.\nWe optimise a dynamic depth-supervised NeRF with up to six synchronised cameras\nthat capture the surgical field in five distinct phases before and during a\nknee replacement surgery. We qualitatively inspect views rendered by a virtual\ncamera that moves 180 degrees around the surgical field at differing time\nvalues. Quantitatively, we evaluate view synthesis from an unseen camera\nposition in terms of PSNR, SSIM and LPIPS for the colour channels and in MAE\nand error percentage for the estimated depth. We find that NeRFs can be used to\ngenerate geometrically consistent views, also from interpolated camera\npositions and at interpolated time intervals. Views are generated from an\nunseen camera pose with an average PSNR of 18.2 and a depth estimation error of\n2.0%. Our results show the potential of a dynamic NeRF for view synthesis in\nthe OR and stress the relevance of depth supervision in a clinical setting.\n","authors":["Beerend G. A. Gerats","Jelmer M. Wolterink","Ivo A. M. J. Broeders"],"pdf_url":"https://arxiv.org/pdf/2211.12436v2.pdf","comment":"Accepted to the Workshop on Ambient Intelligence for HealthCare 2023"},{"id":"http://arxiv.org/abs/2308.15854v1","updated":"2023-08-30T08:40:15Z","published":"2023-08-30T08:40:15Z","title":"Zero-shot Inversion Process for Image Attribute Editing with Diffusion\n Models","summary":" Denoising diffusion models have shown outstanding performance in image\nediting. Existing works tend to use either image-guided methods, which provide\na visual reference but lack control over semantic coherence, or text-guided\nmethods, which ensure faithfulness to text guidance but lack visual quality. To\naddress the problem, we propose the Zero-shot Inversion Process (ZIP), a\nframework that injects a fusion of generated visual reference and text guidance\ninto the semantic latent space of a \\textit{frozen} pre-trained diffusion\nmodel. Only using a tiny neural network, the proposed ZIP produces diverse\ncontent and attributes under the intuitive control of the text prompt.\nMoreover, ZIP shows remarkable robustness for both in-domain and out-of-domain\nattribute manipulation on real images. We perform detailed experiments on\nvarious benchmark datasets. Compared to state-of-the-art methods, ZIP produces\nimages of equivalent quality while providing a realistic editing effect.\n","authors":["Zhanbo Feng","Zenan Ling","Ci Gong","Feng Zhou","Jie Li","Robert C. Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.15854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14847v2","updated":"2023-08-30T08:34:08Z","published":"2023-08-28T19:08:17Z","title":"NSF: Neural Surface Fields for Human Modeling from Monocular Depth","summary":" Obtaining personalized 3D animatable avatars from a monocular camera has\nseveral real world applications in gaming, virtual try-on, animation, and\nVR/XR, etc. However, it is very challenging to model dynamic and fine-grained\nclothing deformations from such sparse data. Existing methods for modeling 3D\nhumans from depth data have limitations in terms of computational efficiency,\nmesh coherency, and flexibility in resolution and topology. For instance,\nreconstructing shapes using implicit functions and extracting explicit meshes\nper frame is computationally expensive and cannot ensure coherent meshes across\nframes. Moreover, predicting per-vertex deformations on a pre-designed human\ntemplate with a discrete surface lacks flexibility in resolution and topology.\nTo overcome these limitations, we propose a novel method `\\keyfeature: Neural\nSurface Fields' for modeling 3D clothed humans from monocular depth. NSF\ndefines a neural field solely on the base surface which models a continuous and\nflexible displacement field. NSF can be adapted to the base surface with\ndifferent resolution and topology without retraining at inference time.\nCompared to existing approaches, our method eliminates the expensive per-frame\nsurface extraction while maintaining mesh coherency, and is capable of\nreconstructing meshes with arbitrary resolution without retraining. To foster\nresearch in this direction, we release our code in project page at:\nhttps://yuxuan-xue.com/nsf.\n","authors":["Yuxuan Xue","Bharat Lal Bhatnagar","Riccardo Marin","Nikolaos Sarafianos","Yuanlu Xu","Gerard Pons-Moll","Tony Tung"],"pdf_url":"https://arxiv.org/pdf/2308.14847v2.pdf","comment":"Accpted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf"},{"id":"http://arxiv.org/abs/2308.15846v1","updated":"2023-08-30T08:33:13Z","published":"2023-08-30T08:33:13Z","title":"Exploring Multi-Modal Contextual Knowledge for Open-Vocabulary Object\n Detection","summary":" In this paper, we for the first time explore helpful multi-modal contextual\nknowledge to understand novel categories for open-vocabulary object detection\n(OVD). The multi-modal contextual knowledge stands for the joint relationship\nacross regions and words. However, it is challenging to incorporate such\nmulti-modal contextual knowledge into OVD. The reason is that previous\ndetection frameworks fail to jointly model multi-modal contextual knowledge, as\nobject detectors only support vision inputs and no caption description is\nprovided at test time. To this end, we propose a multi-modal contextual\nknowledge distillation framework, MMC-Det, to transfer the learned contextual\nknowledge from a teacher fusion transformer with diverse multi-modal masked\nlanguage modeling (D-MLM) to a student detector. The diverse multi-modal masked\nlanguage modeling is realized by an object divergence constraint upon\ntraditional multi-modal masked language modeling (MLM), in order to extract\nfine-grained region-level visual contexts, which are vital to object detection.\nExtensive experiments performed upon various detection datasets show the\neffectiveness of our multi-modal context learning strategy, where our approach\nwell outperforms the recent state-of-the-art methods.\n","authors":["Yifan Xu","Mengdan Zhang","Xiaoshan Yang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2308.15846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15844v1","updated":"2023-08-30T08:31:55Z","published":"2023-08-30T08:31:55Z","title":"Reconstructing Groups of People with Hypergraph Relational Reasoning","summary":" Due to the mutual occlusion, severe scale variation, and complex spatial\ndistribution, the current multi-person mesh recovery methods cannot produce\naccurate absolute body poses and shapes in large-scale crowded scenes. To\naddress the obstacles, we fully exploit crowd features for reconstructing\ngroups of people from a monocular image. A novel hypergraph relational\nreasoning network is proposed to formulate the complex and high-order relation\ncorrelations among individuals and groups in the crowd. We first extract\ncompact human features and location information from the original\nhigh-resolution image. By conducting the relational reasoning on the extracted\nindividual features, the underlying crowd collectiveness and interaction\nrelationship can provide additional group information for the reconstruction.\nFinally, the updated individual features and the localization information are\nused to regress human meshes in camera coordinates. To facilitate the network\ntraining, we further build pseudo ground-truth on two crowd datasets, which may\nalso promote future research on pose estimation and human behavior\nunderstanding in crowded scenes. The experimental results show that our\napproach outperforms other baseline methods both in crowded and common\nscenarios. The code and datasets are publicly available at\nhttps://github.com/boycehbz/GroupRec.\n","authors":["Buzhen Huang","Jingyi Ju","Zhihao Li","Yangang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15844v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.15839v1","updated":"2023-08-30T08:21:52Z","published":"2023-08-30T08:21:52Z","title":"Utilizing Task-Generic Motion Prior to Recover Full-Body Motion from\n Very Sparse Signals","summary":" The most popular type of devices used to track a user's posture in a virtual\nreality experience consists of a head-mounted display and two controllers held\nin both hands. However, due to the limited number of tracking sensors (three in\ntotal), faithfully recovering the user in full-body is challenging, limiting\nthe potential for interactions among simulated user avatars within the virtual\nworld. Therefore, recent studies have attempted to reconstruct full-body poses\nusing neural networks that utilize previously learned human poses or accept a\nseries of past poses over a short period. In this paper, we propose a method\nthat utilizes information from a neural motion prior to improve the accuracy of\nreconstructed user's motions. Our approach aims to reconstruct user's full-body\nposes by predicting the latent representation of the user's overall motion from\nlimited input signals and integrating this information with tracking sensor\ninputs. This is based on the premise that the ultimate goal of pose\nreconstruction is to reconstruct the motion, which is a series of poses. Our\nresults show that this integration enables more accurate reconstruction of the\nuser's full-body motion, particularly enhancing the robustness of lower body\nmotion reconstruction from impoverished signals. Web:\nhttps://https://mjsh34.github.io/mp-sspe/\n","authors":["Myungjin Shin","Dohae Lee","In-Kwon Lee"],"pdf_url":"https://arxiv.org/pdf/2308.15839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06262v4","updated":"2023-08-30T08:21:13Z","published":"2023-01-16T05:08:50Z","title":"Collaborative Perception in Autonomous Driving: Methods, Datasets and\n Challenges","summary":" Collaborative perception is essential to address occlusion and sensor failure\nissues in autonomous driving. In recent years, theoretical and experimental\ninvestigations of novel works for collaborative perception have increased\ntremendously. So far, however, few reviews have focused on systematical\ncollaboration modules and large-scale collaborative perception datasets. This\nwork reviews recent achievements in this field to bridge this gap and motivate\nfuture research. We start with a brief overview of collaboration schemes. After\nthat, we systematically summarize the collaborative perception methods for\nideal scenarios and real-world issues. The former focuses on collaboration\nmodules and efficiency, and the latter is devoted to addressing the problems in\nactual application. Furthermore, we present large-scale public datasets and\nsummarize quantitative results on these benchmarks. Finally, we highlight gaps\nand overlook challenges between current academic research and real-world\napplications. The project page is\nhttps://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving\n","authors":["Yushan Han","Hui Zhang","Huifang Li","Yi Jin","Congyan Lang","Yidong Li"],"pdf_url":"https://arxiv.org/pdf/2301.06262v4.pdf","comment":"18 pages, 6 figures. Accepted by IEEE Intelligent Transportation\n Systems Magazine. URL:\n https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving"},{"id":"http://arxiv.org/abs/2308.15321v2","updated":"2023-08-30T08:20:30Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then attribute the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v2.pdf","comment":"7 pages, code available soon"},{"id":"http://arxiv.org/abs/2204.09398v2","updated":"2023-08-30T08:18:15Z","published":"2022-04-20T11:43:58Z","title":"Case-Aware Adversarial Training","summary":" The neural network (NN) becomes one of the most heated type of models in\nvarious signal processing applications. However, NNs are extremely vulnerable\nto adversarial examples (AEs). To defend AEs, adversarial training (AT) is\nbelieved to be the most effective method while due to the intensive\ncomputation, AT is limited to be applied in most applications. In this paper,\nto resolve the problem, we design a generic and efficient AT improvement\nscheme, namely case-aware adversarial training (CAT). Specifically, the\nintuition stems from the fact that a very limited part of informative samples\ncan contribute to most of model performance. Alternatively, if only the most\ninformative AEs are used in AT, we can lower the computation complexity of AT\nsignificantly as maintaining the defense effect. To achieve this, CAT achieves\ntwo breakthroughs. First, a method to estimate the information degree of\nadversarial examples is proposed for AE filtering. Second, to further enrich\nthe information that the NN can obtain from AEs, CAT involves a weight\nestimation and class-level balancing based sampling strategy to increase the\ndiversity of AT at each iteration. Extensive experiments show that CAT is\nfaster than vanilla AT by up to 3x while achieving competitive defense effect.\n","authors":["Mingyuan Fan","Yang Liu","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2204.09398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03512v3","updated":"2023-08-30T08:10:20Z","published":"2023-07-07T11:00:44Z","title":"Tranfer Learning of Semantic Segmentation Methods for Identifying Buried\n Archaeological Structures on LiDAR Data","summary":" When applying deep learning to remote sensing data in archaeological\nresearch, a notable obstacle is the limited availability of suitable datasets\nfor training models. The application of transfer learning is frequently\nemployed to mitigate this drawback. However, there is still a need to explore\nits effectiveness when applied across different archaeological datasets. This\npaper compares the performance of various transfer learning configurations\nusing two semantic segmentation deep neural networks on two LiDAR datasets. The\nexperimental results indicate that transfer learning-based approaches in\narchaeology can lead to performance improvements, although a systematic\nenhancement has not yet been observed. We provide specific insights about the\nvalidity of such techniques that can serve as a baseline for future works.\n","authors":["Gregory Sech","Paolo Soleni","Wouter B. Verschoof-van der Vaart","Žiga Kokalj","Arianna Traviglia","Marco Fiorucci"],"pdf_url":"https://arxiv.org/pdf/2307.03512v3.pdf","comment":"Accepted to IEEE International Geoscience and Remote Sensing\n Symposium 2023 (IGARSS 2023) @IEEE copyright"},{"id":"http://arxiv.org/abs/2308.15829v1","updated":"2023-08-30T08:09:40Z","published":"2023-08-30T08:09:40Z","title":"Early Detection of Red Palm Weevil Infestations using Deep Learning\n Classification of Acoustic Signals","summary":" The Red Palm Weevil (RPW), also known as the palm weevil, is considered among\nthe world's most damaging insect pests of palms. Current detection techniques\ninclude the detection of symptoms of RPW using visual or sound inspection and\nchemical detection of volatile signatures generated by infested palm trees.\nHowever, efficient detection of RPW diseases at an early stage is considered\none of the most challenging issues for cultivating date palms. In this paper,\nan efficient approach to the early detection of RPW is proposed. The proposed\napproach is based on RPW sound activities being recorded and analyzed. The\nfirst step involves the conversion of sound data into images based on a\nselected set of features. The second step involves the combination of images\nfrom the same sound file but computed by different features into a single\nimage. The third step involves the application of different Deep Learning (DL)\ntechniques to classify resulting images into two classes: infested and not\ninfested. Experimental results show good performances of the proposed approach\nfor RPW detection using different DL techniques, namely MobileNetV2,\nResNet50V2, ResNet152V2, VGG16, VGG19, DenseNet121, DenseNet201, Xception, and\nInceptionV3. The proposed approach outperformed existing techniques for public\ndatasets.\n","authors":["Wadii Boulila","Ayyub Alzahem","Anis Koubaa","Bilel Benjdira","Adel Ammar"],"pdf_url":"https://arxiv.org/pdf/2308.15829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15827v1","updated":"2023-08-30T08:03:49Z","published":"2023-08-30T08:03:49Z","title":"Introducing Language Guidance in Prompt-based Continual Learning","summary":" Continual Learning aims to learn a single model on a sequence of tasks\nwithout having access to data from previous tasks. The biggest challenge in the\ndomain still remains catastrophic forgetting: a loss in performance on seen\nclasses of earlier tasks. Some existing methods rely on an expensive replay\nbuffer to store a chunk of data from previous tasks. This, while promising,\nbecomes expensive when the number of tasks becomes large or data can not be\nstored for privacy reasons. As an alternative, prompt-based methods have been\nproposed that store the task information in a learnable prompt pool. This\nprompt pool instructs a frozen image encoder on how to solve each task. While\nthe model faces a disjoint set of classes in each task in this setting, we\nargue that these classes can be encoded to the same embedding space of a\npre-trained language encoder. In this work, we propose Language Guidance for\nPrompt-based Continual Learning (LGCL) as a plug-in for prompt-based methods.\nLGCL is model agnostic and introduces language guidance at the task level in\nthe prompt pool and at the class level on the output feature of the vision\nencoder. We show with extensive experimentation that LGCL consistently improves\nthe performance of prompt-based continual learning methods to set a new\nstate-of-the art. LGCL achieves these performance improvements without needing\nany additional learnable parameters.\n","authors":["Muhammad Gul Zain Ali Khan","Muhammad Ferjad Naeem","Luc Van Gool","Didier Stricker","Federico Tombari","Muhammad Zeshan Afzal"],"pdf_url":"https://arxiv.org/pdf/2308.15827v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2209.14624v2","updated":"2023-08-30T08:01:22Z","published":"2022-09-29T08:38:30Z","title":"Is Complexity Required for Neural Network Pruning? A Case Study on\n Global Magnitude Pruning","summary":" Pruning neural networks has become popular in the last decade when it was\nshown that a large number of weights can be safely removed from modern neural\nnetworks without compromising accuracy. Numerous pruning methods have been\nproposed since then, each claiming to be better than the previous. Many\nstate-of-the-art (SOTA) techniques today rely on complex pruning methodologies\nutilizing importance scores, getting feedback through back-propagation or\nhaving heuristics-based pruning rules amongst others. In this work, we question\nwhether this pattern of introducing complexity is really necessary to achieve\nbetter pruning results. We benchmark these SOTA techniques against a naive\npruning baseline, namely, Global Magnitude Pruning (Global MP). Global MP ranks\nweights in order of their magnitudes and prunes the smallest ones. Hence, in\nits vanilla form, it is one of the simplest pruning techniques. Surprisingly,\nwe find that vanilla Global MP outperforms all the other SOTA techniques and\nachieves a new SOTA result. It also achieves promising performance on FLOPs\nsparsification, which we find is enhanced, when pruning is conducted in a\ngradual fashion. We also find that Global MP is generalizable across tasks,\ndatasets, and models with superior performance. Moreover, a common issue that\nmany pruning algorithms run into at high sparsity rates, namely,\nlayer-collapse, can be easily fixed in Global MP by setting a minimum threshold\nof weights to be retained in each layer. Lastly, unlike many other SOTA\ntechniques, Global MP does not require any additional algorithm specific\nhyper-parameters and is very straightforward to tune and implement. We showcase\nour findings on various models (WRN-28-8, ResNet-32, ResNet-50, MobileNet-V1\nand FastGRNN) and multiple datasets (CIFAR-10, ImageNet and HAR-2). Code is\navailable at https://github.com/manasgupta-1/GlobalMP.\n","authors":["Manas Gupta","Efe Camci","Vishandi Rudy Keneta","Abhishek Vaidyanathan","Ritwik Kanodia","Chuan-Sheng Foo","Wu Min","Lin Jie"],"pdf_url":"https://arxiv.org/pdf/2209.14624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15822v1","updated":"2023-08-30T07:48:32Z","published":"2023-08-30T07:48:32Z","title":"AMDNet23: A combined deep Contour-based Convolutional Neural Network and\n Long Short Term Memory system to diagnose Age-related Macular Degeneration","summary":" In light of the expanding population, an automated framework of disease\ndetection can assist doctors in the diagnosis of ocular diseases, yields\naccurate, stable, rapid outcomes, and improves the success rate of early\ndetection. The work initially intended the enhancing the quality of fundus\nimages by employing an adaptive contrast enhancement algorithm (CLAHE) and\nGamma correction. In the preprocessing techniques, CLAHE elevates the local\ncontrast of the fundus image and gamma correction increases the intensity of\nrelevant features. This study operates on a AMDNet23 system of deep learning\nthat combined the neural networks made up of convolutions (CNN) and short-term\nand long-term memory (LSTM) to automatically detect aged macular degeneration\n(AMD) disease from fundus ophthalmology. In this mechanism, CNN is utilized for\nextracting features and LSTM is utilized to detect the extracted features. The\ndataset of this research is collected from multiple sources and afterward\napplied quality assessment techniques, 2000 experimental fundus images\nencompass four distinct classes equitably. The proposed hybrid deep AMDNet23\nmodel demonstrates to detection of AMD ocular disease and the experimental\nresult achieved an accuracy 96.50%, specificity 99.32%, sensitivity 96.5%, and\nF1-score 96.49.0%. The system achieves state-of-the-art findings on fundus\nimagery datasets to diagnose AMD ocular disease and findings effectively\npotential of our method.\n","authors":["Md. Aiyub Ali","Md. Shakhawat Hossain","Md. Kawar Hossain","Subhadra Soumi Sikder","Sharun Akter Khushbu","Mirajul Islam"],"pdf_url":"https://arxiv.org/pdf/2308.15822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15816v1","updated":"2023-08-30T07:41:26Z","published":"2023-08-30T07:41:26Z","title":"Improving Underwater Visual Tracking With a Large Scale Dataset and\n Image Enhancement","summary":" This paper presents a new dataset and general tracker enhancement method for\nUnderwater Visual Object Tracking (UVOT). Despite its significance, underwater\ntracking has remained unexplored due to data inaccessibility. It poses distinct\nchallenges; the underwater environment exhibits non-uniform lighting\nconditions, low visibility, lack of sharpness, low contrast, camouflage, and\nreflections from suspended particles. Performance of traditional tracking\nmethods designed primarily for terrestrial or open-air scenarios drops in such\nconditions. We address the problem by proposing a novel underwater image\nenhancement algorithm designed specifically to boost tracking quality. The\nmethod has resulted in a significant performance improvement, of up to 5.0%\nAUC, of state-of-the-art (SOTA) visual trackers. To develop robust and accurate\nUVOT methods, large-scale datasets are required. To this end, we introduce a\nlarge-scale UVOT benchmark dataset consisting of 400 video segments and 275,000\nmanually annotated frames enabling underwater training and evaluation of deep\ntrackers. The videos are labelled with several underwater-specific tracking\nattributes including watercolor variation, target distractors, camouflage,\ntarget relative size, and low visibility conditions. The UVOT400 dataset,\ntracking results, and the code are publicly available on:\nhttps://github.com/BasitAlawode/UWVOT400.\n","authors":["Basit Alawode","Fayaz Ali Dharejo","Mehnaz Ummar","Yuhang Guo","Arif Mahmood","Naoufel Werghi","Fahad Shahbaz Khan","Sajid Javed"],"pdf_url":"https://arxiv.org/pdf/2308.15816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15807v1","updated":"2023-08-30T07:23:32Z","published":"2023-08-30T07:23:32Z","title":"ACNPU: A 4.75TOPS/W 1080P@30FPS Super Resolution Accelerator with\n Decoupled Asymmetric Convolution","summary":" Deep learning-driven superresolution (SR) outperforms traditional techniques\nbut also faces the challenge of high complexity and memory bandwidth. This\nchallenge leads many accelerators to opt for simpler and shallow models like\nFSRCNN, compromising performance for real-time needs, especially for\nresource-limited edge devices. This paper proposes an energy-efficient SR\naccelerator, ACNPU, to tackle this challenge. The ACNPU enhances image quality\nby 0.34dB with a 27-layer model, but needs 36\\% less complexity than FSRCNN,\nwhile maintaining a similar model size, with the \\textit{decoupled asymmetric\nconvolution and split-bypass structure}. The hardware-friendly 17K-parameter\nmodel enables \\textit{holistic model fusion} instead of localized layer fusion\nto remove external DRAM access of intermediate feature maps. The on-chip memory\nbandwidth is further reduced with the \\textit{input stationary flow} and\n\\textit{parallel-layer execution} to reduce power consumption. Hardware is\nregular and easy to control to support different layers by \\textit{processing\nelements (PEs) clusters with reconfigurable input and uniform data flow}. The\nimplementation in the 40 nm CMOS process consumes 2333 K gate counts and 198KB\nSRAMs. The ACNPU achieves 31.7 FPS and 124.4 FPS for x2 and x4 scales Full-HD\ngeneration, respectively, which attains 4.75 TOPS/W energy efficiency.\n","authors":["Tun-Hao Yang","Tian-Sheuan Chang"],"pdf_url":"https://arxiv.org/pdf/2308.15807v1.pdf","comment":"9 pages, 14 figures"},{"id":"http://arxiv.org/abs/2308.07016v2","updated":"2023-08-30T07:01:42Z","published":"2023-08-14T09:04:06Z","title":"HHTrack: Hyperspectral Object Tracking Using Hybrid Attention","summary":" Hyperspectral imagery provides abundant spectral information beyond the\nvisible RGB bands, offering rich discriminative details about objects in a\nscene. Leveraging such data has the potential to enhance visual tracking\nperformance. In this paper, we propose a hyperspectral object tracker based on\nhybrid attention (HHTrack). The core of HHTrack is a hyperspectral hybrid\nattention (HHA) module that unifies feature extraction and fusion within one\ncomponent through token interactions. A hyperspectral bands fusion (HBF) module\nis also introduced to selectively aggregate spatial and spectral signatures\nfrom the full hyperspectral input. Extensive experiments demonstrate the\nstate-of-the-art performance of HHTrack on benchmark Near Infrared (NIR), Red\nNear Infrared (Red-NIR), and Visible (VIS) hyperspectral tracking datasets. Our\nwork provides new insights into harnessing the strengths of transformers and\nhyperspectral fusion to advance robust object tracking.\n","authors":["Yuedong Tan"],"pdf_url":"https://arxiv.org/pdf/2308.07016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.11499v3","updated":"2023-08-30T06:57:57Z","published":"2022-08-24T12:47:58Z","title":"Semi-supervised Semantic Segmentation with Mutual Knowledge Distillation","summary":" Consistency regularization has been widely studied in recent semisupervised\nsemantic segmentation methods, and promising performance has been achieved. In\nthis work, we propose a new consistency regularization framework, termed mutual\nknowledge distillation (MKD), combined with data and feature augmentation. We\nintroduce two auxiliary mean-teacher models based on consistency\nregularization. More specifically, we use the pseudo-labels generated by a mean\nteacher to supervise the student network to achieve a mutual knowledge\ndistillation between the two branches. In addition to using image-level strong\nand weak augmentation, we also discuss feature augmentation. This involves\nconsidering various sources of knowledge to distill the student network. Thus,\nwe can significantly increase the diversity of the training samples.\nExperiments on public benchmarks show that our framework outperforms previous\nstate-of-the-art (SOTA) methods under various semi-supervised settings. Code is\navailable at semi-mmseg.\n","authors":["Jianlong Yuan","Jinchao Ge","Zhibin Wang","Yifan Liu"],"pdf_url":"https://arxiv.org/pdf/2208.11499v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15795v1","updated":"2023-08-30T06:56:53Z","published":"2023-08-30T06:56:53Z","title":"Occlusion-Aware Detection and Re-ID Calibrated Network for Multi-Object\n Tracking","summary":" Multi-Object Tracking (MOT) is a crucial computer vision task that aims to\npredict the bounding boxes and identities of objects simultaneously. While\nstate-of-the-art methods have made remarkable progress by jointly optimizing\nthe multi-task problems of detection and Re-ID feature learning, yet, few\napproaches explore to tackle the occlusion issue, which is a long-standing\nchallenge in the MOT field. Generally, occluded objects may hinder the detector\nfrom estimating the bounding boxes, resulting in fragmented trajectories. And\nthe learned occluded Re-ID embeddings are less distinct since they contain\ninterferer. To this end, we propose an occlusion-aware detection and Re-ID\ncalibrated network for multi-object tracking, termed as ORCTrack. Specifically,\nwe propose an Occlusion-Aware Attention (OAA) module in the detector that\nhighlights the object features while suppressing the occluded background\nregions. OAA can serve as a modulator that enhances the detector for some\npotentially occluded objects. Furthermore, we design a Re-ID embedding matching\nblock based on the optimal transport problem, which focuses on enhancing and\ncalibrating the Re-ID representations through different adjacent frames\ncomplementarily. To validate the effectiveness of the proposed method,\nextensive experiments are conducted on two challenging VisDrone2021-MOT and\nKITTI benchmarks. Experimental evaluations demonstrate the superiority of our\napproach, which can achieve new state-of-the-art performance and enjoy high\nrun-time efficiency.\n","authors":["Yukun Su","Ruizhou Sun","Xin Shu","Yu Zhang","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2308.15795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15791v1","updated":"2023-08-30T06:49:34Z","published":"2023-08-30T06:49:34Z","title":"Neural Video Compression with Temporal Layer-Adaptive Hierarchical\n B-frame Coding","summary":" Neural video compression (NVC) is a rapidly evolving video coding research\narea, with some models achieving superior coding efficiency compared to the\nlatest video coding standard Versatile Video Coding (VVC). In conventional\nvideo coding standards, the hierarchical B-frame coding, which utilizes a\nbidirectional prediction structure for higher compression, had been\nwell-studied and exploited. In NVC, however, limited research has investigated\nthe hierarchical B scheme. In this paper, we propose an NVC model exploiting\nhierarchical B-frame coding with temporal layer-adaptive optimization. We first\nextend an existing unidirectional NVC model to a bidirectional model, which\nachieves -21.13% BD-rate gain over the unidirectional baseline model. However,\nthis model faces challenges when applied to sequences with complex or large\nmotions, leading to performance degradation. To address this, we introduce\ntemporal layer-adaptive optimization, incorporating methods such as temporal\nlayer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent\nscaling (TALS). The final model with the proposed methods achieves an\nimpressive BD-rate gain of -39.86% against the baseline. It also resolves the\nchallenges in sequences with large or complex motions with up to -49.13% more\nBD-rate gains than the simple bidirectional extension. This improvement is\nattributed to the allocation of more bits to lower temporal layers, thereby\nenhancing overall reconstruction quality with smaller bits. Since our method\nhas little dependency on a specific NVC model architecture, it can serve as a\ngeneral tool for extending unidirectional NVC models to the ones with\nhierarchical B-frame coding.\n","authors":["Yeongwoong Kim","Suyong Bahk","Seungeon Kim","Won Hee Lee","Dokwan Oh","Hui Yong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.07394v3","updated":"2023-08-30T06:36:08Z","published":"2022-06-15T08:55:47Z","title":"Efficient Adaptive Ensembling for Image Classification","summary":" In recent times, with the exception of sporadic cases, the trend in Computer\nVision is to achieve minor improvements compared to considerable increases in\ncomplexity.\n To reverse this trend, we propose a novel method to boost image\nclassification performances without increasing complexity.\n To this end, we revisited ensembling, a powerful approach, often not used\nproperly due to its more complex nature and the training time, so as to make it\nfeasible through a specific design choice. First, we trained two\nEfficientNet-b0 end-to-end models (known to be the architecture with the best\noverall accuracy/complexity trade-off for image classification) on disjoint\nsubsets of data (i.e. bagging). Then, we made an efficient adaptive ensemble by\nperforming fine-tuning of a trainable combination layer. In this way, we were\nable to outperform the state-of-the-art by an average of 0.5$\\%$ on the\naccuracy, with restrained complexity both in terms of the number of parameters\n(by 5-60 times), and the FLoating point Operations Per Second (FLOPS) by 10-100\ntimes on several major benchmark datasets.\n","authors":["Antonio Bruno","Davide Moroni","Massimo Martinelli"],"pdf_url":"https://arxiv.org/pdf/2206.07394v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04557v2","updated":"2023-08-30T06:30:43Z","published":"2023-03-08T13:15:19Z","title":"Scene Matters: Model-based Deep Video Compression","summary":" Video compression has always been a popular research area, where many\ntraditional and deep video compression methods have been proposed. These\nmethods typically rely on signal prediction theory to enhance compression\nperformance by designing high efficient intra and inter prediction strategies\nand compressing video frames one by one. In this paper, we propose a novel\nmodel-based video compression (MVC) framework that regards scenes as the\nfundamental units for video sequences. Our proposed MVC directly models the\nintensity variation of the entire video sequence in one scene, seeking\nnon-redundant representations instead of reducing redundancy through\nspatio-temporal predictions. To achieve this, we employ implicit neural\nrepresentation as our basic modeling architecture. To improve the efficiency of\nvideo modeling, we first propose context-related spatial positional embedding\nand frequency domain supervision in spatial context enhancement. For temporal\ncorrelation capturing, we design the scene flow constrain mechanism and\ntemporal contrastive loss. Extensive experimental results demonstrate that our\nmethod achieves up to a 20\\% bitrate reduction compared to the latest video\ncoding standard H.266 and is more efficient in decoding than existing video\ncoding strategies.\n","authors":["Lv Tang","Xinfeng Zhang","Gai Zhang","Xiaoqi Ma"],"pdf_url":"https://arxiv.org/pdf/2303.04557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10799v2","updated":"2023-08-30T05:01:31Z","published":"2023-06-19T09:39:10Z","title":"SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend\n 3D Talking Faces","summary":" Speech-driven 3D face animation technique, extending its applications to\nvarious multimedia fields. Previous research has generated promising realistic\nlip movements and facial expressions from audio signals. However, traditional\nregression models solely driven by data face several essential problems, such\nas difficulties in accessing precise labels and domain gaps between different\nmodalities, leading to unsatisfactory results lacking precision and coherence.\nTo enhance the visual accuracy of generated lip movement while reducing the\ndependence on labeled data, we propose a novel framework SelfTalk, by involving\nself-supervision in a cross-modals network system to learn 3D talking faces.\nThe framework constructs a network system consisting of three modules: facial\nanimator, speech recognizer, and lip-reading interpreter. The core of SelfTalk\nis a commutative training diagram that facilitates compatible features exchange\namong audio, text, and lip shape, enabling our models to learn the intricate\nconnection between these factors. The proposed framework leverages the\nknowledge learned from the lip-reading interpreter to generate more plausible\nlip shapes. Extensive experiments and user studies demonstrate that our\nproposed approach achieves state-of-the-art performance both qualitatively and\nquantitatively. We recommend watching the supplementary video.\n","authors":["Ziqiao Peng","Yihao Luo","Yue Shi","Hao Xu","Xiangyu Zhu","Jun He","Hongyan Liu","Zhaoxin Fan"],"pdf_url":"https://arxiv.org/pdf/2306.10799v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2304.04027v3","updated":"2023-08-30T04:55:04Z","published":"2023-04-08T14:40:35Z","title":"Estimating 3D Dental Structures using Simulated Panoramic Radiographs\n and Neural Ray Tracing","summary":" Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality\nfor dental examination. However, PX only provides a flattened 2D image, lacking\nin a 3D view of the oral structure. In this paper, we propose a framework to\nestimate 3D oral structures from real-world PX. Our framework tackles full 3D\nreconstruction for varying subjects (patients) where each reconstruction is\nbased only on a single panoramic image. We create an intermediate\nrepresentation called simulated PX (SimPX) from 3D Cone-beam computed\ntomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and\nrotational principles of PX imaging. SimPX aims at not only truthfully\nsimulating PX, but also facilitates the reverting process back to 3D data. We\npropose a novel neural model based on ray tracing which exploits both global\nand local input features to convert SimPX to 3D output. At inference, a real PX\nimage is translated to a SimPX-style image with semantic regularization, and\nthe translated image is processed by generation module to produce high-quality\noutputs. Experiments show that our method outperforms prior state-of-the-art in\nreconstruction tasks both quantitatively and qualitatively. Unlike prior\nmethods, Our method does not require any prior information such as the shape of\ndental arches, nor the matched PX-CBCT dataset for training, which is difficult\nto obtain in clinical practice.\n","authors":["Sihwa Park","Seongjun Kim","Doeyoung Kwon","Yohan Jang","In-Seok Song","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2304.04027v3.pdf","comment":"20 pages, 16 figures"},{"id":"http://arxiv.org/abs/2306.01762v2","updated":"2023-08-30T04:53:15Z","published":"2023-05-27T06:00:51Z","title":"Pre-trained transformer for adversarial purification","summary":" With more and more deep neural networks being deployed as various daily\nservices, their reliability is essential. It's frightening that deep neural\nnetworks are vulnerable and sensitive to adversarial attacks, the most common\none of which for the services is evasion-based. Recent works usually strengthen\nthe robustness by adversarial training or leveraging the knowledge of an amount\nof clean data. However, in practical terms, retraining and redeploying the\nmodel need a large computational budget, leading to heavy losses to the online\nservice. In addition, when adversarial examples of a certain attack are\ndetected, only limited adversarial examples are available for the service\nprovider, while much clean data may not be accessible. Given the mentioned\nproblems, we propose a new scenario, RaPiD (Rapid Plug-in Defender), which is\nto rapidly defend against a certain attack for the frozen original service\nmodel with limitations of few clean and adversarial examples. Motivated by the\ngeneralization and the universal computation ability of pre-trained transformer\nmodels, we come up with a new defender method, CeTaD, which stands for\nConsidering Pre-trained Transformers as Defenders. In particular, we evaluate\nthe effectiveness and the transferability of CeTaD in the case of one-shot\nadversarial examples and explore the impact of different parts of CeTaD as well\nas training data conditions. CeTaD is flexible, able to be embedded into an\narbitrary differentiable model, and suitable for various types of attacks.\n","authors":["Kai Wu","Yujian Betterest Li","Xiaoyu Zhang","Handing Wang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.01762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01418v3","updated":"2023-08-30T04:41:10Z","published":"2023-03-02T17:09:27Z","title":"Human Motion Diffusion as a Generative Prior","summary":" Recent work has demonstrated the significant potential of denoising diffusion\nmodels for generating human motion, including text-to-motion capabilities.\nHowever, these methods are restricted by the paucity of annotated motion data,\na focus on single-person motions, and a lack of detailed control. In this\npaper, we introduce three forms of composition based on diffusion priors:\nsequential, parallel, and model composition. Using sequential composition, we\ntackle the challenge of long sequence generation. We introduce DoubleTake, an\ninference-time method with which we generate long animations consisting of\nsequences of prompted intervals and their transitions, using a prior trained\nonly for short clips. Using parallel composition, we show promising steps\ntoward two-person generation. Beginning with two fixed priors as well as a few\ntwo-person training examples, we learn a slim communication block, ComMDM, to\ncoordinate interaction between the two resulting motions. Lastly, using model\ncomposition, we first train individual priors to complete motions that realize\na prescribed motion for a given joint. We then introduce DiffusionBlending, an\ninterpolation mechanism to effectively blend several such models to enable\nflexible and efficient fine-grained joint and trajectory-level control and\nediting. We evaluate the composition methods using an off-the-shelf motion\ndiffusion model, and further compare the results to dedicated models trained\nfor these specific tasks.\n","authors":["Yonatan Shafir","Guy Tevet","Roy Kapon","Amit H. Bermano"],"pdf_url":"https://arxiv.org/pdf/2303.01418v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15752v1","updated":"2023-08-30T04:29:48Z","published":"2023-08-30T04:29:48Z","title":"Large-scale data extraction from the UNOS organ donor documents","summary":" The scope of our study is all UNOS data of the USA organ donors since 2008.\nThe data is not analyzable in a large scale in the past because it was captured\nin PDF documents known as \"Attachments\", whereby every donor is represented by\ndozens of PDF documents in heterogenous formats. To make the data analyzable,\none needs to convert the content inside these PDFs to an analyzable data\nformat, such as a standard SQL database. In this paper we will focus on 2022\nUNOS data comprised of $\\approx 400,000$ PDF documents spanning millions of\npages. The totality of UNOS data covers 15 years (2008--20022) and our results\nwill be quickly extended to the entire data. Our method captures a portion of\nthe data in DCD flowsheets, kidney perfusion data, and data captured during\npatient hospital stay (e.g. vital signs, ventilator settings, etc.). The\ncurrent paper assumes that the reader is familiar with the content of the UNOS\ndata. The overview of the types of data and challenges they present is a\nsubject of another paper. Here we focus on demonstrating that the goal of\nbuilding a comprehensive, analyzable database from UNOS documents is an\nattainable task, and we provide an overview of our methodology. The project\nresulted in datasets by far larger than previously available even in this\npreliminary phase.\n","authors":["Marek Rychlik","Bekir Tanriover","Yan Han"],"pdf_url":"https://arxiv.org/pdf/2308.15752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.03741v4","updated":"2023-08-30T04:18:50Z","published":"2022-12-07T16:10:08Z","title":"FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance\n Generation","summary":" Generating full-body and multi-genre dance sequences from given music is a\nchallenging task, due to the limitations of existing datasets and the inherent\ncomplexity of the fine-grained hand motion and dance genres. To address these\nproblems, we propose FineDance, which contains 14.6 hours of music-dance paired\ndata, with fine-grained hand motions, fine-grained genres (22 dance genres),\nand accurate posture. To the best of our knowledge, FineDance is the largest\nmusic-dance paired dataset with the most dance genres. Additionally, to address\nmonotonous and unnatural hand movements existing in previous methods, we\npropose a full-body dance generation network, which utilizes the diverse\ngeneration capabilities of the diffusion model to solve monotonous problems,\nand use expert nets to solve unreal problems. To further enhance the\ngenre-matching and long-term stability of generated dances, we propose a\nGenre&Coherent aware Retrieval Module. Besides, we propose a novel metric named\nGenre Matching Score to evaluate the genre-matching degree between dance and\nmusic. Quantitative and qualitative experiments demonstrate the quality of\nFineDance, and the state-of-the-art performance of FineNet. The FineDance\nDataset and more qualitative samples can be found at our website.\n","authors":["Ronghui Li","Junfan Zhao","Yachao Zhang","Mingyang Su","Zeping Ren","Han Zhang","Yansong Tang","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2212.03741v4.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2305.12596v2","updated":"2023-08-30T03:55:54Z","published":"2023-05-21T23:10:14Z","title":"iWarpGAN: Disentangling Identity and Style to Generate Synthetic Iris\n Images","summary":" Generative Adversarial Networks (GANs) have shown success in approximating\ncomplex distributions for synthetic image generation. However, current\nGAN-based methods for generating biometric images, such as iris, have certain\nlimitations: (a) the synthetic images often closely resemble images in the\ntraining dataset; (b) the generated images lack diversity in terms of the\nnumber of unique identities represented in them; and (c) it is difficult to\ngenerate multiple images pertaining to the same identity. To overcome these\nissues, we propose iWarpGAN that disentangles identity and style in the context\nof the iris modality by using two transformation pathways: Identity\nTransformation Pathway to generate unique identities from the training set, and\nStyle Transformation Pathway to extract the style code from a reference image\nand output an iris image using this style. By concatenating the transformed\nidentity code and reference style code, iWarpGAN generates iris images with\nboth inter- and intra-class variations. The efficacy of the proposed method in\ngenerating such iris DeepFakes is evaluated both qualitatively and\nquantitatively using ISO/IEC 29794-6 Standard Quality Metrics and the VeriEye\niris matcher. Further, the utility of the synthetically generated images is\ndemonstrated by improving the performance of deep learning based iris matchers\nthat augment synthetic data with real data during the training process.\n","authors":["Shivangi Yadav","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2305.12596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15740v1","updated":"2023-08-30T03:35:55Z","published":"2023-08-30T03:35:55Z","title":"Beard Segmentation and Recognition Bias","summary":" A person's facial hairstyle, such as presence and size of beard, can\nsignificantly impact face recognition accuracy. There are publicly-available\ndeep networks that achieve reasonable accuracy at binary attribute\nclassification, such as beard / no beard, but few if any that segment the\nfacial hair region. To investigate the effect of facial hair in a rigorous\nmanner, we first created a set of fine-grained facial hair annotations to train\na segmentation model and evaluate its accuracy across African-American and\nCaucasian face images. We then use our facial hair segmentations to categorize\nimage pairs according to the degree of difference or similarity in the facial\nhairstyle. We find that the False Match Rate (FMR) for image pairs with\ndifferent categories of facial hairstyle varies by a factor of over 10 for\nAfrican-American males and over 25 for Caucasian males. To reduce the bias\nacross image pairs with different facial hairstyles, we propose a scheme for\nadaptive thresholding based on facial hairstyle similarity. Evaluation on a\nsubject-disjoint set of images shows that adaptive similarity thresholding\nbased on facial hairstyles of the image pair reduces the ratio between the\nhighest and lowest FMR across facial hairstyle categories for African-American\nfrom 10.7 to 1.8 and for Caucasians from 25.9 to 1.3. Facial hair annotations\nand facial hair segmentation model will be publicly available.\n","authors":["Kagan Ozturk","Grace Bezold","Aman Bhatta","Haiyu Wu","Kevin Bowyer"],"pdf_url":"https://arxiv.org/pdf/2308.15740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02862v2","updated":"2023-08-30T03:21:29Z","published":"2023-03-06T03:27:17Z","title":"EvHandPose: Event-based 3D Hand Pose Estimation with Sparse Supervision","summary":" Event camera shows great potential in 3D hand pose estimation, especially\naddressing the challenges of fast motion and high dynamic range in a low-power\nway. However, due to the asynchronous differential imaging mechanism, it is\nchallenging to design event representation to encode hand motion information\nespecially when the hands are not moving (causing motion ambiguity), and it is\ninfeasible to fully annotate the temporally dense event stream. In this paper,\nwe propose EvHandPose with novel hand flow representations in Event-to-Pose\nmodule for accurate hand pose estimation and alleviating the motion ambiguity\nissue. To solve the problem under sparse annotation, we design contrast\nmaximization and hand-edge constraints in Pose-to-IWE (Image with Warped\nEvents) module and formulate EvHandPose in a weakly-supervision framework. We\nfurther build EvRealHands, the first large-scale real-world event-based hand\npose dataset on several challenging scenes to bridge the real-synthetic domain\ngap. Experiments on EvRealHands demonstrate that EvHandPose outperforms\nprevious event-based methods under all evaluation scenes, achieves accurate and\nstable hand pose estimation with high temporal resolution in fast motion and\nstrong light scenes compared with RGB-based methods, generalizes well to\noutdoor scenes and another type of event camera, and shows the potential for\nthe hand gesture recognition task.\n","authors":["Jianping Jiang","Jiahe Li","Baowen Zhang","Xiaoming Deng","Boxin Shi"],"pdf_url":"https://arxiv.org/pdf/2303.02862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15733v1","updated":"2023-08-30T03:17:57Z","published":"2023-08-30T03:17:57Z","title":"Drone-NeRF: Efficient NeRF Based 3D Scene Reconstruction for Large-Scale\n Drone Survey","summary":" Neural rendering has garnered substantial attention owing to its capacity for\ncreating realistic 3D scenes. However, its applicability to extensive scenes\nremains challenging, with limitations in effectiveness. In this work, we\npropose the Drone-NeRF framework to enhance the efficient reconstruction of\nunbounded large-scale scenes suited for drone oblique photography using Neural\nRadiance Fields (NeRF). Our approach involves dividing the scene into uniform\nsub-blocks based on camera position and depth visibility. Sub-scenes are\ntrained in parallel using NeRF, then merged for a complete scene. We refine the\nmodel by optimizing camera poses and guiding NeRF with a uniform sampler.\nIntegrating chosen samples enhances accuracy. A hash-coded fusion MLP\naccelerates density representation, yielding RGB and Depth outputs. Our\nframework accounts for sub-scene constraints, reduces parallel-training noise,\nhandles shadow occlusion, and merges sub-regions for a polished rendering\nresult. This Drone-NeRF framework demonstrates promising capabilities in\naddressing challenges related to scene complexity, rendering efficiency, and\naccuracy in drone-obtained imagery.\n","authors":["Zhihao Jia","Bing Wang","Changhao Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15733v1.pdf","comment":"15 pages, 7 figures, in submission"},{"id":"http://arxiv.org/abs/2303.07543v4","updated":"2023-08-30T03:12:34Z","published":"2023-03-14T00:13:57Z","title":"WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant\n Analysis","summary":" Deep neural networks are susceptible to generating overconfident yet\nerroneous predictions when presented with data beyond known concepts. This\nchallenge underscores the importance of detecting out-of-distribution (OOD)\nsamples in the open world. In this work, we propose a novel feature-space OOD\ndetection score based on class-specific and class-agnostic information.\nSpecifically, the approach utilizes Whitened Linear Discriminant Analysis to\nproject features into two subspaces - the discriminative and residual subspaces\n- for which the in-distribution (ID) classes are maximally separated and\nclosely clustered, respectively. The OOD score is then determined by combining\nthe deviation from the input data to the ID pattern in both subspaces. The\nefficacy of our method, named WDiscOOD, is verified on the large-scale\nImageNet-1k benchmark, with six OOD datasets that cover a variety of\ndistribution shifts. WDiscOOD demonstrates superior performance on deep\nclassifiers with diverse backbone architectures, including CNN and vision\ntransformer. Furthermore, we also show that WDiscOOD more effectively detects\nnovel concepts in representation spaces trained with contrastive objectives,\nincluding supervised contrastive loss and multi-modality contrastive loss.\n","authors":["Yiye Chen","Yunzhi Lin","Ruinian Xu","Patricio A. Vela"],"pdf_url":"https://arxiv.org/pdf/2303.07543v4.pdf","comment":"Accepted by ICCV 2023. Code is available at:\n https://github.com/ivalab/WDiscOOD.git"},{"id":"http://arxiv.org/abs/2305.04466v2","updated":"2023-08-30T03:10:19Z","published":"2023-05-08T05:34:15Z","title":"Generalized Universal Domain Adaptation with Generative Flow Networks","summary":" We introduce a new problem in unsupervised domain adaptation, termed as\nGeneralized Universal Domain Adaptation (GUDA), which aims to achieve precise\nprediction of all target labels including unknown categories. GUDA bridges the\ngap between label distribution shift-based and label space mismatch-based\nvariants, essentially categorizing them as a unified problem, guiding to a\ncomprehensive framework for thoroughly solving all the variants. The key\nchallenge of GUDA is developing and identifying novel target categories while\nestimating the target label distribution. To address this problem, we take\nadvantage of the powerful exploration capability of generative flow networks\nand propose an active domain adaptation algorithm named GFlowDA, which selects\ndiverse samples with probabilities proportional to a reward function. To\nenhance the exploration capability and effectively perceive the target label\ndistribution, we tailor the states and rewards, and introduce an efficient\nsolution for parent exploration and state transition. We also propose a\ntraining paradigm for GUDA called Generalized Universal Adversarial Network\n(GUAN), which involves collaborative optimization between GUAN and GFlowNet.\nTheoretical analysis highlights the importance of exploration, and extensive\nexperiments on benchmark datasets demonstrate the superiority of GFlowDA.\n","authors":["Didi Zhu","Yinchuan Li","Yunfeng Shao","Jianye Hao","Fei Wu","Kun Kuang","Jun Xiao","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2305.04466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15724v1","updated":"2023-08-30T02:56:55Z","published":"2023-08-30T02:56:55Z","title":"Background Debiased SAR Target Recognition via Causal Interventional\n Regularizer","summary":" Recent studies have utilized deep learning (DL) techniques to automatically\nextract features from synthetic aperture radar (SAR) images, which shows great\npromise for enhancing the performance of SAR automatic target recognition\n(ATR). However, our research reveals a previously overlooked issue: SAR images\nto be recognized include not only the foreground (i.e., the target), but also a\ncertain size of the background area. When a DL-model is trained exclusively on\nforeground data, its recognition performance is significantly superior to a\nmodel trained on original data that includes both foreground and background.\nThis suggests that the presence of background impedes the ability of the\nDL-model to learn additional semantic information about the target. To address\nthis issue, we construct a structural causal model (SCM) that incorporates the\nbackground as a confounder. Based on the constructed SCM, we propose a causal\nintervention based regularization method to eliminate the negative impact of\nbackground on feature semantic learning and achieve background debiased\nSAR-ATR. The proposed causal interventional regularizer can be integrated into\nany existing DL-based SAR-ATR models to mitigate the impact of background\ninterference on the feature extraction and recognition accuracy. Experimental\nresults on the Moving and Stationary Target Acquisition and Recognition (MSTAR)\ndataset indicate that the proposed method can enhance the efficiency of\nexisting DL-based methods in a plug-and-play manner.\n","authors":["Hongwei Dong","Fangzhou Han","Lingyu Si","Wenwen Qiang","Lamei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.15724v1.pdf","comment":"38 pages, 8 figures"},{"id":"http://arxiv.org/abs/2304.11862v4","updated":"2023-08-30T02:55:09Z","published":"2023-04-24T07:16:54Z","title":"Universal Domain Adaptation via Compressive Attention Matching","summary":" Universal domain adaptation (UniDA) aims to transfer knowledge from the\nsource domain to the target domain without any prior knowledge about the label\nset. The challenge lies in how to determine whether the target samples belong\nto common categories. The mainstream methods make judgments based on the sample\nfeatures, which overemphasizes global information while ignoring the most\ncrucial local objects in the image, resulting in limited accuracy. To address\nthis issue, we propose a Universal Attention Matching (UniAM) framework by\nexploiting the self-attention mechanism in vision transformer to capture the\ncrucial object information. The proposed framework introduces a novel\nCompressive Attention Matching (CAM) approach to explore the core information\nby compressively representing attentions. Furthermore, CAM incorporates a\nresidual-based measurement to determine the sample commonness. By utilizing the\nmeasurement, UniAM achieves domain-wise and category-wise Common Feature\nAlignment (CFA) and Target Class Separation (TCS). Notably, UniAM is the first\nmethod utilizing the attention in vision transformer directly to perform\nclassification tasks. Extensive experiments show that UniAM outperforms the\ncurrent state-of-the-art methods on various benchmark datasets.\n","authors":["Didi Zhu","Yincuan Li","Junkun Yuan","Zexi Li","Kun Kuang","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2304.11862v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10720v3","updated":"2023-08-30T02:47:27Z","published":"2023-06-19T06:41:19Z","title":"Exploring the Relationship between Samples and Masks for Robust Defect\n Localization","summary":" Defect detection aims to detect and localize regions out of the normal\ndistribution.Previous approaches model normality and compare it with the input\nto identify defective regions, potentially limiting their generalizability.This\npaper proposes a one-stage framework that detects defective patterns directly\nwithout the modeling process.This ability is adopted through the joint efforts\nof three parties: a generative adversarial network (GAN), a newly proposed\nscaled pattern loss, and a dynamic masked cycle-consistent auxiliary network.\nExplicit information that could indicate the position of defects is\nintentionally excluded to avoid learning any direct mapping.Experimental\nresults on the texture class of the challenging MVTec AD dataset show that the\nproposed method is 2.9\\% higher than the SOTA methods in F1-Score, while\nsubstantially outperforming SOTA methods in generalizability.\n","authors":["Jiang Lin","Yaping Yan"],"pdf_url":"https://arxiv.org/pdf/2306.10720v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10421v2","updated":"2023-08-30T02:32:08Z","published":"2023-08-21T02:13:40Z","title":"UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D\n Representation for 3D Perception in Autonomous Driving","summary":" Masked Autoencoders (MAE) play a pivotal role in learning potent\nrepresentations, delivering outstanding results across various 3D perception\ntasks essential for autonomous driving. In real-world driving scenarios, it's\ncommonplace to deploy multiple sensors for comprehensive environment\nperception. While integrating multi-modal features from these sensors can\nproduce rich and powerful features, there is a noticeable gap in MAE methods\naddressing this integration. This research delves into multi-modal Masked\nAutoencoders tailored for a unified representation space in autonomous driving,\naiming to pioneer a more efficient fusion of two distinct modalities. To\nintricately marry the semantics inherent in images with the geometric\nintricacies of LiDAR point clouds, the UniM$^2$AE is proposed. This model\nstands as a potent yet straightforward, multi-modal self-supervised\npre-training framework, mainly consisting of two designs. First, it projects\nthe features from both modalities into a cohesive 3D volume space, ingeniously\nexpanded from the bird's eye view (BEV) to include the height dimension. The\nextension makes it possible to back-project the informative features, obtained\nby fusing features from both modalities, into their native modalities to\nreconstruct the multiple masked inputs. Second, the Multi-modal 3D Interactive\nModule (MMIM) is invoked to facilitate the efficient inter-modal interaction\nduring the interaction process. Extensive experiments conducted on the nuScenes\nDataset attest to the efficacy of UniM$^2$AE, indicating enhancements in 3D\nobject detection and BEV map segmentation by 1.2\\%(NDS) and 6.5\\% (mIoU),\nrespectively. Code is available at https://github.com/hollow-503/UniM2AE.\n","authors":["Jian Zou","Tianyu Huang","Guanglei Yang","Zhenhua Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.10421v2.pdf","comment":"Code available at https://github.com/hollow-503/UniM2AE"},{"id":"http://arxiv.org/abs/2303.17895v4","updated":"2023-08-30T02:10:53Z","published":"2023-03-31T08:56:29Z","title":"EA-LSS: Edge-aware Lift-splat-shot Framework for 3D BEV Object Detection","summary":" In recent years, great progress has been made in the Lift-Splat-Shot-based\n(LSS-based) 3D object detection method. However, inaccurate depth estimation\nremains an important constraint to the accuracy of camera-only and multi-model\n3D object detection models, especially in regions where the depth changes\nsignificantly (i.e., the \"depth jump\" problem). In this paper, we proposed a\nnovel Edge-aware Lift-splat-shot (EA-LSS) framework. Specifically, edge-aware\ndepth fusion (EADF) module is proposed to alleviate the \"depth jump\" problem\nand fine-grained depth (FGD) module to further enforce refined supervision on\ndepth. Our EA-LSS framework is compatible for any LSS-based 3D object detection\nmodels, and effectively boosts their performances with negligible increment of\ninference time. Experiments on nuScenes benchmarks demonstrate that EA-LSS is\neffective in either camera-only or multi-model models. It is worth mentioning\nthat EA-LSS achieved the state-of-the-art performance on nuScenes test\nbenchmarks with mAP and NDS of 76.5% and 77.6%, respectively.\n","authors":["Haotian Hu","Fanyi Wang","Jingwen Su","Yaonong Wang","Laifeng Hu","Weiye Fang","Jingwei Xu","Zhiwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.17895v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15705v1","updated":"2023-08-30T02:01:19Z","published":"2023-08-30T02:01:19Z","title":"Towards Earlier Detection of Oral Diseases On Smartphones Using Oral and\n Dental RGB Images","summary":" Oral diseases such as periodontal (gum) diseases and dental caries (cavities)\naffect billions of people across the world today. However, previous\nstate-of-the-art models have relied on X-ray images to detect oral diseases,\nmaking them inaccessible to remote monitoring, developing countries, and\ntelemedicine. To combat this overuse of X-ray imagery, we propose a lightweight\nmachine learning model capable of detecting calculus (also known as hardened\nplaque or tartar) in RGB images while running efficiently on low-end devices.\nThe model, a modified MobileNetV3-Small neural network transfer learned from\nImageNet, achieved an accuracy of 72.73% (which is comparable to\nstate-of-the-art solutions) while still being able to run on mobile devices due\nto its reduced memory requirements and processing times. A ResNet34-based model\nwas also constructed and achieved an accuracy of 81.82%. Both of these models\nwere tested on a mobile app, demonstrating their potential to limit the number\nof serious oral disease cases as their predictions can help patients schedule\nappointments earlier without the need to go to the clinic.\n","authors":["Ayush Garg","Julia Lu","Anika Maji"],"pdf_url":"https://arxiv.org/pdf/2308.15705v1.pdf","comment":"10 pages, 6 figures, 1 formula. This research was conducted as a\n mentored project performed for a college course and research program at the\n University of California Santa Barbara's Summer Research Academies program"},{"id":"http://arxiv.org/abs/2308.15005v2","updated":"2023-08-30T01:54:27Z","published":"2023-08-29T03:54:26Z","title":"Few-Shot Object Detection via Synthetic Features with Optimal Transport","summary":" Few-shot object detection aims to simultaneously localize and classify the\nobjects in an image with limited training samples. However, most existing\nfew-shot object detection methods focus on extracting the features of a few\nsamples of novel classes that lack diversity. Hence, they may not be sufficient\nto capture the data distribution. To address that limitation, in this paper, we\npropose a novel approach in which we train a generator to generate synthetic\ndata for novel classes. Still, directly training a generator on the novel class\nis not effective due to the lack of novel data. To overcome that issue, we\nleverage the large-scale dataset of base classes. Our overarching goal is to\ntrain a generator that captures the data variations of the base dataset. We\nthen transform the captured variations into novel classes by generating\nsynthetic data with the trained generator. To encourage the generator to\ncapture data variations on base classes, we propose to train the generator with\nan optimal transport loss that minimizes the optimal transport distance between\nthe distributions of real and synthetic data. Extensive experiments on two\nbenchmark datasets demonstrate that the proposed method outperforms the state\nof the art. Source code will be available.\n","authors":["Anh-Khoa Nguyen Vu","Thanh-Toan Do","Vinh-Tiep Nguyen","Tam Le","Minh-Triet Tran","Tam V. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.15005v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06767v3","updated":"2023-08-30T01:25:29Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to the successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects the high-quality samples, discarding those that exhibit\nundesired behavior, and subsequently enhancing the model by fine-tuning on\nthese filtered samples. Our studies show that RAFT can effectively improve the\nmodel performance in both reward learning and other automated metrics in both\nlarge language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v3.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.15692v1","updated":"2023-08-30T01:21:11Z","published":"2023-08-30T01:21:11Z","title":"Intriguing Properties of Diffusion Models: A Large-Scale Dataset for\n Evaluating Natural Attack Capability in Text-to-Image Generative Models","summary":" Denoising probabilistic diffusion models have shown breakthrough performance\nthat can generate more photo-realistic images or human-level illustrations than\nthe prior models such as GANs. This high image-generation capability has\nstimulated the creation of many downstream applications in various areas.\nHowever, we find that this technology is indeed a double-edged sword: We\nidentify a new type of attack, called the Natural Denoising Diffusion (NDD)\nattack based on the finding that state-of-the-art deep neural network (DNN)\nmodels still hold their prediction even if we intentionally remove their robust\nfeatures, which are essential to the human visual system (HVS), by text\nprompts. The NDD attack can generate low-cost, model-agnostic, and\ntransferrable adversarial attacks by exploiting the natural attack capability\nin diffusion models. Motivated by the finding, we construct a large-scale\ndataset, Natural Denoising Diffusion Attack (NDDA) dataset, to systematically\nevaluate the risk of the natural attack capability of diffusion models with\nstate-of-the-art text-to-image diffusion models. We evaluate the natural attack\ncapability by answering 6 research questions. Through a user study to confirm\nthe validity of the NDD attack, we find that the NDD attack can achieve an 88%\ndetection rate while being stealthy to 93% of human subjects. We also find that\nthe non-robust features embedded by diffusion models contribute to the natural\nattack capability. To confirm the model-agnostic and transferrable attack\ncapability, we perform the NDD attack against an AD vehicle and find that 73%\nof the physically printed attacks can be detected as a stop sign. We hope that\nour study and dataset can help our community to be aware of the risk of\ndiffusion models and facilitate further research toward robust DNN models.\n","authors":["Takami Sato","Justin Yue","Nanze Chen","Ningfei Wang","Qi Alfred Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15690v1","updated":"2023-08-30T01:14:32Z","published":"2023-08-30T01:14:32Z","title":"CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts","summary":" We present 'CongNaMul', a comprehensive dataset designed for various tasks in\nsoybean sprouts image analysis. The CongNaMul dataset is curated to facilitate\ntasks such as image classification, semantic segmentation, decomposition, and\nmeasurement of length and weight. The classification task provides four classes\nto determine the quality of soybean sprouts: normal, broken, spotted, and\nbroken and spotted, for the development of AI-aided automatic quality\ninspection technology. For semantic segmentation, images with varying\ncomplexity, from single sprout images to images with multiple sprouts, along\nwith human-labelled mask images, are included. The label has 4 different\nclasses: background, head, body, tail. The dataset also provides images and\nmasks for the image decomposition task, including two separate sprout images\nand their combined form. Lastly, 5 physical features of sprouts (head length,\nbody length, body thickness, tail length, weight) are provided for image-based\nmeasurement tasks. This dataset is expected to be a valuable resource for a\nwide range of research and applications in the advanced analysis of images of\nsoybean sprouts. Also, we hope that this dataset can assist researchers\nstudying classification, semantic segmentation, decomposition, and physical\nfeature measurement in other industrial fields, in evaluating their models. The\ndataset is available at the authors' repository. (https://bhban.kr/data)\n","authors":["Byunghyun Ban","Donghun Ryu","Su-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15690v1.pdf","comment":"Accepted to International Conference on ICT Convergence 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.12767v2","updated":"2023-08-30T15:52:34Z","published":"2023-08-24T13:14:49Z","title":"On the Consistency of Average Embeddings for Item Recommendation","summary":" A prevalent practice in recommender systems consists in averaging item\nembeddings to represent users or higher-level concepts in the same embedding\nspace. This paper investigates the relevance of such a practice. For this\npurpose, we propose an expected precision score, designed to measure the\nconsistency of an average embedding relative to the items used for its\nconstruction. We subsequently analyze the mathematical expression of this score\nin a theoretical setting with specific assumptions, as well as its empirical\nbehavior on real-world data from music streaming services. Our results\nemphasize that real-world averages are less consistent for recommendation,\nwhich paves the way for future research to better align real-world embeddings\nwith assumptions from our theoretical setting.\n","authors":["Walid Bendada","Guillaume Salha-Galvan","Romain Hennequin","Thomas Bouabça","Tristan Cazenave"],"pdf_url":"https://arxiv.org/pdf/2308.12767v2.pdf","comment":"17th ACM Conference on Recommender Systems (RecSys 2023)"},{"id":"http://arxiv.org/abs/2305.17926v2","updated":"2023-08-30T13:22:35Z","published":"2023-05-29T07:41:03Z","title":"Large Language Models are not Fair Evaluators","summary":" In this paper, we uncover a systematic bias in the evaluation paradigm of\nadopting large language models~(LLMs), e.g., GPT-4, as a referee to score and\ncompare the quality of responses generated by candidate models. We find that\nthe quality ranking of candidate responses can be easily hacked by simply\naltering their order of appearance in the context. This manipulation allows us\nto skew the evaluation result, making one model appear considerably superior to\nthe other, e.g., Vicuna-13B could beat ChatGPT on 66 over 80 tested queries\nwith ChatGPT as an evaluator. To address this issue, we propose a calibration\nframework with three simple yet effective strategies: 1) Multiple Evidence\nCalibration, which requires the evaluator model to generate multiple evaluation\nevidence before assigning ratings; 2) Balanced Position Calibration, which\naggregates results across various orders to determine the final score; 3)\nHuman-in-the-Loop Calibration, which introduces a balanced position diversity\nentropy to measure the difficulty of each example and seeks human assistance\nwhen needed. We also manually annotate the \"win/tie/lose\" outcomes of responses\nfrom ChatGPT and Vicuna-13B in the Vicuna Benchmark's question prompt, and\nextensive experiments demonstrate that our approach successfully mitigates\nevaluation bias, resulting in closer alignment with human judgments. We release\nour code and human annotation at \\url{https://github.com/i-Eval/FairEval} to\nfacilitate future research.\n","authors":["Peiyi Wang","Lei Li","Liang Chen","Zefan Cai","Dawei Zhu","Binghuai Lin","Yunbo Cao","Qi Liu","Tianyu Liu","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2305.17926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15980v1","updated":"2023-08-30T12:09:18Z","published":"2023-08-30T12:09:18Z","title":"Adaptive Multi-Modalities Fusion in Sequential Recommendation Systems","summary":" In sequential recommendation, multi-modal information (e.g., text or image)\ncan provide a more comprehensive view of an item's profile. The optimal stage\n(early or late) to fuse modality features into item representations is still\ndebated. We propose a graph-based approach (named MMSR) to fuse modality\nfeatures in an adaptive order, enabling each modality to prioritize either its\ninherent sequential nature or its interplay with other modalities. MMSR\nrepresents each user's history as a graph, where the modality features of each\nitem in a user's history sequence are denoted by cross-linked nodes. The edges\nbetween homogeneous nodes represent intra-modality sequential relationships,\nand the ones between heterogeneous nodes represent inter-modality\ninterdependence relationships. During graph propagation, MMSR incorporates dual\nattention, differentiating homogeneous and heterogeneous neighbors. To\nadaptively assign nodes with distinct fusion orders, MMSR allows each node's\nrepresentation to be asynchronously updated through an update gate. In\nscenarios where modalities exhibit stronger sequential relationships, the\nupdate gate prioritizes updates among homogeneous nodes. Conversely, when the\ninterdependent relationships between modalities are more pronounced, the update\ngate prioritizes updates among heterogeneous nodes. Consequently, MMSR\nestablishes a fusion order that spans a spectrum from early to late modality\nfusion. In experiments across six datasets, MMSR consistently outperforms\nstate-of-the-art models, and our graph propagation methods surpass other graph\nneural networks. Additionally, MMSR naturally manages missing modalities.\n","authors":["Hengchang Hu","Wei Guo","Yong Liu","Min-Yen Kan"],"pdf_url":"https://arxiv.org/pdf/2308.15980v1.pdf","comment":"CIKM'2023"},{"id":"http://arxiv.org/abs/2308.15968v1","updated":"2023-08-30T11:45:35Z","published":"2023-08-30T11:45:35Z","title":"Denoising Attention for Query-aware User Modeling in Personalized Search","summary":" The personalization of search results has gained increasing attention in the\npast few years, thanks to the development of Neural Networks-based approaches\nfor Information Retrieval and the importance of personalization in many search\nscenarios. Recent works have proposed to build user models at query time by\nleveraging the Attention mechanism, which allows weighing the contribution of\nthe user-related information w.r.t. the current query. This approach allows\ntaking into account the diversity of the user's interests by giving more\nimportance to those related to the current search performed by the user.\n In this paper, we first discuss some shortcomings of the standard Attention\nformulation when employed for personalization. In particular, we focus on\nissues related to its normalization mechanism and its inability to entirely\nfilter out noisy user-related information. Then, we introduce the Denoising\nAttention mechanism: an Attention variant that directly tackles the above\nshortcomings by adopting a robust normalization scheme and introducing a\nfiltering mechanism. The reported experimental evaluation shows the benefits of\nthe proposed approach over other Attention-based variants.\n","authors":["Elias Bassani","Pranav Kasela","Gabriella Pasi"],"pdf_url":"https://arxiv.org/pdf/2308.15968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15823v1","updated":"2023-08-30T07:53:27Z","published":"2023-08-30T07:53:27Z","title":"DRGame: Diversified Recommendation for Multi-category Video Games with\n Balanced Implicit Preferences","summary":" The growing popularity of subscription services in video game consumption has\nemphasized the importance of offering diversified recommendations. Providing\nusers with a diverse range of games is essential for ensuring continued\nengagement and fostering long-term subscriptions. However, existing\nrecommendation models face challenges in effectively handling highly imbalanced\nimplicit feedback in gaming interactions. Additionally, they struggle to take\ninto account the distinctive characteristics of multiple categories and the\nlatent user interests associated with these categories. In response to these\nchallenges, we propose a novel framework, named DRGame, to obtain diversified\nrecommendation. It is centered on multi-category video games, consisting of two\n{components}: Balance-driven Implicit Preferences Learning for data\npre-processing and Clustering-based Diversified Recommendation {Module} for\nfinal prediction. The first module aims to achieve a balanced representation of\nimplicit feedback in game time, thereby discovering a comprehensive view of\nplayer interests across different categories. The second module adopts\ncategory-aware representation learning to cluster and select players and games\nbased on balanced implicit preferences, and then employs asymmetric neighbor\naggregation to achieve diversified recommendations. Experimental results on a\nreal-world dataset demonstrate the superiority of our proposed method over\nexisting approaches in terms of game diversity recommendations.\n","authors":["Kangzhe Liu","Jianghong Ma","Shanshan Feng","Haijun Zhang","Zhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.15823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15813v1","updated":"2023-08-30T07:36:12Z","published":"2023-08-30T07:36:12Z","title":"Knowledge-grounded Natural Language Recommendation Explanation","summary":" Explanations accompanied by a recommendation can assist users in\nunderstanding the decision made by recommendation systems, which in turn\nincreases a user's confidence and trust in the system. Recently, research has\nfocused on generating natural language explanations in a human-readable format.\nThus far, the proposed approaches leverage item reviews written by users, which\nare often subjective, sparse in language, and unable to account for new items\nthat have not been purchased or reviewed before. Instead, we aim to generate\nfact-grounded recommendation explanations that are objectively described with\nitem features while implicitly considering a user's preferences, based on the\nuser's purchase history. To achieve this, we propose a knowledge graph (KG)\napproach to natural language explainable recommendation. Our approach draws on\nuser-item features through a novel collaborative filtering-based KG\nrepresentation to produce fact-grounded, personalized explanations, while\njointly learning user-item representations for recommendation scoring.\nExperimental results show that our approach consistently outperforms previous\nstate-of-the-art models on natural language explainable recommendation.\n","authors":["Anthony Colas","Jun Araki","Zhengyu Zhou","Bingqing Wang","Zhe Feng"],"pdf_url":"https://arxiv.org/pdf/2308.15813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06566v3","updated":"2023-08-30T06:46:43Z","published":"2023-05-11T04:51:21Z","title":"ONCE: Boosting Content-based Recommendation with Both Open- and\n Closed-source Large Language Models","summary":" Personalized content-based recommender systems have become indispensable\ntools for users to navigate through the vast amount of content available on\nplatforms like daily news websites and book recommendation services. However,\nexisting recommenders face significant challenges in understanding the content\nof items. Large language models (LLMs), which possess deep semantic\ncomprehension and extensive knowledge from pretraining, have proven to be\neffective in various natural language processing tasks. In this study, we\nexplore the potential of leveraging both open- and closed-source LLMs to\nenhance content-based recommendation. With open-source LLMs, we utilize their\ndeep layers as content encoders, enriching the representation of content at the\nembedding level. For closed-source LLMs, we employ prompting techniques to\nenrich the training data at the token level. Through comprehensive experiments,\nwe demonstrate the high effectiveness of both types of LLMs and show the\nsynergistic relationship between them. Notably, we observed a significant\nrelative improvement of up to 19.32% compared to existing state-of-the-art\nrecommendation models. These findings highlight the immense potential of both\nopen- and closed-source of LLMs in enhancing content-based recommendation\nsystems. We will make our code and LLM-generated data available for other\nresearchers to reproduce our results.\n","authors":["Qijiong Liu","Nuo Chen","Tetsuya Sakai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2305.06566v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10028v2","updated":"2023-08-30T06:33:32Z","published":"2023-08-19T14:25:59Z","title":"Voucher Abuse Detection with Prompt-based Fine-tuning on Graph Neural\n Networks","summary":" Voucher abuse detection is an important anomaly detection problem in\nE-commerce. While many GNN-based solutions have emerged, the supervised\nparadigm depends on a large quantity of labeled data. A popular alternative is\nto adopt self-supervised pre-training using label-free data, and further\nfine-tune on a downstream task with limited labels. Nevertheless, the\n\"pre-train, fine-tune\" paradigm is often plagued by the objective gap between\npre-training and downstream tasks. Hence, we propose VPGNN, a prompt-based\nfine-tuning framework on GNNs for voucher abuse detection. We design a novel\ngraph prompting function to reformulate the downstream task into a similar\ntemplate as the pretext task in pre-training, thereby narrowing the objective\ngap. Extensive experiments on both proprietary and public datasets demonstrate\nthe strength of VPGNN in both few-shot and semi-supervised scenarios. Moreover,\nan online deployment of VPGNN in a production environment shows a 23.4%\nimprovement over two existing deployed models.\n","authors":["Zhihao Wen","Yuan Fang","Yihan Liu","Yang Guo","Shuji Hao"],"pdf_url":"https://arxiv.org/pdf/2308.10028v2.pdf","comment":"7 pages, Accepted by CIKM23 Applied Research Track"},{"id":"http://arxiv.org/abs/2308.15703v1","updated":"2023-08-30T01:56:57Z","published":"2023-08-30T01:56:57Z","title":"Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling\n Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate\n Prediction","summary":" Spatial-temporal information has been proven to be of great significance for\nclick-through rate prediction tasks in online Location-Based Services (LBS),\nespecially in mainstream food ordering platforms such as DoorDash, Uber Eats,\nMeituan, and Ele.me. Modeling user spatial-temporal preferences with sequential\nbehavior data has become a hot topic in recommendation systems and online\nadvertising. However, most of existing methods either lack the representation\nof rich spatial-temporal information or only handle user behaviors with limited\nlength, e.g. 100. In this paper, we tackle these problems by designing a new\nspatial-temporal modeling paradigm named Fragment and Integrate Network (FIN).\nFIN consists of two networks: (i) Fragment Network (FN) extracts Multiple\nSub-Sequences (MSS) from lifelong sequential behavior data, and captures the\nspecific spatial-temporal representation by modeling each MSS respectively.\nHere both a simplified attention and a complicated attention are adopted to\nbalance the performance gain and resource consumption. (ii) Integrate Network\n(IN) builds a new integrated sequence by utilizing spatial-temporal interaction\non MSS and captures the comprehensive spatial-temporal representation by\nmodeling the integrated sequence with a complicated attention. Both public\ndatasets and production datasets have demonstrated the accuracy and scalability\nof FIN. Since 2022, FIN has been fully deployed in the recommendation\nadvertising system of Ele.me, one of the most popular online food ordering\nplatforms in China, obtaining 5.7% improvement on Click-Through Rate (CTR) and\n7.3% increase on Revenue Per Mille (RPM).\n","authors":["Jun Li","Jingjian Wang","Hongwei Wang","Xing Deng","Jielong Chen","Bing Cao","Zekun Wang","Guanjie Xu","Ge Zhang","Feng Shi","Hualei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15703v1.pdf","comment":"Accepted by CIKM 2023 Applied Research Paper"},{"id":"http://arxiv.org/abs/2308.15701v1","updated":"2023-08-30T01:54:48Z","published":"2023-08-30T01:54:48Z","title":"A Survey on Multi-Behavior Sequential Recommendation","summary":" Recommender systems is set up to address the issue of information overload in\ntraditional information retrieval systems, which is focused on recommending\ninformation that is of most interest to users from massive information.\nGenerally, there is a sequential nature and heterogeneity to the behavior of a\nperson interacting with a system, leading to the proposal of multi-behavior\nsequential recommendation (MBSR). MBSR is a relatively new and worthy direction\nfor in-depth research, which can achieve state-of-the-art recommendation\nthrough suitable modeling, and some related works have been proposed. This\nsurvey aims to shed light on the MBSR problem. Firstly, we introduce MBSR in\ndetail, including its problem definition, application scenarios and challenges\nfaced. Secondly, we detail the classification of MBSR, including\nneighborhood-based methods, matrix factorization-based methods and deep\nlearning-based methods, where we further classify the deep learning-based\nmethods into different learning architectures based on RNN, GNN, Transformer,\nand generic architectures as well as architectures that integrate hybrid\ntechniques. In each method, we present related works based on the data\nperspective and the modeling perspective, as well as analyze the strengths,\nweaknesses and features of these works. Finally, we discuss some promising\nfuture research directions to address the challenges and improve the current\nstatus of MBSR.\n","authors":["Xiaoqing Chen","Zhitao Li","Weike Pan","Zhong Ming"],"pdf_url":"https://arxiv.org/pdf/2308.15701v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.15470v2","updated":"2023-08-30T17:59:52Z","published":"2023-08-29T17:50:27Z","title":"Policy composition in reinforcement learning via multi-objective policy\n optimization","summary":" We enable reinforcement learning agents to learn successful behavior policies\nby utilizing relevant pre-existing teacher policies. The teacher policies are\nintroduced as objectives, in addition to the task objective, in a\nmulti-objective policy optimization setting. Using the Multi-Objective Maximum\na Posteriori Policy Optimization algorithm (Abdolmaleki et al. 2020), we show\nthat teacher policies can help speed up learning, particularly in the absence\nof shaping rewards. In two domains with continuous observation and action\nspaces, our agents successfully compose teacher policies in sequence and in\nparallel, and are also able to further extend the policies of the teachers in\norder to solve the task.\n Depending on the specified combination of task and teacher(s), teacher(s) may\nnaturally act to limit the final performance of an agent. The extent to which\nagents are required to adhere to teacher policies are determined by\nhyperparameters which determine both the effect of teachers on learning speed\nand the eventual performance of the agent on the task. In the humanoid domain\n(Tassa et al. 2018), we also equip agents with the ability to control the\nselection of teachers. With this ability, agents are able to meaningfully\ncompose from the teacher policies to achieve a superior task reward on the walk\ntask than in cases without access to the teacher policies. We show the\nresemblance of composed task policies with the corresponding teacher policies\nthrough videos.\n","authors":["Shruti Mishra","Ankit Anand","Jordan Hoffmann","Nicolas Heess","Martin Riedmiller","Abbas Abdolmaleki","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2308.15470v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16157v1","updated":"2023-08-30T17:22:11Z","published":"2023-08-30T17:22:11Z","title":"Algebraic, Topological, and Mereological Foundations of Existential\n Granules","summary":" In this research, new concepts of existential granules that determine\nthemselves are invented, and are characterized from algebraic, topological, and\nmereological perspectives. Existential granules are those that determine\nthemselves initially, and interact with their environment subsequently.\nExamples of the concept, such as those of granular balls, though inadequately\ndefined, algorithmically established, and insufficiently theorized in earlier\nworks by others, are already used in applications of rough sets and soft\ncomputing. It is shown that they fit into multiple theoretical frameworks\n(axiomatic, adaptive, and others) of granular computing. The characterization\nis intended for algorithm development, application to classification problems\nand possible mathematical foundations of generalizations of the approach.\nAdditionally, many open problems are posed and directions provided.\n","authors":["Mani A"],"pdf_url":"https://arxiv.org/pdf/2308.16157v1.pdf","comment":"15 Pages"},{"id":"http://arxiv.org/abs/2006.08426v4","updated":"2023-08-30T17:19:36Z","published":"2020-06-15T14:26:56Z","title":"Walking in the Shadow: A New Perspective on Descent Directions for\n Constrained Minimization","summary":" Descent directions such as movement towards Descent directions, including\nmovement towards Frank-Wolfe vertices, away-steps, in-face away-steps and\npairwise directions, have been an important design consideration in conditional\ngradient descent (CGD) variants. In this work, we attempt to demystify the\nimpact of the movement in these directions towards attaining constrained\nminimizers. The optimal local direction of descent is the directional\nderivative (i.e., shadow) of the projection of the negative gradient. We show\nthat this direction is the best away-step possible, and the continuous-time\ndynamics of moving in the shadow is equivalent to the dynamics of projected\ngradient descent (PGD), although it's non-trivial to discretize. We also show\nthat Frank-Wolfe (FW) vertices correspond to projecting onto the polytope using\nan \"infinite\" step in the direction of the negative gradient, thus providing a\nnew perspective on these steps. We combine these insights into a novel\nShadow-CG method that uses FW and shadow steps, while enjoying linear\nconvergence, with a rate that depends on the number of breakpoints in its\nprojection curve, rather than the pyramidal width. We provide a linear bound on\nthe number of breakpoints for simple polytopes and present scaling-invariant\nupper bounds for general polytopes based on the number of facets. We exemplify\nthe benefit of using Shadow-CG computationally for various applications, while\nraising an open question about tightening the bound on the number of\nbreakpoints for general polytopes.\n","authors":["Hassan Mortagy","Swati Gupta","Sebastian Pokutta"],"pdf_url":"https://arxiv.org/pdf/2006.08426v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16150v1","updated":"2023-08-30T17:16:02Z","published":"2023-08-30T17:16:02Z","title":"Modality Cycles with Masked Conditional Diffusion for Unsupervised\n Anomaly Segmentation in MRI","summary":" Unsupervised anomaly segmentation aims to detect patterns that are distinct\nfrom any patterns processed during training, commonly called abnormal or\nout-of-distribution patterns, without providing any associated manual\nsegmentations. Since anomalies during deployment can lead to model failure,\ndetecting the anomaly can enhance the reliability of models, which is valuable\nin high-risk domains like medical imaging. This paper introduces Masked\nModality Cycles with Conditional Diffusion (MMCCD), a method that enables\nsegmentation of anomalies across diverse patterns in multimodal MRI. The method\nis based on two fundamental ideas. First, we propose the use of cyclic modality\ntranslation as a mechanism for enabling abnormality detection.\nImage-translation models learn tissue-specific modality mappings, which are\ncharacteristic of tissue physiology. Thus, these learned mappings fail to\ntranslate tissues or image patterns that have never been encountered during\ntraining, and the error enables their segmentation. Furthermore, we combine\nimage translation with a masked conditional diffusion model, which attempts to\n`imagine' what tissue exists under a masked area, further exposing unknown\npatterns as the generative model fails to recreate them. We evaluate our method\non a proxy task by training on healthy-looking slices of BraTS2021\nmulti-modality MRIs and testing on slices with tumors. We show that our method\ncompares favorably to previous unsupervised approaches based on image\nreconstruction and denoising with autoencoders and diffusion models.\n","authors":["Ziyun Liang","Harry Anthony","Felix Wagner","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2308.16150v1.pdf","comment":"Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI\n 2023"},{"id":"http://arxiv.org/abs/2308.16149v1","updated":"2023-08-30T17:07:17Z","published":"2023-08-30T17:07:17Z","title":"Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open\n Generative Large Language Models","summary":" We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric\nfoundation and instruction-tuned open generative large language models (LLMs).\nThe models are based on the GPT-3 decoder-only architecture and are pretrained\non a mixture of Arabic and English texts, including source code in various\nprogramming languages. With 13 billion parameters, they demonstrate better\nknowledge and reasoning capabilities in Arabic than any existing open Arabic\nand multilingual models by a sizable margin, based on extensive evaluation.\nMoreover, the models are competitive in English compared to English-centric\nopen models of similar size, despite being trained on much less English data.\nWe provide a detailed description of the training, the tuning, the safety\nalignment, and the evaluation of the models. We release two open versions of\nthe model -- the foundation Jais model, and an instruction-tuned Jais-chat\nvariant -- with the aim of promoting research on Arabic LLMs. Available at\nhttps://huggingface.co/inception-mbzuai/jais-13b-chat\n","authors":["Neha Sengupta","Sunil Kumar Sahu","Bokang Jia","Satheesh Katipomu","Haonan Li","Fajri Koto","Osama Mohammed Afzal","Samta Kamboj","Onkar Pandit","Rahul Pal","Lalit Pradhan","Zain Muhammad Mujahid","Massa Baali","Alham Fikri Aji","Zhengzhong Liu","Andy Hock","Andrew Feldman","Jonathan Lee","Andrew Jackson","Preslav Nakov","Timothy Baldwin","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2308.16149v1.pdf","comment":"Arabic-centric, foundation model, large-language model, LLM,\n generative model, instruction-tuned, Jais, Jais-chat"},{"id":"http://arxiv.org/abs/2308.01981v2","updated":"2023-08-30T17:02:55Z","published":"2023-08-03T18:28:50Z","title":"CartiMorph: a framework for automated knee articular cartilage\n morphometrics","summary":" We introduce CartiMorph, a framework for automated knee articular cartilage\nmorphometrics. It takes an image as input and generates quantitative metrics\nfor cartilage subregions, including the percentage of full-thickness cartilage\nloss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the\npower of deep learning models for hierarchical image feature representation.\nDeep learning models were trained and validated for tissue segmentation,\ntemplate construction, and template-to-image registration. We established\nmethods for surface-normal-based cartilage thickness mapping, FCL estimation,\nand rule-based cartilage parcellation. Our cartilage thickness map showed less\nerror in thin and peripheral regions. We evaluated the effectiveness of the\nadopted segmentation model by comparing the quantitative metrics obtained from\nmodel segmentation and those from manual segmentation. The root-mean-squared\ndeviation of the FCL measurements was less than 8%, and strong correlations\nwere observed for the mean thickness (Pearson's correlation coefficient $\\rho\n\\in [0.82,0.97]$), surface area ($\\rho \\in [0.82,0.98]$) and volume ($\\rho \\in\n[0.89,0.98]$) measurements. We compared our FCL measurements with those from a\nprevious study and found that our measurements deviated less from the ground\ntruths. We observed superior performance of the proposed rule-based cartilage\nparcellation method compared with the atlas-based approach. CartiMorph has the\npotential to promote imaging biomarkers discovery for knee osteoarthritis.\n","authors":["Yongcheng Yao","Junru Zhong","Liping Zhang","Sheheryar Khan","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01981v2.pdf","comment":"To be published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.16139v1","updated":"2023-08-30T16:52:20Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including, voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising of computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrich Ferndinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Kuestner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Loeffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Eduardo A. Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Andreas Nuernberger","Joao Pedrosa","Carlos Ferreira","Guilherme Aresta","Antonio Cunha","Aurelio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Roehrig","Frank Hoelzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","Andre Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hoerst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dzenan Zukic","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.14815v2","updated":"2023-08-30T16:31:53Z","published":"2023-08-28T18:06:24Z","title":"Distributionally Robust Statistical Verification with Imprecise Neural\n Networks","summary":" A particularly challenging problem in AI safety is providing guarantees on\nthe behavior of high-dimensional autonomous systems. Verification approaches\ncentered around reachability analysis fail to scale, and purely statistical\napproaches are constrained by the distributional assumptions about the sampling\nprocess. Instead, we pose a distributionally robust version of the statistical\nverification problem for black-box systems, where our performance guarantees\nhold over a large family of distributions. This paper proposes a novel approach\nbased on a combination of active learning, uncertainty quantification, and\nneural network verification. A central piece of our approach is an ensemble\ntechnique called Imprecise Neural Networks, which provides the uncertainty to\nguide active learning. The active learning uses an exhaustive neural-network\nverification tool Sherlock to collect samples. An evaluation on multiple\nphysical simulators in the openAI gym Mujoco environments with\nreinforcement-learned controllers demonstrates that our approach can provide\nuseful and scalable guarantees for high-dimensional systems.\n","authors":["Souradeep Dutta","Michele Caprio","Vivian Lin","Matthew Cleaveland","Kuk Jin Jang","Ivan Ruchkin","Oleg Sokolsky","Insup Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14683v2","updated":"2023-08-30T16:30:29Z","published":"2023-05-24T03:44:50Z","title":"On progressive sharpening, flat minima and generalisation","summary":" We present a new approach to understanding the relationship between loss\ncurvature and input-output model behaviour in deep learning. Specifically, we\nuse existing empirical analyses of the spectrum of deep network loss Hessians\nto ground an ansatz tying together the loss Hessian and the input-output\nJacobian of a deep neural network over training samples throughout training. We\nthen prove a series of theoretical results which quantify the degree to which\nthe input-output Jacobian of a model approximates its Lipschitz norm over a\ndata distribution, and deduce a novel generalisation bound in terms of the\nempirical Jacobian. We use our ansatz, together with our theoretical results,\nto give a new account of the recently observed progressive sharpening\nphenomenon, as well as the generalisation properties of flat minima.\nExperimental evidence is provided to validate our claims.\n","authors":["Lachlan Ewen MacDonald","Jack Valmadre","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2305.14683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16122v1","updated":"2023-08-30T16:21:02Z","published":"2023-08-30T16:21:02Z","title":"Spatial Graph Coarsening: Weather and Weekday Prediction with London's\n Bike-Sharing Service using GNN","summary":" This study introduced the use of Graph Neural Network (GNN) for predicting\nthe weather and weekday of a day in London, from the dataset of Santander\nCycles bike-sharing system as a graph classification task. The proposed GNN\nmodels newly introduced (i) a concatenation operator of graph features with\ntrained node embeddings and (ii) a graph coarsening operator based on\ngeographical contiguity, namely \"Spatial Graph Coarsening\". With the node\nfeatures of land-use characteristics and number of households around the bike\nstations and graph features of temperatures in the city, our proposed models\noutperformed the baseline model in cross-entropy loss and accuracy of the\nvalidation dataset.\n","authors":["Yuta Sato","Pak Hei Lam","Shruti Gupta","Fareesah Hussain"],"pdf_url":"https://arxiv.org/pdf/2308.16122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16113v1","updated":"2023-08-30T16:14:20Z","published":"2023-08-30T16:14:20Z","title":"survex: an R package for explaining machine learning survival models","summary":" Due to their flexibility and superior performance, machine learning models\nfrequently complement and outperform traditional statistical survival models.\nHowever, their widespread adoption is hindered by a lack of user-friendly tools\nto explain their internal operations and prediction rationales. To tackle this\nissue, we introduce the survex R package, which provides a cohesive framework\nfor explaining any survival model by applying explainable artificial\nintelligence techniques. The capabilities of the proposed software encompass\nunderstanding and diagnosing survival models, which can lead to their\nimprovement. By revealing insights into the decision-making process, such as\nvariable effects and importances, survex enables the assessment of model\nreliability and the detection of biases. Thus, transparency and responsibility\nmay be promoted in sensitive areas, such as biomedical research and healthcare\napplications.\n","authors":["Mikołaj Spytek","Mateusz Krzyziński","Sophie Hanna Langbein","Hubert Baniecki","Marvin N. Wright","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2308.16113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11582v2","updated":"2023-08-30T16:06:27Z","published":"2023-05-19T10:43:57Z","title":"What You Hear Is What You See: Audio Quality Metrics From Image Quality\n Metrics","summary":" In this study, we investigate the feasibility of utilizing state-of-the-art\nimage perceptual metrics for evaluating audio signals by representing them as\nspectrograms. The encouraging outcome of the proposed approach is based on the\nsimilarity between the neural mechanisms in the auditory and visual pathways.\nFurthermore, we customise one of the metrics which has a psychoacoustically\nplausible architecture to account for the peculiarities of sound signals. We\nevaluate the effectiveness of our proposed metric and several baseline metrics\nusing a music dataset, with promising results in terms of the correlation\nbetween the metrics and the perceived quality of audio as rated by human\nevaluators.\n","authors":["Tashi Namgyal","Alexander Hepburn","Raul Santos-Rodriguez","Valero Laparra","Jesus Malo"],"pdf_url":"https://arxiv.org/pdf/2305.11582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07001v2","updated":"2023-08-30T15:58:45Z","published":"2023-06-12T10:10:57Z","title":"Cancellation-Free Regret Bounds for Lagrangian Approaches in Constrained\n Markov Decision Processes","summary":" Constrained Markov Decision Processes (CMDPs) are one of the common ways to\nmodel safe reinforcement learning problems, where constraint functions model\nthe safety objectives. Lagrangian-based dual or primal-dual algorithms provide\nefficient methods for learning in CMDPs. For these algorithms, the currently\nknown regret bounds in the finite-horizon setting allow for a \"cancellation of\nerrors\"; one can compensate for a constraint violation in one episode with a\nstrict constraint satisfaction in another. However, we do not consider such a\nbehavior safe in practical applications. In this paper, we overcome this\nweakness by proposing a novel model-based dual algorithm OptAug-CMDP for\ntabular finite-horizon CMDPs. Our algorithm is motivated by the augmented\nLagrangian method and can be performed efficiently. We show that during $K$\nepisodes of exploring the CMDP, our algorithm obtains a regret of\n$\\tilde{O}(\\sqrt{K})$ for both the objective and the constraint violation.\nUnlike existing Lagrangian approaches, our algorithm achieves this regret\nwithout the need for the cancellation of errors.\n","authors":["Adrian Müller","Pragnya Alatur","Giorgia Ramponi","Niao He"],"pdf_url":"https://arxiv.org/pdf/2306.07001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16105v1","updated":"2023-08-30T15:54:06Z","published":"2023-08-30T15:54:06Z","title":"Advanced Deep Regression Models for Forecasting Time Series Oil\n Production","summary":" Global oil demand is rapidly increasing and is expected to reach 106.3\nmillion barrels per day by 2040. Thus, it is vital for hydrocarbon extraction\nindustries to forecast their production to optimize their operations and avoid\nlosses. Big companies have realized that exploiting the power of deep learning\n(DL) and the massive amount of data from various oil wells for this purpose can\nsave a lot of operational costs and reduce unwanted environmental impacts. In\nthis direction, researchers have proposed models using conventional machine\nlearning (ML) techniques for oil production forecasting. However, these\ntechniques are inappropriate for this problem as they can not capture\nhistorical patterns found in time series data, resulting in inaccurate\npredictions. This research aims to overcome these issues by developing advanced\ndata-driven regression models using sequential convolutions and long short-term\nmemory (LSTM) units. Exhaustive analyses are conducted to select the optimal\nsequence length, model hyperparameters, and cross-well dataset formation to\nbuild highly generalized robust models. A comprehensive experimental study on\nVolve oilfield data validates the proposed models. It reveals that the\nLSTM-based sequence learning model can predict oil production better than the\n1-D convolutional neural network (CNN) with mean absolute error (MAE) and R2\nscore of 111.16 and 0.98, respectively. It is also found that the LSTM-based\nmodel performs better than all the existing state-of-the-art solutions and\nachieves a 37% improvement compared to a standard linear regression, which is\nconsidered the baseline model in this work.\n","authors":["Siavash Hosseini","Thangarajah Akilan"],"pdf_url":"https://arxiv.org/pdf/2308.16105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12767v2","updated":"2023-08-30T15:52:34Z","published":"2023-08-24T13:14:49Z","title":"On the Consistency of Average Embeddings for Item Recommendation","summary":" A prevalent practice in recommender systems consists in averaging item\nembeddings to represent users or higher-level concepts in the same embedding\nspace. This paper investigates the relevance of such a practice. For this\npurpose, we propose an expected precision score, designed to measure the\nconsistency of an average embedding relative to the items used for its\nconstruction. We subsequently analyze the mathematical expression of this score\nin a theoretical setting with specific assumptions, as well as its empirical\nbehavior on real-world data from music streaming services. Our results\nemphasize that real-world averages are less consistent for recommendation,\nwhich paves the way for future research to better align real-world embeddings\nwith assumptions from our theoretical setting.\n","authors":["Walid Bendada","Guillaume Salha-Galvan","Romain Hennequin","Thomas Bouabça","Tristan Cazenave"],"pdf_url":"https://arxiv.org/pdf/2308.12767v2.pdf","comment":"17th ACM Conference on Recommender Systems (RecSys 2023)"},{"id":"http://arxiv.org/abs/2104.10751v3","updated":"2023-08-30T15:51:05Z","published":"2021-04-21T20:31:28Z","title":"Rule Generation for Classification: Scalability, Interpretability, and\n Fairness","summary":" We introduce a new rule-based optimization method for classification with\nconstraints. The proposed method leverages column generation for linear\nprogramming, and hence, is scalable to large datasets. The resulting pricing\nsubproblem is shown to be NP-Hard. We recourse to a decision tree-based\nheuristic and solve a proxy pricing subproblem for acceleration. The method\nreturns a set of rules along with their optimal weights indicating the\nimportance of each rule for learning. We address interpretability and fairness\nby assigning cost coefficients to the rules and introducing additional\nconstraints. In particular, we focus on local interpretability and generalize\nseparation criterion in fairness to multiple sensitive attributes and classes.\nWe test the performance of the proposed methodology on a collection of datasets\nand present a case study to elaborate on its different aspects. The proposed\nrule-based learning method exhibits a good compromise between local\ninterpretability and fairness on the one side, and accuracy on the other side.\n","authors":["Adia C. Lumadjeng","Tabea Röber","M. Hakan Akyüz","Ş. İlker Birbil"],"pdf_url":"https://arxiv.org/pdf/2104.10751v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03975v3","updated":"2023-08-30T15:32:59Z","published":"2021-10-08T08:49:35Z","title":"Tensor train completion: local recovery guarantees via Riemannian\n optimization","summary":" In this work, we estimate the number of randomly selected elements of a\ntensor that with high probability guarantees local convergence of Riemannian\ngradient descent for tensor train completion. We derive a new bound for the\northogonal projections onto the tangent spaces based on the harmonic mean of\nthe unfoldings' singular values and introduce a notion of core coherence for\ntensor trains. We also extend the results to tensor train completion with\nauxiliary subspace information and obtain the corresponding local convergence\nguarantees.\n","authors":["Stanislav Budzinskiy","Nikolai Zamarashkin"],"pdf_url":"https://arxiv.org/pdf/2110.03975v3.pdf","comment":"1 figure added; Accepted version"},{"id":"http://arxiv.org/abs/2308.16089v1","updated":"2023-08-30T15:26:35Z","published":"2023-08-30T15:26:35Z","title":"Application of Zone Method based Machine Learning and Physics-Informed\n Neural Networks in Reheating Furnaces","summary":" Despite the high economic relevance of Foundation Industries, certain\ncomponents like Reheating furnaces within their manufacturing chain are\nenergy-intensive. Notable energy consumption reduction could be obtained by\nreducing the overall heating time in furnaces. Computer-integrated Machine\nLearning (ML) and Artificial Intelligence (AI) powered control systems in\nfurnaces could be enablers in achieving the Net-Zero goals in Foundation\nIndustries for sustainable manufacturing.\n In this work, due to the infeasibility of achieving good quality data in\nscenarios like reheating furnaces, classical Hottel's zone method based\ncomputational model has been used to generate data for ML and Deep Learning\n(DL) based model training via regression. It should be noted that the zone\nmethod provides an elegant way to model the physical phenomenon of Radiative\nHeat Transfer (RHT), the dominating heat transfer mechanism in high-temperature\nprocesses inside heating furnaces. Using this data, an extensive comparison\namong a wide range of state-of-the-art, representative ML and DL methods has\nbeen made against their temperature prediction performances in varying furnace\nenvironments. Owing to their holistic balance among inference times and model\nperformance, DL stands out among its counterparts. To further enhance the\nOut-Of-Distribution (OOD) generalization capability of the trained DL models,\nwe propose a Physics-Informed Neural Network (PINN) by incorporating prior\nphysical knowledge using a set of novel Energy-Balance regularizers. Our setup\nis a generic framework, is geometry-agnostic of the 3D structure of the\nunderlying furnace, and as such could accommodate any standard ML regression\nmodel, to serve as a Digital Twin of the underlying physical processes, for\ntransitioning Foundation Industries towards Industry 4.0.\n","authors":["Ujjal Kr Dutta","Aldo Lipani","Chuan Wang","Yukun Hu"],"pdf_url":"https://arxiv.org/pdf/2308.16089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11197v2","updated":"2023-08-30T15:26:00Z","published":"2023-02-22T08:14:24Z","title":"Quantized Low-Rank Multivariate Regression with Random Dithering","summary":" Low-rank multivariate regression (LRMR) is an important statistical learning\nmodel that combines highly correlated tasks as a multiresponse regression\nproblem with low-rank priori on the coefficient matrix. In this paper, we study\nquantized LRMR, a practical setting where the responses and/or the covariates\nare discretized to finite precision. We focus on the estimation of the\nunderlying coefficient matrix. To make consistent estimator that could achieve\narbitrarily small error possible, we employ uniform quantization with random\ndithering, i.e., we add appropriate random noise to the data before\nquantization. Specifically, uniform dither and triangular dither are used for\nresponses and covariates, respectively. Based on the quantized data, we propose\nthe constrained Lasso and regularized Lasso estimators, and derive the\nnon-asymptotic error bounds. With the aid of dithering, the estimators achieve\nminimax optimal rate, while quantization only slightly worsens the\nmultiplicative factor in the error rate. Moreover, we extend our results to a\nlow-rank regression model with matrix responses. We corroborate and demonstrate\nour theoretical results via simulations on synthetic data or image restoration.\n","authors":["Junren Chen","Yueqi Wang","Michael K. Ng"],"pdf_url":"https://arxiv.org/pdf/2302.11197v2.pdf","comment":"16 pages (Submitted)"},{"id":"http://arxiv.org/abs/2305.09438v3","updated":"2023-08-30T14:56:16Z","published":"2023-05-16T13:50:24Z","title":"MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with\n Transformers","summary":" Message Passing Interface (MPI) plays a crucial role in distributed memory\nparallelization across multiple nodes. However, parallelizing MPI code\nmanually, and specifically, performing domain decomposition, is a challenging,\nerror-prone task. In this paper, we address this problem by developing\nMPI-RICAL, a novel data-driven, programming-assistance tool that assists\nprogrammers in writing domain decomposition based distributed memory\nparallelization code. Specifically, we train a supervised language model to\nsuggest MPI functions and their proper locations in the code on the fly. We\nalso introduce MPICodeCorpus, the first publicly available corpus of MPI-based\nparallel programs that is created by mining more than 15,000 open-source\nrepositories on GitHub. Experimental results have been done on MPICodeCorpus\nand more importantly, on a compiled benchmark of MPI-based parallel programs\nfor numerical computations that represent real-world scientific applications.\nMPI-RICAL achieves F1 scores between 0.87-0.91 on these programs, demonstrating\nits accuracy in suggesting correct MPI functions at appropriate code\nlocations.. The source code used in this work, as well as other relevant\nsources, are available at:\nhttps://github.com/Scientific-Computing-Lab-NRCN/MPI-rical\n","authors":["Nadav Schneider","Tal Kadosh","Niranjan Hasabnis","Timothy Mattson","Yuval Pinter","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2305.09438v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16067v1","updated":"2023-08-30T14:44:04Z","published":"2023-08-30T14:44:04Z","title":"Consensus of state of the art mortality prediction models: From\n all-cause mortality to sudden death prediction","summary":" Worldwide, many millions of people die suddenly and unexpectedly each year,\neither with or without a prior history of cardiovascular disease. Such events\nare sparse (once in a lifetime), many victims will not have had prior\ninvestigations for cardiac disease and many different definitions of sudden\ndeath exist. Accordingly, sudden death is hard to predict.\n This analysis used NHS Electronic Health Records (EHRs) for people aged\n$\\geq$50 years living in the Greater Glasgow and Clyde (GG\\&C) region in 2010\n(n = 380,000) to try to overcome these challenges. We investigated whether\nmedical history, blood tests, prescription of medicines, and hospitalisations\nmight, in combination, predict a heightened risk of sudden death.\n We compared the performance of models trained to predict either sudden death\nor all-cause mortality. We built six models for each outcome of interest: three\ntaken from state-of-the-art research (BEHRT, Deepr and Deep Patient), and three\nof our own creation. We trained these using two different data representations:\na language-based representation, and a sparse temporal matrix.\n We used global interpretability to understand the most important features of\neach model, and compare how much agreement there was amongst models using Rank\nBiased Overlap. It is challenging to account for correlated variables without\nincreasing the complexity of the interpretability technique. We overcame this\nby clustering features into groups and comparing the most important groups for\neach model. We found the agreement between models to be much higher when\naccounting for correlated variables.\n Our analysis emphasises the challenge of predicting sudden death and\nemphasises the need for better understanding and interpretation of machine\nlearning models applied to healthcare applications.\n","authors":["Dr Yola Jones","Dr Fani Deligianni","Dr Jeff Dalton","Dr Pierpaolo Pellicori","Professor John G F Cleland"],"pdf_url":"https://arxiv.org/pdf/2308.16067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08491v3","updated":"2023-08-30T14:43:46Z","published":"2023-01-20T09:36:42Z","title":"Modeling Moral Choices in Social Dilemmas with Multi-Agent Reinforcement\n Learning","summary":" Practical uses of Artificial Intelligence (AI) in the real world have\ndemonstrated the importance of embedding moral choices into intelligent agents.\nThey have also highlighted that defining top-down ethical constraints on AI\naccording to any one type of morality is extremely challenging and can pose\nrisks. A bottom-up learning approach may be more appropriate for studying and\ndeveloping ethical behavior in AI agents. In particular, we believe that an\ninteresting and insightful starting point is the analysis of emergent behavior\nof Reinforcement Learning (RL) agents that act according to a predefined set of\nmoral rewards in social dilemmas.\n In this work, we present a systematic analysis of the choices made by\nintrinsically-motivated RL agents whose rewards are based on moral theories. We\naim to design reward structures that are simplified yet representative of a set\nof key ethical systems. Therefore, we first define moral reward functions that\ndistinguish between consequence- and norm-based agents, between morality based\non societal norms or internal virtues, and between single- and mixed-virtue\n(e.g., multi-objective) methodologies. Then, we evaluate our approach by\nmodeling repeated dyadic interactions between learning moral agents in three\niterated social dilemma games (Prisoner's Dilemma, Volunteer's Dilemma and Stag\nHunt). We analyze the impact of different types of morality on the emergence of\ncooperation, defection or exploitation, and the corresponding social outcomes.\nFinally, we discuss the implications of these findings for the development of\nmoral agents in artificial and mixed human-AI societies.\n","authors":["Elizaveta Tennant","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2301.08491v3.pdf","comment":"Accepted at IJCAI 2023 (32nd International Joint Conference on\n Artificial Intelligence - Macao, S.A.R.)"},{"id":"http://arxiv.org/abs/2107.07752v2","updated":"2023-08-30T14:39:24Z","published":"2021-07-16T08:07:22Z","title":"NeXtQSM -- A complete deep learning pipeline for data-consistent\n quantitative susceptibility mapping trained with hybrid data","summary":" Deep learning based Quantitative Susceptibility Mapping (QSM) has shown great\npotential in recent years, obtaining similar results to established\nnon-learning approaches. Many current deep learning approaches are not data\nconsistent, require in vivo training data or solve the QSM problem in\nconsecutive steps resulting in the propagation of errors. Here we aim to\novercome these limitations and developed a framework to solve the QSM\nprocessing steps jointly. We developed a new hybrid training data generation\nmethod that enables the end-to-end training for solving background field\ncorrection and dipole inversion in a data-consistent fashion using a\nvariational network that combines the QSM model term and a learned regularizer.\nWe demonstrate that NeXtQSM overcomes the limitations of previous deep learning\nmethods. NeXtQSM offers a new deep learning based pipeline for computing\nquantitative susceptibility maps that integrates each processing step into the\ntraining and provides results that are robust and fast.\n","authors":["Francesco Cognolato","Kieran O'Brien","Jin Jin","Simon Robinson","Frederik B. Laun","Markus Barth","Steffen Bollmann"],"pdf_url":"https://arxiv.org/pdf/2107.07752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16061v1","updated":"2023-08-30T14:36:25Z","published":"2023-08-30T14:36:25Z","title":"Conti Inc.: Understanding the Internal Discussions of a large\n Ransomware-as-a-Service Operator with Machine Learning","summary":" Ransomware-as-a-service (RaaS) is increasing the scale and complexity of\nransomware attacks. Understanding the internal operations behind RaaS has been\na challenge due to the illegality of such activities. The recent chat leak of\nthe Conti RaaS operator, one of the most infamous ransomware operators on the\ninternational scene, offers a key opportunity to better understand the inner\nworkings of such organizations. This paper analyzes the main topic discussions\nin the Conti chat leak using machine learning techniques such as Natural\nLanguage Processing (NLP) and Latent Dirichlet Allocation (LDA), as well as\nvisualization strategies. Five discussion topics are found: 1) Business, 2)\nTechnical, 3) Internal tasking/Management, 4) Malware, and 5) Customer\nService/Problem Solving. Moreover, the distribution of topics among Conti\nmembers shows that only 4% of individuals have specialized discussions while\nalmost all individuals (96%) are all-rounders, meaning that their discussions\nrevolve around the five topics. The results also indicate that a significant\nproportion of Conti discussions are non-tech related. This study thus\nhighlights that running such large RaaS operations requires a workforce skilled\nbeyond technical abilities, with individuals involved in various tasks, from\nmanagement to customer service or problem solving. The discussion topics also\nshow that the organization behind the Conti RaaS oper5086933ator shares\nsimilarities with a large firm. We conclude that, although RaaS represents an\nexample of specialization in the cybercrime industry, only a few members are\nspecialized in one topic, while the rest runs and coordinates the RaaS\noperation.\n","authors":["Estelle Ruellan","Masarah Paquet-Clouston","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2308.16061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16059v1","updated":"2023-08-30T14:31:24Z","published":"2023-08-30T14:31:24Z","title":"A Parameter-Free Two-Bit Covariance Estimator with Improved Operator\n Norm Error Rate","summary":" A covariance matrix estimator using two bits per entry was recently developed\nby Dirksen, Maly and Rauhut [Annals of Statistics, 50(6), pp. 3538-3562]. The\nestimator achieves near minimax rate for general sub-Gaussian distributions,\nbut also suffers from two downsides: theoretically, there is an essential gap\non operator norm error between their estimator and sample covariance when the\ndiagonal of the covariance matrix is dominated by only a few entries;\npractically, its performance heavily relies on the dithering scale, which needs\nto be tuned according to some unknown parameters. In this work, we propose a\nnew 2-bit covariance matrix estimator that simultaneously addresses both\nissues. Unlike the sign quantizer associated with uniform dither in Dirksen et\nal., we adopt a triangular dither prior to a 2-bit quantizer inspired by the\nmulti-bit uniform quantizer. By employing dithering scales varying across\nentries, our estimator enjoys an improved operator norm error rate that depends\non the effective rank of the underlying covariance matrix rather than the\nambient dimension, thus closing the theoretical gap. Moreover, our proposed\nmethod eliminates the need of any tuning parameter, as the dithering scales are\nentirely determined by the data. Experimental results under Gaussian samples\nare provided to showcase the impressive numerical performance of our estimator.\nRemarkably, by halving the dithering scales, our estimator oftentimes achieves\noperator norm errors less than twice of the errors of sample covariance.\n","authors":["Junren Chen","Michael K. Ng"],"pdf_url":"https://arxiv.org/pdf/2308.16059v1.pdf","comment":"24 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.16056v1","updated":"2023-08-30T14:28:26Z","published":"2023-08-30T14:28:26Z","title":"Low-Rank Multitask Learning based on Tensorized SVMs and LSSVMs","summary":" Multitask learning (MTL) leverages task-relatedness to enhance performance.\nWith the emergence of multimodal data, tasks can now be referenced by multiple\nindices. In this paper, we employ high-order tensors, with each mode\ncorresponding to a task index, to naturally represent tasks referenced by\nmultiple indices and preserve their structural relations. Based on this\nrepresentation, we propose a general framework of low-rank MTL methods with\ntensorized support vector machines (SVMs) and least square support vector\nmachines (LSSVMs), where the CP factorization is deployed over the coefficient\ntensor. Our approach allows to model the task relation through a linear\ncombination of shared factors weighted by task-specific factors and is\ngeneralized to both classification and regression problems. Through the\nalternating optimization scheme and the Lagrangian function, each subproblem is\ntransformed into a convex problem, formulated as a quadratic programming or\nlinear system in the dual form. In contrast to previous MTL frameworks, our\ndecision function in the dual induces a weighted kernel function with a\ntask-coupling term characterized by the similarities of the task-specific\nfactors, better revealing the explicit relations across tasks in MTL.\nExperimental results validate the effectiveness and superiority of our proposed\nmethods compared to existing state-of-the-art approaches in MTL. The code of\nimplementation will be available at https://github.com/liujiani0216/TSVM-MTL.\n","authors":["Jiani Liu","Qinghua Tao","Ce Zhu","Yipeng Liu","Xiaolin Huang","Johan A. K. Suykens"],"pdf_url":"https://arxiv.org/pdf/2308.16056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00736v3","updated":"2023-08-30T14:11:10Z","published":"2022-12-01T18:29:48Z","title":"An exponentially-growing family of universal quantum circuits","summary":" Quantum machine learning has become an area of growing interest but has\ncertain theoretical and hardware-specific limitations. Notably, the problem of\nvanishing gradients, or barren plateaus, renders the training impossible for\ncircuits with high qubit counts, imposing a limit on the number of qubits that\ndata scientists can use for solving problems. Independently, angle-embedded\nsupervised quantum neural networks were shown to produce truncated Fourier\nseries with a degree directly dependent on two factors: the depth of the\nencoding and the number of parallel qubits the encoding applied to. The degree\nof the Fourier series limits the model expressivity. This work introduces two\nnew architectures whose Fourier degrees grow exponentially: the sequential and\nparallel exponential quantum machine learning architectures. This is done by\nefficiently using the available Hilbert space when encoding, increasing the\nexpressivity of the quantum encoding. Therefore, the exponential growth allows\nstaying at the low-qubit limit to create highly expressive circuits avoiding\nbarren plateaus. Practically, parallel exponential architecture was shown to\noutperform the existing linear architectures by reducing their final mean\nsquare error value by up to 44.7% in a one-dimensional test problem.\nFurthermore, the feasibility of this technique was also shown on a trapped ion\nquantum processing unit.\n","authors":["Mo Kordzanganeh","Pavel Sekatski","Leonid Fedichkin","Alexey Melnikov"],"pdf_url":"https://arxiv.org/pdf/2212.00736v3.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2303.12247v2","updated":"2023-08-30T14:09:13Z","published":"2023-03-22T01:01:14Z","title":"Exploring the Benefits of Visual Prompting in Differential Privacy","summary":" Visual Prompting (VP) is an emerging and powerful technique that allows\nsample-efficient adaptation to downstream tasks by engineering a well-trained\nfrozen source model. In this work, we explore the benefits of VP in\nconstructing compelling neural network classifiers with differential privacy\n(DP). We explore and integrate VP into canonical DP training methods and\ndemonstrate its simplicity and efficiency. In particular, we discover that VP\nin tandem with PATE, a state-of-the-art DP training method that leverages the\nknowledge transfer from an ensemble of teachers, achieves the state-of-the-art\nprivacy-utility trade-off with minimum expenditure of privacy budget. Moreover,\nwe conduct additional experiments on cross-domain image classification with a\nsufficient domain gap to further unveil the advantage of VP in DP. Lastly, we\nalso conduct extensive ablation studies to validate the effectiveness and\ncontribution of VP under DP consideration. Our code is available at\n(https://github.com/EzzzLi/Prompt-PATE).\n","authors":["Yizhe Li","Yu-Lin Tsai","Xuebin Ren","Chia-Mu Yu","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12247v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2306.06208v3","updated":"2023-08-30T14:07:49Z","published":"2023-06-05T23:07:01Z","title":"DeltaNN: Assessing the Impact of Computational Environment Parameters on\n the Performance of Image Recognition Models","summary":" Image recognition tasks typically use deep learning and require enormous\nprocessing power, thus relying on hardware accelerators like GPUs and TPUs for\nfast, timely processing. Failure in real-time image recognition tasks can occur\ndue to sub-optimal mapping on hardware accelerators during model deployment,\nwhich may lead to timing uncertainty and erroneous behavior. Mapping on\nhardware accelerators is done using multiple software components like deep\nlearning frameworks, compilers, and device libraries, that we refer to as the\ncomputational environment. Owing to the increased use of image recognition\ntasks in safety-critical applications like autonomous driving and medical\nimaging, it is imperative to assess their robustness to changes in the\ncomputational environment, as the impact of parameters like deep learning\nframeworks, compiler optimizations, and hardware devices on model performance\nand correctness is not yet well understood.\n In this paper we present a differential testing framework, DeltaNN, that\nallows us to assess the impact of different computational environment\nparameters on the performance of image recognition models during deployment,\npost training. DeltaNN generates different implementations of a given image\nrecognition model for variations in environment parameters, namely, deep\nlearning frameworks, compiler optimizations and hardware devices and analyzes\ndifferences in model performance as a result. Using DeltaNN, we conduct an\nempirical study of robustness analysis of three popular image recognition\nmodels using the ImageNet dataset. We report the impact in terms of\nmisclassifications and inference time differences across different settings. In\ntotal, we observed up to 72% output label differences across deep learning\nframeworks, and up to 81% unexpected performance degradation in terms of\ninference time, when applying compiler optimizations.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06208v3.pdf","comment":"11 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2305.08854v2","updated":"2023-08-30T14:01:36Z","published":"2023-05-15T17:59:57Z","title":"Laughing Matters: Introducing Laughing-Face Generation using Diffusion\n Models","summary":" Speech-driven animation has gained significant traction in recent years, with\ncurrent methods achieving near-photorealistic results. However, the field\nremains underexplored regarding non-verbal communication despite evidence\ndemonstrating its importance in human interaction. In particular, generating\nlaughter sequences presents a unique challenge due to the intricacy and nuances\nof this behaviour. This paper aims to bridge this gap by proposing a novel\nmodel capable of generating realistic laughter sequences, given a still\nportrait and an audio clip containing laughter. We highlight the failure cases\nof traditional facial animation methods and leverage recent advances in\ndiffusion models to produce convincing laughter videos. We train our model on a\ndiverse set of laughter datasets and introduce an evaluation metric\nspecifically designed for laughter. When compared with previous speech-driven\napproaches, our model achieves state-of-the-art performance across all metrics,\neven when these are re-trained for laughter generation. Our code and project\nare publicly available\n","authors":["Antoni Bigata Casademunt","Rodrigo Mira","Nikita Drobyshev","Konstantinos Vougioukas","Stavros Petridis","Maja Pantic"],"pdf_url":"https://arxiv.org/pdf/2305.08854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06157v3","updated":"2023-08-30T13:41:23Z","published":"2023-06-10T23:50:02Z","title":"Fault Localization for Buggy Deep Learning Framework Conversions in\n Image Recognition","summary":" When deploying Deep Neural Networks (DNNs), developers often convert models\nfrom one deep learning framework to another (e.g., TensorFlow to PyTorch).\nHowever, this process is error-prone and can impact target model accuracy. To\nidentify the extent of such impact, we perform and briefly present a\ndifferential analysis against three DNNs widely used for image recognition\n(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep\nlearning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which\nrevealed numerous model crashes and output label discrepancies of up to 72%. To\nmitigate such errors, we present a novel approach towards fault localization\nand repair of buggy deep learning framework conversions, focusing on\npre-trained image recognition models. Our technique consists of four stages of\nanalysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters,\nand 4) graph representation. In addition, we propose various strategies towards\nfault repair of the faults detected. We implement our technique on top of the\nApache TVM deep learning compiler, and we test it by conducting a preliminary\nfault localization analysis for the conversion of InceptionV3 from TF to\nTFLite. Our approach detected a fault in a common DNN converter tool, which\nintroduced precision errors in weights, reducing model accuracy. After our\nfault localization, we repaired the issue, reducing our conversion error to\nzero.\n","authors":["Nikolaos Louloudakis","Perry Gibson","José Cano","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2306.06157v3.pdf","comment":"5 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2307.15016v2","updated":"2023-08-30T13:33:59Z","published":"2023-07-27T17:19:32Z","title":"How Good is Google Bard's Visual Understanding? An Empirical Study on\n Open Challenges","summary":" Google's Bard has emerged as a formidable competitor to OpenAI's ChatGPT in\nthe field of conversational AI. Notably, Bard has recently been updated to\nhandle visual inputs alongside text prompts during conversations. Given Bard's\nimpressive track record in handling textual inputs, we explore its capabilities\nin understanding and interpreting visual data (images) conditioned by text\nquestions. This exploration holds the potential to unveil new insights and\nchallenges for Bard and other forthcoming multi-modal Generative models,\nespecially in addressing complex computer vision problems that demand accurate\nvisual and language understanding. Specifically, in this study, we focus on 15\ndiverse task scenarios encompassing regular, camouflaged, medical, under-water\nand remote sensing data to comprehensively evaluate Bard's performance. Our\nprimary finding indicates that Bard still struggles in these vision scenarios,\nhighlighting the significant gap in vision-based understanding that needs to be\nbridged in future developments. We expect that this empirical study will prove\nvaluable in advancing future models, leading to enhanced capabilities in\ncomprehending and interpreting fine-grained visual data. Our project is\nreleased on https://github.com/htqin/GoogleBard-VisUnderstand\n","authors":["Haotong Qin","Ge-Peng Ji","Salman Khan","Deng-Ping Fan","Fahad Shahbaz Khan","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2307.15016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16022v1","updated":"2023-08-30T13:22:20Z","published":"2023-08-30T13:22:20Z","title":"PAVI: Plate-Amortized Variational Inference","summary":" Given observed data and a probabilistic generative model, Bayesian inference\nsearches for the distribution of the model's parameters that could have yielded\nthe data. Inference is challenging for large population studies where millions\nof measurements are performed over a cohort of hundreds of subjects, resulting\nin a massive parameter space. This large cardinality renders off-the-shelf\nVariational Inference (VI) computationally impractical.\n In this work, we design structured VI families that efficiently tackle large\npopulation studies. Our main idea is to share the parameterization and learning\nacross the different i.i.d. variables in a generative model, symbolized by the\nmodel's \\textit{plates}. We name this concept \\textit{plate amortization}.\nContrary to off-the-shelf stochastic VI, which slows down inference, plate\namortization results in orders of magnitude faster to train variational\ndistributions.\n Applied to large-scale hierarchical problems, PAVI yields expressive,\nparsimoniously parameterized VI with an affordable training time. This faster\nconvergence effectively unlocks inference in those large regimes. We illustrate\nthe practical utility of PAVI through a challenging Neuroimaging example\nfeaturing 400 million latent parameters, demonstrating a significant step\ntowards scalable and expressive Variational Inference.\n","authors":["Louis Rouillard","Alexandre Le Bris","Thomas Moreau","Demian Wassermann"],"pdf_url":"https://arxiv.org/pdf/2308.16022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00501v2","updated":"2023-08-30T13:02:41Z","published":"2023-07-02T07:20:47Z","title":"Classifying World War II Era Ciphers with Machine Learning","summary":" We determine the accuracy with which machine learning and deep learning\ntechniques can classify selected World War II era ciphers when only ciphertext\nis available. The specific ciphers considered are Enigma, M-209, Sigaba,\nPurple, and Typex. We experiment with three classic machine learning models,\nnamely, Support Vector Machines (SVM), $k$-Nearest Neighbors ($k$-NN), and\nRandom Forest (RF). We also experiment with four deep learning neural\nnetwork-based models: Multi-Layer Perceptrons (MLP), Long Short-Term Memory\n(LSTM), Extreme Learning Machines (ELM), and Convolutional Neural Networks\n(CNN). Each model is trained on features consisting of histograms, digrams, and\nraw ciphertext letter sequences. Furthermore, the classification problem is\nconsidered under four distinct scenarios: Fixed plaintext with fixed keys,\nrandom plaintext with fixed keys, fixed plaintext with random keys, and random\nplaintext with random keys. Under the most realistic scenario, given 1000\ncharacters per ciphertext, we are able to distinguish the ciphers with greater\nthan 97% accuracy. In addition, we consider the accuracy of a subset of the\nlearning techniques as a function of the length of the ciphertext messages.\nSomewhat surprisingly, our classic machine learning models perform at least as\nwell as our deep learning models. We also find that ciphers that are more\nsimilar in design are somewhat more challenging to distinguish, but not as\ndifficult as might be expected.\n","authors":["Brooke Dalton","Mark Stamp"],"pdf_url":"https://arxiv.org/pdf/2307.00501v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16008v1","updated":"2023-08-30T12:55:02Z","published":"2023-08-30T12:55:02Z","title":"EnsembleFollower: A Hybrid Car-Following Framework Based On\n Reinforcement Learning and Hierarchical Planning","summary":" Car-following models have made significant contributions to our understanding\nof longitudinal driving behavior. However, they often exhibit limited accuracy\nand flexibility, as they cannot fully capture the complexity inherent in\ncar-following processes, or may falter in unseen scenarios due to their\nreliance on confined driving skills present in training data. It is worth\nnoting that each car-following model possesses its own strengths and weaknesses\ndepending on specific driving scenarios. Therefore, we propose\nEnsembleFollower, a hierarchical planning framework for achieving advanced\nhuman-like car-following. The EnsembleFollower framework involves a high-level\nReinforcement Learning-based agent responsible for judiciously managing\nmultiple low-level car-following models according to the current state, either\nby selecting an appropriate low-level model to perform an action or by\nallocating different weights across all low-level components. Moreover, we\npropose a jerk-constrained kinematic model for more convincing car-following\nsimulations. We evaluate the proposed method based on real-world driving data\nfrom the HighD dataset. The experimental results illustrate that\nEnsembleFollower yields improved accuracy of human-like behavior and achieves\neffectiveness in combining hybrid models, demonstrating that our proposed\nframework can handle diverse car-following conditions by leveraging the\nstrengths of various low-level models.\n","authors":["Xu Han","Xianda Chen","Meixin Zhu","Pinlong Cai","Jianshan Zhou","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2308.16008v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.15987v1","updated":"2023-08-30T12:18:18Z","published":"2023-08-30T12:18:18Z","title":"FPTQ: Fine-grained Post-Training Quantization for Large Language Models","summary":" In the era of large-scale language models, the substantial parameter size\nposes significant challenges for deployment. Being a prevalent compression\ntechnique, quantization has emerged as the mainstream practice to tackle this\nissue, which is mainly centered on two recipes W8A8 and W4A16 (i.e. weights and\nactivations in such bit widths). In this study, we propose a novel W4A8\npost-training quantization method for the available open-sourced LLMs, which\ncombines the advantages of both two recipes. Therefore, we can leverage the\nbenefit in the I/O utilization of 4-bit weight quantization and the\nacceleration due to 8-bit matrix computation. Nevertheless, the W4A8 faces\nnotorious performance degradation. As a remedy, we involve layerwise activation\nquantization strategies which feature a novel logarithmic equalization for most\nintractable layers, and we combine them with fine-grained weight quantization.\nWithout whistles and bells, we eliminate the necessity for further fine-tuning\nand obtain the state-of-the-art W4A8 quantized performance on BLOOM, LLaMA, and\nLLaMA-2 on standard benchmarks. We confirm that the W4A8 quantization is\nachievable for the deployment of large language models, fostering their\nwide-spreading real-world applications.\n","authors":["Qingyuan Li","Yifan Zhang","Liang Li","Peng Yao","Bo Zhang","Xiangxiang Chu","Yerui Sun","Li Du","Yuchen Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15984v1","updated":"2023-08-30T12:13:13Z","published":"2023-08-30T12:13:13Z","title":"Learning Structure-from-Motion with Graph Attention Networks","summary":" In this paper we tackle the problem of learning Structure-from-Motion (SfM)\nthrough the use of graph attention networks. SfM is a classic computer vision\nproblem that is solved though iterative minimization of reprojection errors,\nreferred to as Bundle Adjustment (BA), starting from a good initialization. In\norder to obtain a good enough initialization to BA, conventional methods rely\non a sequence of sub-problems (such as pairwise pose estimation, pose averaging\nor triangulation) which provides an initial solution that can then be refined\nusing BA. In this work we replace these sub-problems by learning a model that\ntakes as input the 2D keypoints detected across multiple views, and outputs the\ncorresponding camera poses and 3D keypoint coordinates. Our model takes\nadvantage of graph neural networks to learn SfM-specific primitives, and we\nshow that it can be used for fast inference of the reconstruction for new and\nunseen sequences. The experimental results show that the proposed model\noutperforms competing learning-based methods, and challenges COLMAP while\nhaving lower runtime.\n","authors":["Lucas Brynte","José Pedro Iglesias","Carl Olsson","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2308.15984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06154v2","updated":"2023-08-30T12:07:02Z","published":"2023-06-09T14:49:20Z","title":"HypLL: The Hyperbolic Learning Library","summary":" Deep learning in hyperbolic space is quickly gaining traction in the fields\nof machine learning, multimedia, and computer vision. Deep networks commonly\noperate in Euclidean space, implicitly assuming that data lies on regular\ngrids. Recent advances have shown that hyperbolic geometry provides a viable\nalternative foundation for deep learning, especially when data is hierarchical\nin nature and when working with few embedding dimensions. Currently however, no\naccessible open-source library exists to build hyperbolic network modules akin\nto well-known deep learning libraries. We present HypLL, the Hyperbolic\nLearning Library to bring the progress on hyperbolic deep learning together.\nHypLL is built on top of PyTorch, with an emphasis in its design for\nease-of-use, in order to attract a broad audience towards this new and\nopen-ended research direction. The code is available at:\nhttps://github.com/maxvanspengler/hyperbolic_learning_library.\n","authors":["Max van Spengler","Philipp Wirth","Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2306.06154v2.pdf","comment":"ACM Multimedia Open-Source Software Competition 2023"},{"id":"http://arxiv.org/abs/2308.14521v2","updated":"2023-08-30T11:56:45Z","published":"2023-08-28T12:13:36Z","title":"Context-Aware Composition of Agent Policies by Markov Decision Process\n Entity Embeddings and Agent Ensembles","summary":" Computational agents support humans in many areas of life and are therefore\nfound in heterogeneous contexts. This means they operate in rapidly changing\nenvironments and can be confronted with huge state and action spaces. In order\nto perform services and carry out activities in a goal-oriented manner, agents\nrequire prior knowledge and therefore have to develop and pursue\ncontext-dependent policies. However, prescribing policies in advance is limited\nand inflexible, especially in dynamically changing environments. Moreover, the\ncontext of an agent determines its choice of actions. Since the environments\ncan be stochastic and complex in terms of the number of states and feasible\nactions, activities are usually modelled in a simplified way by Markov decision\nprocesses so that, e.g., agents with reinforcement learning are able to learn\npolicies, that help to capture the context and act accordingly to optimally\nperform activities. However, training policies for all possible contexts using\nreinforcement learning is time-consuming. A requirement and challenge for\nagents is to learn strategies quickly and respond immediately in cross-context\nenvironments and applications, e.g., the Internet, service robotics,\ncyber-physical systems. In this work, we propose a novel simulation-based\napproach that enables a) the representation of heterogeneous contexts through\nknowledge graphs and entity embeddings and b) the context-aware composition of\npolicies on demand by ensembles of agents running in parallel. The evaluation\nwe conducted with the \"Virtual Home\" dataset indicates that agents with a need\nto switch seamlessly between different contexts, can request on-demand composed\npolicies that lead to the successful completion of context-appropriate\nactivities without having to learn these policies in lengthy training steps and\nepisodes, in contrast to agents that use reinforcement learning.\n","authors":["Nicole Merkle","Ralf Mikut"],"pdf_url":"https://arxiv.org/pdf/2308.14521v2.pdf","comment":"30 pages, 11 figures, 9 tables, 3 listings, Re-submitted to Semantic\n Web Journal, Currently, under review"},{"id":"http://arxiv.org/abs/2308.15973v1","updated":"2023-08-30T11:51:38Z","published":"2023-08-30T11:51:38Z","title":"Demo: A Digital Twin of the 5G Radio Access Network for Anomaly\n Detection Functionality","summary":" Recently, the concept of digital twins (DTs) has received significant\nattention within the realm of 5G/6G. This demonstration shows an innovative DT\ndesign and implementation framework tailored toward integration within the 5G\ninfrastructure. The proposed DT enables near real-time anomaly detection\ncapability pertaining to user connectivity. It empowers the 5G system to\nproactively execute decisions for resource control and connection restoration.\n","authors":["Peizheng Li","Adnan Aijaz","Tim Farnham","Sajida Gufran","Sita Chintalapati"],"pdf_url":"https://arxiv.org/pdf/2308.15973v1.pdf","comment":"2 pages, 2 figures. This paper has been accepted by the 31st IEEE\n International Conference on Network Protocols (ICNP 2023)"},{"id":"http://arxiv.org/abs/2308.02562v2","updated":"2023-08-30T11:47:05Z","published":"2023-08-03T04:03:46Z","title":"Food Classification using Joint Representation of Visual and Textual\n Data","summary":" Food classification is an important task in health care. In this work, we\npropose a multimodal classification framework that uses the modified version of\nEfficientNet with the Mish activation function for image classification, and\nthe traditional BERT transformer-based network is used for text classification.\nThe proposed network and the other state-of-the-art methods are evaluated on a\nlarge open-source dataset, UPMC Food-101. The experimental results show that\nthe proposed network outperforms the other methods, a significant difference of\n11.57% and 6.34% in accuracy is observed for image and text classification,\nrespectively, when compared with the second-best performing method. We also\ncompared the performance in terms of accuracy, precision, and recall for text\nclassification using both machine learning and deep learning-based models. The\ncomparative analysis from the prediction results of both images and text\ndemonstrated the efficiency and robustness of the proposed approach.\n","authors":["Prateek Mittal","Puneet Goyal","Joohi Chauhan"],"pdf_url":"https://arxiv.org/pdf/2308.02562v2.pdf","comment":"Updated results and discussions to be posted and some sections needed\n to be expanded"},{"id":"http://arxiv.org/abs/2204.07000v2","updated":"2023-08-30T11:05:50Z","published":"2022-04-14T14:49:34Z","title":"Solving AC Power Flow with Graph Neural Networks under Realistic\n Constraints","summary":" In this paper, we propose a graph neural network architecture to solve the AC\npower flow problem under realistic constraints. To ensure a safe and resilient\noperation of distribution grids, AC power flow calculations are the means of\nchoice to determine grid operating limits or analyze grid asset utilization in\nplanning procedures. In our approach, we demonstrate the development of a\nframework that uses graph neural networks to learn the physical constraints of\nthe power flow. We present our model architecture on which we perform\nunsupervised training to learn a general solution of the AC power flow\nformulation independent of the specific topologies and supply tasks used for\ntraining. Finally, we demonstrate, validate and discuss our results on medium\nvoltage benchmark grids. In our approach, we focus on the physical and\ntopological properties of distribution grids to provide scalable solutions for\nreal grid topologies. Therefore, we take a data-driven approach, using large\nand diverse data sets consisting of realistic grid topologies, for the\nunsupervised training of the AC power flow graph neural network architecture\nand compare the results to a prior neural architecture and the Newton-Raphson\nmethod. Our approach shows a high increase in computation time and good\naccuracy compared to state-of-the-art solvers. It also out-performs that neural\nsolver for power flow in terms of accuracy.\n","authors":["Luis Böttcher","Hinrikus Wolf","Bastian Jung","Philipp Lutat","Marc Trageser","Oliver Pohl","Andreas Ulbig","Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2204.07000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15428v2","updated":"2023-08-30T10:38:41Z","published":"2023-07-28T09:26:00Z","title":"Implicit neural representation for change detection","summary":" Identifying changes in a pair of 3D aerial LiDAR point clouds, obtained\nduring two distinct time periods over the same geographic region presents a\nsignificant challenge due to the disparities in spatial coverage and the\npresence of noise in the acquisition system. The most commonly used approaches\nto detecting changes in point clouds are based on supervised methods which\nnecessitate extensive labelled data often unavailable in real-world\napplications. To address these issues, we propose an unsupervised approach that\ncomprises two components: Implicit Neural Representation (INR) for continuous\nshape reconstruction and a Gaussian Mixture Model for categorising changes. INR\noffers a grid-agnostic representation for encoding bi-temporal point clouds,\nwith unmatched spatial support that can be regularised to enhance\nhigh-frequency details and reduce noise. The reconstructions at each timestamp\nare compared at arbitrary spatial scales, leading to a significant increase in\ndetection capabilities. We apply our method to a benchmark dataset comprising\nsimulated LiDAR point clouds for urban sprawling. This dataset encompasses\ndiverse challenging scenarios, varying in resolutions, input modalities and\nnoise levels. This enables a comprehensive multi-scenario evaluation, comparing\nour method with the current state-of-the-art approach. We outperform the\nprevious methods by a margin of 10% in the intersection over union metric. In\naddition, we put our techniques to practical use by applying them in a\nreal-world scenario to identify instances of illicit excavation of\narchaeological sites and validate our results by comparing them with findings\nfrom field experts.\n","authors":["Peter Naylor","Diego Di Carlo","Arianna Traviglia","Makoto Yamada","Marco Fiorucci"],"pdf_url":"https://arxiv.org/pdf/2307.15428v2.pdf","comment":"Main article is 10 pages + 6 pages of supplementary. Conference style\n paper"},{"id":"http://arxiv.org/abs/2308.15936v1","updated":"2023-08-30T10:33:02Z","published":"2023-08-30T10:33:02Z","title":"Jaccard-constrained dense subgraph discovery","summary":" Finding dense subgraphs is a core problem in graph mining with many\napplications in diverse domains. At the same time many real-world networks vary\nover time, that is, the dataset can be represented as a sequence of graph\nsnapshots. Hence, it is natural to consider the question of finding dense\nsubgraphs in a temporal network that are allowed to vary over time to a certain\ndegree. In this paper, we search for dense subgraphs that have large pairwise\nJaccard similarity coefficients. More formally, given a set of graph snapshots\nand a weight $\\lambda$, we find a collection of dense subgraphs such that the\nsum of densities of the induced subgraphs plus the sum of Jaccard indices,\nweighted by $\\lambda$, is maximized. We prove that this problem is NP-hard. To\ndiscover dense subgraphs with good objective value, we present an iterative\nalgorithm which runs in $\\mathcal{O}(n^2k^2 + m \\log n + k^3 n)$ time per\nsingle iteration, and a greedy algorithm which runs in $\\mathcal{O}(n^2k^2 + m\n\\log n + k^3 n)$ time, where $k$ is the length of the graph sequence and $n$\nand $m$ denote number of nodes and total number of edges respectively. We show\nexperimentally that our algorithms are efficient, they can find ground truth in\nsynthetic datasets and provide interpretable results from real-world datasets.\nFinally, we present a case study that shows the usefulness of our problem.\n","authors":["Chamalee Wickrama Arachchi","Nikolaj Tatti"],"pdf_url":"https://arxiv.org/pdf/2308.15936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13455v3","updated":"2023-08-30T10:27:00Z","published":"2022-10-21T09:59:15Z","title":"E-MCTS: Deep Exploration in Model-Based Reinforcement Learning by\n Planning with Epistemic Uncertainty","summary":" One of the most well-studied and highly performing planning approaches used\nin Model-Based Reinforcement Learning (MBRL) is Monte-Carlo Tree Search (MCTS).\nKey challenges of MCTS-based MBRL methods remain dedicated deep exploration and\nreliability in the face of the unknown, and both challenges can be alleviated\nthrough principled epistemic uncertainty estimation in the predictions of MCTS.\nWe present two main contributions: First, we develop methodology to propagate\nepistemic uncertainty in MCTS, enabling agents to estimate the epistemic\nuncertainty in their predictions. Second, we utilize the propagated uncertainty\nfor a novel deep exploration algorithm by explicitly planning to explore. We\nincorporate our approach into variations of MCTS-based MBRL approaches with\nlearned and provided dynamics models, and empirically show deep exploration\nthrough successful epistemic uncertainty estimation achieved by our approach.\nWe compare to a non-planning-based deep-exploration baseline, and demonstrate\nthat planning with epistemic MCTS significantly outperforms non-planning based\nexploration in the investigated deep exploration benchmark.\n","authors":["Yaniv Oren","Matthijs T. J. Spaan","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2210.13455v3.pdf","comment":"Submitted to NeurIPS 2023, accepted to EWRL 2023"},{"id":"http://arxiv.org/abs/2307.09829v2","updated":"2023-08-30T10:19:02Z","published":"2023-07-19T08:34:25Z","title":"What do neural networks learn in image classification? A frequency\n shortcut perspective","summary":" Frequency analysis is useful for understanding the mechanisms of\nrepresentation learning in neural networks (NNs). Most research in this area\nfocuses on the learning dynamics of NNs for regression tasks, while little for\nclassification. This study empirically investigates the latter and expands the\nunderstanding of frequency shortcuts. First, we perform experiments on\nsynthetic datasets, designed to have a bias in different frequency bands. Our\nresults demonstrate that NNs tend to find simple solutions for classification,\nand what they learn first during training depends on the most distinctive\nfrequency characteristics, which can be either low- or high-frequencies.\nSecond, we confirm this phenomenon on natural images. We propose a metric to\nmeasure class-wise frequency characteristics and a method to identify frequency\nshortcuts. The results show that frequency shortcuts can be texture-based or\nshape-based, depending on what best simplifies the objective. Third, we\nvalidate the transferability of frequency shortcuts on out-of-distribution\n(OOD) test sets. Our results suggest that frequency shortcuts can be\ntransferred across datasets and cannot be fully avoided by larger model\ncapacity and data augmentation. We recommend that future research should focus\non effective training schemes mitigating frequency shortcut learning.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2307.09829v2.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2210.17287v3","updated":"2023-08-30T10:18:25Z","published":"2022-10-27T10:46:32Z","title":"Diffiner: A Versatile Diffusion-based Generative Refiner for Speech\n Enhancement","summary":" Although deep neural network (DNN)-based speech enhancement (SE) methods\noutperform the previous non-DNN-based ones, they often degrade the perceptual\nquality of generated outputs. To tackle this problem, we introduce a DNN-based\ngenerative refiner, Diffiner, aiming to improve perceptual speech quality\npre-processed by an SE method. We train a diffusion-based generative model by\nutilizing a dataset consisting of clean speech only. Then, our refiner\neffectively mixes clean parts newly generated via denoising diffusion\nrestoration into the degraded and distorted parts caused by a preceding SE\nmethod, resulting in refined speech. Once our refiner is trained on a set of\nclean speech, it can be applied to various SE methods without additional\ntraining specialized for each SE module. Therefore, our refiner can be a\nversatile post-processing module w.r.t. SE methods and has high potential in\nterms of modularity. Experimental results show that our method improved\nperceptual speech quality regardless of the preceding SE methods used.\n","authors":["Ryosuke Sawata","Naoki Murata","Yuhta Takida","Toshimitsu Uesaka","Takashi Shibuya","Shusuke Takahashi","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2210.17287v3.pdf","comment":"Accepted by Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.15930v1","updated":"2023-08-30T10:12:39Z","published":"2023-08-30T10:12:39Z","title":"LLaSM: Large Language and Speech Model","summary":" Multi-modal large language models have garnered significant interest\nrecently. Though, most of the works focus on vision-language multi-modal models\nproviding strong capabilities in following vision-and-language instructions.\nHowever, we claim that speech is also an important modality through which\nhumans interact with the world. Hence, it is crucial for a general-purpose\nassistant to be able to follow multi-modal speech-and-language instructions. In\nthis work, we propose Large Language and Speech Model (LLaSM). LLaSM is an\nend-to-end trained large multi-modal speech-language model with cross-modal\nconversational abilities, capable of following speech-and-language\ninstructions. Our early experiments show that LLaSM demonstrates a more\nconvenient and natural way for humans to interact with artificial intelligence.\nSpecifically, we also release a large Speech Instruction Following dataset\nLLaSM-Audio-Instructions. Code and demo are available at\nhttps://github.com/LinkSoul-AI/LLaSM and\nhttps://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions\ndataset is available at\nhttps://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.\n","authors":["Yu Shu","Siwei Dong","Guangyao Chen","Wenhao Huang","Ruihua Zhang","Daochen Shi","Qiqi Xiang","Yemin Shi"],"pdf_url":"https://arxiv.org/pdf/2308.15930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.11141v2","updated":"2023-08-30T09:57:57Z","published":"2022-07-22T15:25:59Z","title":"Deep neural networks on diffeomorphism groups for optimal shape\n reparameterization","summary":" One of the fundamental problems in shape analysis is to align curves or\nsurfaces before computing geodesic distances between their shapes. Finding the\noptimal reparametrization realizing this alignment is a computationally\ndemanding task, typically done by solving an optimization problem on the\ndiffeomorphism group. In this paper, we propose an algorithm for constructing\napproximations of orientation-preserving diffeomorphisms by composition of\nelementary diffeomorphisms. The algorithm is implemented using PyTorch, and is\napplicable for both unparametrized curves and surfaces. Moreover, we show\nuniversal approximation properties for the constructed architectures, and\nobtain bounds for the Lipschitz constants of the resulting diffeomorphisms.\n","authors":["Elena Celledoni","Helge Glöckner","Jørgen Riseth","Alexander Schmeding"],"pdf_url":"https://arxiv.org/pdf/2207.11141v2.pdf","comment":"36 pages, 11 figures. Accepted by BIT Numerical Mathematics, not yet\n published"},{"id":"http://arxiv.org/abs/2308.15911v1","updated":"2023-08-30T09:38:44Z","published":"2023-08-30T09:38:44Z","title":"Cyclophobic Reinforcement Learning","summary":" In environments with sparse rewards, finding a good inductive bias for\nexploration is crucial to the agent's success. However, there are two competing\ngoals: novelty search and systematic exploration. While existing approaches\nsuch as curiosity-driven exploration find novelty, they sometimes do not\nsystematically explore the whole state space, akin to depth-first-search vs\nbreadth-first-search. In this paper, we propose a new intrinsic reward that is\ncyclophobic, i.e., it does not reward novelty, but punishes redundancy by\navoiding cycles. Augmenting the cyclophobic intrinsic reward with a sequence of\nhierarchical representations based on the agent's cropped observations we are\nable to achieve excellent results in the MiniGrid and MiniHack environments.\nBoth are particularly hard, as they require complex interactions with different\nobjects in order to be solved. Detailed comparisons with previous approaches\nand thorough ablation studies show that our newly proposed cyclophobic\nreinforcement learning is more sample efficient than other state of the art\nmethods in a variety of tasks.\n","authors":["Stefan Sylvius Wagner","Peter Arndt","Jan Robine","Stefan Harmeling"],"pdf_url":"https://arxiv.org/pdf/2308.15911v1.pdf","comment":"Published in Transactions on Machine Learning Research (08/2023)"},{"id":"http://arxiv.org/abs/2210.10264v3","updated":"2023-08-30T09:19:01Z","published":"2022-10-19T02:59:31Z","title":"SignReLU neural network and its approximation ability","summary":" Deep neural networks (DNNs) have garnered significant attention in various\nfields of science and technology in recent years. Activation functions define\nhow neurons in DNNs process incoming signals for them. They are essential for\nlearning non-linear transformations and for performing diverse computations\namong successive neuron layers. In the last few years, researchers have\ninvestigated the approximation ability of DNNs to explain their power and\nsuccess. In this paper, we explore the approximation ability of DNNs using a\ndifferent activation function, called SignReLU. Our theoretical results\ndemonstrate that SignReLU networks outperform rational and ReLU networks in\nterms of approximation performance. Numerical experiments are conducted\ncomparing SignReLU with the existing activations such as ReLU, Leaky ReLU, and\nELU, which illustrate the competitive practical performance of SignReLU.\n","authors":["Jianfei Li","Han Feng","Ding-Xuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2210.10264v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15905v1","updated":"2023-08-30T09:15:41Z","published":"2023-08-30T09:15:41Z","title":"Thermodynamic Computing via Autonomous Quantum Thermal Machines","summary":" We develop a physics-based model for classical computation based on\nautonomous quantum thermal machines. These machines consist of few interacting\nquantum bits (qubits) connected to several environments at different\ntemperatures. Heat flows through the machine are here exploited for computing.\nThe process starts by setting the temperatures of the environments according to\nthe logical input. The machine evolves, eventually reaching a non-equilibrium\nsteady state, from which the output of the computation can be determined via\nthe temperature of an auxilliary finite-size reservoir. Such a machine, which\nwe term a \"thermodynamic neuron\", can implement any linearly-separable\nfunction, and we discuss explicitly the cases of NOT, 3-majority and NOR gates.\nIn turn, we show that a network of thermodynamic neurons can perform any\ndesired function. We discuss the close connection between our model and\nartificial neurons (perceptrons), and argue that our model provides an\nalternative physics-based analogue implementation of neural networks, and more\ngenerally a platform for thermodynamic computing.\n","authors":["Patryk Lipka-Bartosik","Martí Perarnau-Llobet","Nicolas Brunner"],"pdf_url":"https://arxiv.org/pdf/2308.15905v1.pdf","comment":"12 + 4 pages. Comments welcome!"},{"id":"http://arxiv.org/abs/2308.15899v1","updated":"2023-08-30T09:09:42Z","published":"2023-08-30T09:09:42Z","title":"Beyond Traditional Neural Networks: Toward adding Reasoning and Learning\n Capabilities through Computational Logic Techniques","summary":" Deep Learning (DL) models have become popular for solving complex problems,\nbut they have limitations such as the need for high-quality training data, lack\nof transparency, and robustness issues. Neuro-Symbolic AI has emerged as a\npromising approach combining the strengths of neural networks and symbolic\nreasoning. Symbolic knowledge injection (SKI) techniques are a popular method\nto incorporate symbolic knowledge into sub-symbolic systems. This work proposes\nsolutions to improve the knowledge injection process and integrate elements of\nML and logic into multi-agent systems (MAS).\n","authors":["Andrea Rafanelli"],"pdf_url":"https://arxiv.org/pdf/2308.15899v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15887v1","updated":"2023-08-30T09:04:24Z","published":"2023-08-30T09:04:24Z","title":"On the Potential of CLIP for Compositional Logical Reasoning","summary":" In this paper we explore the possibility of using OpenAI's CLIP to perform\nlogically coherent grounded visual reasoning. To that end, we formalize our\nterms and give a geometric analysis of how embeddings in CLIP's latent space\nwould need to be configured in order for the system to be logically coherent.\nOur main conclusion is that, as usually configured, CLIP cannot perform such\nreasoning.\n","authors":["Justin Brody"],"pdf_url":"https://arxiv.org/pdf/2308.15887v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15885v1","updated":"2023-08-30T09:04:06Z","published":"2023-08-30T09:04:06Z","title":"Towards One-Shot Learning for Text Classification using Inductive Logic\n Programming","summary":" With the ever-increasing potential of AI to perform personalised tasks, it is\nbecoming essential to develop new machine learning techniques which are\ndata-efficient and do not require hundreds or thousands of training data. In\nthis paper, we explore an Inductive Logic Programming approach for one-shot\ntext classification. In particular, we explore the framework of\nMeta-Interpretive Learning (MIL), along with using common-sense background\nknowledge extracted from ConceptNet. Results indicate that MIL can learn text\nclassification rules from a small number of training examples. Moreover, the\nhigher complexity of chosen examples, the higher accuracy of the outcome.\n","authors":["Ghazal Afroozi Milani","Daniel Cyrus","Alireza Tamaddoni-Nezhad"],"pdf_url":"https://arxiv.org/pdf/2308.15885v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15883v1","updated":"2023-08-30T09:03:45Z","published":"2023-08-30T09:03:45Z","title":"\"Would life be more interesting if I were in AI?\" Answering\n Counterfactuals based on Probabilistic Inductive Logic Programming","summary":" Probabilistic logic programs are logic programs where some facts hold with a\nspecified probability. Here, we investigate these programs with a causal\nframework that allows counterfactual queries. Learning the program structure\nfrom observational data is usually done through heuristic search relying on\nstatistical tests. However, these statistical tests lack information about the\ncausal mechanism generating the data, which makes it unfeasible to use the\nresulting programs for counterfactual reasoning. To address this, we propose a\nlanguage fragment that allows reconstructing a program from its induced\ndistribution. This further enables us to learn programs supporting\ncounterfactual queries.\n","authors":["Kilian Rückschloß","Felix Weitkämper"],"pdf_url":"https://arxiv.org/pdf/2308.15883v1.pdf","comment":"In Proceedings ICLP 2023, arXiv:2308.14898"},{"id":"http://arxiv.org/abs/2308.15873v1","updated":"2023-08-30T08:58:23Z","published":"2023-08-30T08:58:23Z","title":"Minimum Width for Deep, Narrow MLP: A Diffeomorphism and the Whitney\n Embedding Theorem Approach","summary":" Recently, there has been significant attention on determining the minimum\nwidth for the universal approximation property of deep, narrow MLPs. Among\nthese challenges, approximating a continuous function under the uniform norm is\nimportant and challenging, with the gap between its lower and upper bound being\nhard to narrow. In this regard, we propose a novel upper bound for the minimum\nwidth, given by $\\operatorname{max}(2d_x+1, d_y) + \\alpha(\\sigma)$, to achieve\nuniform approximation in deep narrow MLPs, where $0\\leq \\alpha(\\sigma)\\leq 2$\nrepresents the constant depending on the activation function. We demonstrate\nthis bound through two key proofs. First, we establish that deep, narrow MLPs\nwith little additional width can approximate diffeomorphisms. Secondly, we\nutilize the Whitney embedding theorem to show that any continuous function can\nbe approximated by embeddings, further decomposed into linear transformations\nand diffeomorphisms.\n","authors":["Geonho Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15856v1","updated":"2023-08-30T08:46:46Z","published":"2023-08-30T08:46:46Z","title":"Domain Generalization without Excess Empirical Risk","summary":" Given data from diverse sets of distinct distributions, domain generalization\naims to learn models that generalize to unseen distributions. A common approach\nis designing a data-driven surrogate penalty to capture generalization and\nminimize the empirical risk jointly with the penalty. We argue that a\nsignificant failure mode of this recipe is an excess risk due to an erroneous\npenalty or hardness in joint optimization. We present an approach that\neliminates this problem. Instead of jointly minimizing empirical risk with the\npenalty, we minimize the penalty under the constraint of optimality of the\nempirical risk. This change guarantees that the domain generalization penalty\ncannot impair optimization of the empirical risk, i.e., in-distribution\nperformance. To solve the proposed optimization problem, we demonstrate an\nexciting connection to rate-distortion theory and utilize its tools to design\nan efficient method. Our approach can be applied to any penalty-based domain\ngeneralization method, and we demonstrate its effectiveness by applying it to\nthree examplar methods from the literature, showing significant improvements.\n","authors":["Ozan Sener","Vladlen Koltun"],"pdf_url":"https://arxiv.org/pdf/2308.15856v1.pdf","comment":"Published at NeurIPS 2022"},{"id":"http://arxiv.org/abs/2302.08811v2","updated":"2023-08-30T08:23:19Z","published":"2023-02-17T11:09:59Z","title":"G-Signatures: Global Graph Propagation With Randomized Signatures","summary":" Graph neural networks (GNNs) have evolved into one of the most popular deep\nlearning architectures. However, GNNs suffer from over-smoothing node\ninformation and, therefore, struggle to solve tasks where global graph\nproperties are relevant. We introduce G-Signatures, a novel graph learning\nmethod that enables global graph propagation via randomized signatures.\nG-Signatures use a new graph conversion concept to embed graph structured\ninformation which can be interpreted as paths in latent space. We further\nintroduce the idea of latent space path mapping. This allows us to iteratively\ntraverse latent space paths, and, thus globally process information.\nG-Signatures excel at extracting and processing global graph properties, and\neffectively scale to large graph problems. Empirically, we confirm the\nadvantages of G-Signatures at several classification and regression tasks.\n","authors":["Bernhard Schäfl","Lukas Gruber","Johannes Brandstetter","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2302.08811v2.pdf","comment":"7 pages (+ appendix); 4 figures"},{"id":"http://arxiv.org/abs/2308.15840v1","updated":"2023-08-30T08:21:56Z","published":"2023-08-30T08:21:56Z","title":"MSGNN: Multi-scale Spatio-temporal Graph Neural Network for Epidemic\n Forecasting","summary":" Infectious disease forecasting has been a key focus and proved to be crucial\nin controlling epidemic. A recent trend is to develop forecast-ing models based\non graph neural networks (GNNs). However, existing GNN-based methods suffer\nfrom two key limitations: (1) Current models broaden receptive fields by\nscaling the depth of GNNs, which is insuffi-cient to preserve the semantics of\nlong-range connectivity between distant but epidemic related areas. (2)\nPrevious approaches model epidemics within single spatial scale, while ignoring\nthe multi-scale epidemic pat-terns derived from different scales. To address\nthese deficiencies, we devise the Multi-scale Spatio-temporal Graph Neural\nNetwork (MSGNN) based on an innovative multi-scale view. To be specific, in the\nproposed MSGNN model, we first devise a novel graph learning module, which\ndirectly captures long-range connectivity from trans-regional epidemic signals\nand integrates them into a multi-scale graph. Based on the learned multi-scale\ngraph, we utilize a newly designed graph convolution module to exploit\nmulti-scale epidemic patterns. This module allows us to facilitate multi-scale\nepidemic modeling by mining both scale-shared and scale-specific pat-terns.\nExperimental results on forecasting new cases of COVID-19 in United State\ndemonstrate the superiority of our method over state-of-arts. Further analyses\nand visualization also show that MSGNN offers not only accurate, but also\nrobust and interpretable forecasting result.\n","authors":["Mingjie Qiu","Zhiyi Tan","Bing-kun Bao"],"pdf_url":"https://arxiv.org/pdf/2308.15840v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2308.15838v1","updated":"2023-08-30T08:21:46Z","published":"2023-08-30T08:21:46Z","title":"Adaptive Lasso, Transfer Lasso, and Beyond: An Asymptotic Perspective","summary":" This paper presents a comprehensive exploration of the theoretical properties\ninherent in the Adaptive Lasso and the Transfer Lasso. The Adaptive Lasso, a\nwell-established method, employs regularization divided by initial estimators\nand is characterized by asymptotic normality and variable selection\nconsistency. In contrast, the recently proposed Transfer Lasso employs\nregularization subtracted by initial estimators with the demonstrated capacity\nto curtail non-asymptotic estimation errors. A pivotal question thus emerges:\nGiven the distinct ways the Adaptive Lasso and the Transfer Lasso employ\ninitial estimators, what benefits or drawbacks does this disparity confer upon\neach method? This paper conducts a theoretical examination of the asymptotic\nproperties of the Transfer Lasso, thereby elucidating its differentiation from\nthe Adaptive Lasso. Informed by the findings of this analysis, we introduce a\nnovel method, one that amalgamates the strengths and compensates for the\nweaknesses of both methods. The paper concludes with validations of our theory\nand comparisons of the methods via simulation experiments.\n","authors":["Masaaki Takada","Hironori Fujisawa"],"pdf_url":"https://arxiv.org/pdf/2308.15838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11479v2","updated":"2023-08-30T08:21:40Z","published":"2023-01-27T00:51:48Z","title":"Alien Coding","summary":" We introduce a self-learning algorithm for synthesizing programs for OEIS\nsequences. The algorithm starts from scratch initially generating programs at\nrandom. Then it runs many iterations of a self-learning loop that interleaves\n(i) training neural machine translation to learn the correspondence between\nsequences and the programs discovered so far, and (ii) proposing many new\nprograms for each OEIS sequence by the trained neural machine translator. The\nalgorithm discovers on its own programs for more than 78000 OEIS sequences,\nsometimes developing unusual programming methods. We analyze its behavior and\nthe invented programs in several experiments.\n","authors":["Thibault Gauthier","Miroslav Olšák","Josef Urban"],"pdf_url":"https://arxiv.org/pdf/2301.11479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15321v2","updated":"2023-08-30T08:20:30Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then attribute the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v2.pdf","comment":"7 pages, code available soon"},{"id":"http://arxiv.org/abs/2204.09398v2","updated":"2023-08-30T08:18:15Z","published":"2022-04-20T11:43:58Z","title":"Case-Aware Adversarial Training","summary":" The neural network (NN) becomes one of the most heated type of models in\nvarious signal processing applications. However, NNs are extremely vulnerable\nto adversarial examples (AEs). To defend AEs, adversarial training (AT) is\nbelieved to be the most effective method while due to the intensive\ncomputation, AT is limited to be applied in most applications. In this paper,\nto resolve the problem, we design a generic and efficient AT improvement\nscheme, namely case-aware adversarial training (CAT). Specifically, the\nintuition stems from the fact that a very limited part of informative samples\ncan contribute to most of model performance. Alternatively, if only the most\ninformative AEs are used in AT, we can lower the computation complexity of AT\nsignificantly as maintaining the defense effect. To achieve this, CAT achieves\ntwo breakthroughs. First, a method to estimate the information degree of\nadversarial examples is proposed for AE filtering. Second, to further enrich\nthe information that the NN can obtain from AEs, CAT involves a weight\nestimation and class-level balancing based sampling strategy to increase the\ndiversity of AT at each iteration. Extensive experiments show that CAT is\nfaster than vanilla AT by up to 3x while achieving competitive defense effect.\n","authors":["Mingyuan Fan","Yang Liu","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2204.09398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14624v2","updated":"2023-08-30T08:01:22Z","published":"2022-09-29T08:38:30Z","title":"Is Complexity Required for Neural Network Pruning? A Case Study on\n Global Magnitude Pruning","summary":" Pruning neural networks has become popular in the last decade when it was\nshown that a large number of weights can be safely removed from modern neural\nnetworks without compromising accuracy. Numerous pruning methods have been\nproposed since then, each claiming to be better than the previous. Many\nstate-of-the-art (SOTA) techniques today rely on complex pruning methodologies\nutilizing importance scores, getting feedback through back-propagation or\nhaving heuristics-based pruning rules amongst others. In this work, we question\nwhether this pattern of introducing complexity is really necessary to achieve\nbetter pruning results. We benchmark these SOTA techniques against a naive\npruning baseline, namely, Global Magnitude Pruning (Global MP). Global MP ranks\nweights in order of their magnitudes and prunes the smallest ones. Hence, in\nits vanilla form, it is one of the simplest pruning techniques. Surprisingly,\nwe find that vanilla Global MP outperforms all the other SOTA techniques and\nachieves a new SOTA result. It also achieves promising performance on FLOPs\nsparsification, which we find is enhanced, when pruning is conducted in a\ngradual fashion. We also find that Global MP is generalizable across tasks,\ndatasets, and models with superior performance. Moreover, a common issue that\nmany pruning algorithms run into at high sparsity rates, namely,\nlayer-collapse, can be easily fixed in Global MP by setting a minimum threshold\nof weights to be retained in each layer. Lastly, unlike many other SOTA\ntechniques, Global MP does not require any additional algorithm specific\nhyper-parameters and is very straightforward to tune and implement. We showcase\nour findings on various models (WRN-28-8, ResNet-32, ResNet-50, MobileNet-V1\nand FastGRNN) and multiple datasets (CIFAR-10, ImageNet and HAR-2). Code is\navailable at https://github.com/manasgupta-1/GlobalMP.\n","authors":["Manas Gupta","Efe Camci","Vishandi Rudy Keneta","Abhishek Vaidyanathan","Ritwik Kanodia","Chuan-Sheng Foo","Wu Min","Lin Jie"],"pdf_url":"https://arxiv.org/pdf/2209.14624v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15821v1","updated":"2023-08-30T07:46:32Z","published":"2023-08-30T07:46:32Z","title":"Federated Two Stage Decoupling With Adaptive Personalization Layers","summary":" Federated learning has gained significant attention due to its groundbreaking\nability to enable distributed learning while maintaining privacy constraints.\nHowever, as a consequence of data heterogeneity among decentralized devices, it\ninherently experiences significant learning degradation and slow convergence\nspeed. Therefore, it is natural to employ the concept of clustering homogeneous\nclients into the same group, allowing only the model weights within each group\nto be aggregated. While most existing clustered federated learning methods\nemploy either model gradients or inference outputs as metrics for client\npartitioning, with the goal of grouping similar devices together, may still\nhave heterogeneity within each cluster. Moreover, there is a scarcity of\nresearch exploring the underlying reasons for determining the appropriate\ntiming for clustering, resulting in the common practice of assigning each\nclient to its own individual cluster, particularly in the context of highly non\nindependent and identically distributed (Non-IID) data. In this paper, we\nintroduce a two-stage decoupling federated learning algorithm with adaptive\npersonalization layers named FedTSDP, where client clustering is performed\ntwice according to inference outputs and model weights, respectively. Hopkins\namended sampling is adopted to determine the appropriate timing for clustering\nand the sampling weight of public unlabeled data. In addition, a simple yet\neffective approach is developed to adaptively adjust the personalization layers\nbased on varying degrees of data skew. Experimental results show that our\nproposed method has reliable performance on both IID and non-IID scenarios.\n","authors":["Hangyu Zhu","Yuxiang Fan","Zhenping Xie"],"pdf_url":"https://arxiv.org/pdf/2308.15821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18415v2","updated":"2023-08-30T07:39:14Z","published":"2023-05-28T18:48:50Z","title":"Geometric Algebra Transformers","summary":" Problems involving geometric data arise in physics, chemistry, robotics,\ncomputer vision, and many other fields. Such data can take numerous forms, such\nas points, direction vectors, translations, or rotations, but to date there is\nno single architecture that can be applied to such a wide variety of geometric\ntypes while respecting their symmetries. In this paper we introduce the\nGeometric Algebra Transformer (GATr), a general-purpose architecture for\ngeometric data. GATr represents inputs, outputs, and hidden states in the\nprojective geometric (or Clifford) algebra, which offers an efficient\n16-dimensional vector-space representation of common geometric objects as well\nas operators acting on them. GATr is equivariant with respect to E(3), the\nsymmetry group of 3D Euclidean space. As a Transformer, GATr is versatile,\nefficient, and scalable. We demonstrate GATr in problems from n-body modeling\nto wall-shear-stress estimation on large arterial meshes to robotic motion\nplanning. GATr consistently outperforms both non-geometric and equivariant\nbaselines in terms of error, data efficiency, and scalability.\n","authors":["Johann Brehmer","Pim de Haan","Sönke Behrends","Taco Cohen"],"pdf_url":"https://arxiv.org/pdf/2305.18415v2.pdf","comment":"v2: more experiments, more baselines"},{"id":"http://arxiv.org/abs/2308.15812v1","updated":"2023-08-30T07:35:32Z","published":"2023-08-30T07:35:32Z","title":"Peering Through Preferences: Unraveling Feedback Acquisition for\n Aligning Large Language Models","summary":" Aligning large language models (LLMs) with human values and intents\ncritically involves the use of human or AI feedback. While dense feedback\nannotations are expensive to acquire and integrate, sparse feedback presents a\nstructural design choice between ratings (e.g., score Response A on a scale of\n1-7) and rankings (e.g., is Response A better than Response B?). In this work,\nwe analyze the effect of this design choice for the alignment and evaluation of\nLLMs. We uncover an inconsistency problem wherein the preferences inferred from\nratings and rankings significantly disagree 60% for both human and AI\nannotators. Our subsequent analysis identifies various facets of annotator\nbiases that explain this phenomena, such as human annotators would rate denser\nresponses higher while preferring accuracy during pairwise judgments. To our\nsurprise, we also observe that the choice of feedback protocol also has a\nsignificant effect on the evaluation of aligned LLMs. In particular, we find\nthat LLMs that leverage rankings data for alignment (say model X) are preferred\nover those that leverage ratings data (say model Y), with a rank-based\nevaluation protocol (is X/Y's response better than reference response?) but not\nwith a rating-based evaluation protocol (score Rank X/Y's response on a scale\nof 1-7). Our findings thus shed light on critical gaps in methods for\nevaluating the real-world utility of language models and their strong\ndependence on the feedback protocol used for alignment. Our code and data are\navailable at https://github.com/Hritikbansal/sparse_feedback.\n","authors":["Hritik Bansal","John Dang","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2308.15812v1.pdf","comment":"24 pages, 12 Tables, 3 Figures"},{"id":"http://arxiv.org/abs/2212.04614v4","updated":"2023-08-30T07:30:28Z","published":"2022-12-09T00:43:49Z","title":"Is Bio-Inspired Learning Better than Backprop? Benchmarking Bio Learning\n vs. Backprop","summary":" Bio-inspired learning has been gaining popularity recently given that\nBackpropagation (BP) is not considered biologically plausible. Many algorithms\nhave been proposed in the literature which are all more biologically plausible\nthan BP. However, apart from overcoming the biological implausibility of BP, a\nstrong motivation for using Bio-inspired algorithms remains lacking. In this\nstudy, we undertake a holistic comparison of BP vs. multiple Bio-inspired\nalgorithms to answer the question of whether Bio-learning offers additional\nbenefits over BP. We test Bio-algorithms under different design choices such as\naccess to only partial training data, resource constraints in terms of the\nnumber of training epochs, sparsification of the neural network parameters and\naddition of noise to input samples. Through these experiments, we notably find\ntwo key advantages of Bio-algorithms over BP. Firstly, Bio-algorithms perform\nmuch better than BP when the entire training dataset is not supplied. Four of\nthe five Bio-algorithms tested outperform BP by upto 5% accuracy when only 20%\nof the training dataset is available. Secondly, even when the full dataset is\navailable, Bio-algorithms learn much quicker and converge to a stable accuracy\nin far lesser training epochs than BP. Hebbian learning, specifically, is able\nto learn in just 5 epochs compared to around 100 epochs required by BP. These\ninsights present practical reasons for utilising Bio-learning beyond just their\nbiological plausibility and also point towards interesting new directions for\nfuture work on Bio-learning.\n","authors":["Manas Gupta","Sarthak Ketanbhai Modi","Hang Zhang","Joon Hei Lee","Joo Hwee Lim"],"pdf_url":"https://arxiv.org/pdf/2212.04614v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15793v1","updated":"2023-08-30T06:53:24Z","published":"2023-08-30T06:53:24Z","title":"HAlf-MAsked Model for Named Entity Sentiment analysis","summary":" Named Entity Sentiment analysis (NESA) is one of the most actively developing\napplication domains in Natural Language Processing (NLP). Social media NESA is\na significant field of opinion analysis since detecting and tracking sentiment\ntrends in the news flow is crucial for building various analytical systems and\nmonitoring the media image of specific people or companies. In this paper, we\nstudy different transformers-based solutions NESA in RuSentNE-23 evaluation.\nDespite the effectiveness of the BERT-like models, they can still struggle with\ncertain challenges, such as overfitting, which appeared to be the main obstacle\nin achieving high accuracy on the RuSentNE-23 data. We present several\napproaches to overcome this problem, among which there is a novel technique of\nadditional pass over given data with masked entity before making the final\nprediction so that we can combine logits from the model when it knows the exact\nentity it predicts sentiment for and when it does not. Utilizing this\ntechnique, we ensemble multiple BERT- like models trained on different subsets\nof data to improve overall performance. Our proposed model achieves the best\nresult on RuSentNE-23 evaluation data and demonstrates improved consistency in\nentity-level sentiment analysis.\n","authors":["Anton Kabaev","Pavel Podberezko","Andrey Kaznacheev","Sabina Abdullayeva"],"pdf_url":"https://arxiv.org/pdf/2308.15793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.14417v5","updated":"2023-08-30T06:40:56Z","published":"2021-12-29T06:43:29Z","title":"Control Theoretic Analysis of Temporal Difference Learning","summary":" The goal of this manuscript is to conduct a controltheoretic analysis of\nTemporal Difference (TD) learning algorithms. TD-learning serves as a\ncornerstone in the realm of reinforcement learning, offering a methodology for\napproximating the value function associated with a given policy in a Markov\nDecision Process. Despite several existing works that have contributed to the\ntheoretical understanding of TD-learning, it is only in recent years that\nresearchers have been able to establish concrete guarantees on its statistical\nefficiency. In this paper, we introduce a finite-time, control-theoretic\nframework for analyzing TD-learning, leveraging established concepts from the\nfield of linear systems control. Consequently, this paper provides additional\ninsights into the mechanics of TD learning and the broader landscape of\nreinforcement learning, all while employing straightforward analytical tools\nderived from control theory.\n","authors":["Donghwan Lee","Do Wan Kim"],"pdf_url":"https://arxiv.org/pdf/2112.14417v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15786v1","updated":"2023-08-30T06:36:32Z","published":"2023-08-30T06:36:32Z","title":"FedCiR: Client-Invariant Representation Learning for Federated Non-IID\n Features","summary":" Federated learning (FL) is a distributed learning paradigm that maximizes the\npotential of data-driven models for edge devices without sharing their raw\ndata. However, devices often have non-independent and identically distributed\n(non-IID) data, meaning their local data distributions can vary significantly.\nThe heterogeneity in input data distributions across devices, commonly referred\nto as the feature shift problem, can adversely impact the training convergence\nand accuracy of the global model. To analyze the intrinsic causes of the\nfeature shift problem, we develop a generalization error bound in FL, which\nmotivates us to propose FedCiR, a client-invariant representation learning\nframework that enables clients to extract informative and client-invariant\nfeatures. Specifically, we improve the mutual information term between\nrepresentations and labels to encourage representations to carry essential\nclassification knowledge, and diminish the mutual information term between the\nclient set and representations conditioned on labels to promote representations\nof clients to be client-invariant. We further incorporate two regularizers into\nthe FL framework to bound the mutual information terms with an approximate\nglobal representation distribution to compensate for the absence of the\nground-truth global representation distribution, thus achieving informative and\nclient-invariant feature extraction. To achieve global representation\ndistribution approximation, we propose a data-free mechanism performed by the\nserver without compromising privacy. Extensive experiments demonstrate the\neffectiveness of our approach in achieving client-invariant representation\nlearning and solving the data heterogeneity issue.\n","authors":["Zijian Li","Zehong Lin","Jiawei Shao","Yuyi Mao","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.15786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15783v1","updated":"2023-08-30T06:28:42Z","published":"2023-08-30T06:28:42Z","title":"Split Without a Leak: Reducing Privacy Leakage in Split Learning","summary":" The popularity of Deep Learning (DL) makes the privacy of sensitive data more\nimperative than ever. As a result, various privacy-preserving techniques have\nbeen implemented to preserve user data privacy in DL. Among various\nprivacy-preserving techniques, collaborative learning techniques, such as Split\nLearning (SL) have been utilized to accelerate the learning and prediction\nprocess. Initially, SL was considered a promising approach to data privacy.\nHowever, subsequent research has demonstrated that SL is susceptible to many\ntypes of attacks and, therefore, it cannot serve as a privacy-preserving\ntechnique. Meanwhile, countermeasures using a combination of SL and encryption\nhave also been introduced to achieve privacy-preserving deep learning. In this\nwork, we propose a hybrid approach using SL and Homomorphic Encryption (HE).\nThe idea behind it is that the client encrypts the activation map (the output\nof the split layer between the client and the server) before sending it to the\nserver. Hence, during both forward and backward propagation, the server cannot\nreconstruct the client's input data from the intermediate activation map. This\nimprovement is important as it reduces privacy leakage compared to other\nSL-based works, where the server can gain valuable information about the\nclient's input. In addition, on the MIT-BIH dataset, our proposed hybrid\napproach using SL and HE yields faster training time (about 6 times) and\nsignificantly reduced communication overhead (almost 160 times) compared to\nother HE-based approaches, thereby offering improved privacy protection for\nsensitive data in DL.\n","authors":["Khoa Nguyen","Tanveer Khan","Antonis Michalas"],"pdf_url":"https://arxiv.org/pdf/2308.15783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.01710v4","updated":"2023-08-30T05:08:47Z","published":"2020-11-03T13:54:01Z","title":"BCGGAN: Ballistocardiogram artifact removal in simultaneous EEG-fMRI\n using generative adversarial network","summary":" Due to its advantages of high temporal and spatial resolution, the technology\nof simultaneous electroencephalogram-functional magnetic resonance imaging\n(EEG-fMRI) acquisition and analysis has attracted much attention, and has been\nwidely used in various research fields of brain science. However, during the\nfMRI of the brain, ballistocardiogram (BCG) artifacts can seriously contaminate\nthe EEG. As an unpaired problem, BCG artifact removal now remains a\nconsiderable challenge. Aiming to provide a solution, this paper proposed a\nnovel modular generative adversarial network (GAN) and corresponding training\nstrategy to improve the network performance by optimizing the parameters of\neach module. In this manner, we hope to improve the local representation\nability of the network model, thereby improving its overall performance and\nobtaining a reliable generator for BCG artifact removal. Moreover, the proposed\nmethod does not rely on additional reference signal or complex hardware\nequipment. Experimental results show that, compared with multiple methods, the\ntechnique presented in this paper can remove the BCG artifact more effectively\nwhile retaining essential EEG information.\n","authors":["Guang Lin","Jianhai Zhang","Yuxi Liu","Tianyang Gao","Wanzeng Kong","Xu Lei","Tao Qiu"],"pdf_url":"https://arxiv.org/pdf/2011.01710v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04027v3","updated":"2023-08-30T04:55:04Z","published":"2023-04-08T14:40:35Z","title":"Estimating 3D Dental Structures using Simulated Panoramic Radiographs\n and Neural Ray Tracing","summary":" Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality\nfor dental examination. However, PX only provides a flattened 2D image, lacking\nin a 3D view of the oral structure. In this paper, we propose a framework to\nestimate 3D oral structures from real-world PX. Our framework tackles full 3D\nreconstruction for varying subjects (patients) where each reconstruction is\nbased only on a single panoramic image. We create an intermediate\nrepresentation called simulated PX (SimPX) from 3D Cone-beam computed\ntomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and\nrotational principles of PX imaging. SimPX aims at not only truthfully\nsimulating PX, but also facilitates the reverting process back to 3D data. We\npropose a novel neural model based on ray tracing which exploits both global\nand local input features to convert SimPX to 3D output. At inference, a real PX\nimage is translated to a SimPX-style image with semantic regularization, and\nthe translated image is processed by generation module to produce high-quality\noutputs. Experiments show that our method outperforms prior state-of-the-art in\nreconstruction tasks both quantitatively and qualitatively. Unlike prior\nmethods, Our method does not require any prior information such as the shape of\ndental arches, nor the matched PX-CBCT dataset for training, which is difficult\nto obtain in clinical practice.\n","authors":["Sihwa Park","Seongjun Kim","Doeyoung Kwon","Yohan Jang","In-Seok Song","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2304.04027v3.pdf","comment":"20 pages, 16 figures"},{"id":"http://arxiv.org/abs/2306.01762v2","updated":"2023-08-30T04:53:15Z","published":"2023-05-27T06:00:51Z","title":"Pre-trained transformer for adversarial purification","summary":" With more and more deep neural networks being deployed as various daily\nservices, their reliability is essential. It's frightening that deep neural\nnetworks are vulnerable and sensitive to adversarial attacks, the most common\none of which for the services is evasion-based. Recent works usually strengthen\nthe robustness by adversarial training or leveraging the knowledge of an amount\nof clean data. However, in practical terms, retraining and redeploying the\nmodel need a large computational budget, leading to heavy losses to the online\nservice. In addition, when adversarial examples of a certain attack are\ndetected, only limited adversarial examples are available for the service\nprovider, while much clean data may not be accessible. Given the mentioned\nproblems, we propose a new scenario, RaPiD (Rapid Plug-in Defender), which is\nto rapidly defend against a certain attack for the frozen original service\nmodel with limitations of few clean and adversarial examples. Motivated by the\ngeneralization and the universal computation ability of pre-trained transformer\nmodels, we come up with a new defender method, CeTaD, which stands for\nConsidering Pre-trained Transformers as Defenders. In particular, we evaluate\nthe effectiveness and the transferability of CeTaD in the case of one-shot\nadversarial examples and explore the impact of different parts of CeTaD as well\nas training data conditions. CeTaD is flexible, able to be embedded into an\narbitrary differentiable model, and suitable for various types of attacks.\n","authors":["Kai Wu","Yujian Betterest Li","Xiaoyu Zhang","Handing Wang","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2306.01762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10235v4","updated":"2023-08-30T04:32:36Z","published":"2023-05-15T15:44:51Z","title":"Assessing Hidden Risks of LLMs: An Empirical Study on Robustness,\n Consistency, and Credibility","summary":" The recent popularity of large language models (LLMs) has brought a\nsignificant impact to boundless fields, particularly through their open-ended\necosystem such as the APIs, open-sourced models, and plugins. However, with\ntheir widespread deployment, there is a general lack of research that\nthoroughly discusses and analyzes the potential risks concealed. In that case,\nwe intend to conduct a preliminary but pioneering study covering the\nrobustness, consistency, and credibility of LLMs systems. With most of the\nrelated literature in the era of LLM uncharted, we propose an automated\nworkflow that copes with an upscaled number of queries/responses. Overall, we\nconduct over a million queries to the mainstream LLMs including ChatGPT, LLaMA,\nand OPT. Core to our workflow consists of a data primitive, followed by an\nautomated interpreter that evaluates these LLMs under different adversarial\nmetrical systems. As a result, we draw several, and perhaps unfortunate,\nconclusions that are quite uncommon from this trendy community. Briefly, they\nare: (i)-the minor but inevitable error occurrence in the user-generated query\ninput may, by chance, cause the LLM to respond unexpectedly; (ii)-LLMs possess\npoor consistency when processing semantically similar query input. In addition,\nas a side finding, we find that ChatGPT is still capable to yield the correct\nanswer even when the input is polluted at an extreme level. While this\nphenomenon demonstrates the powerful memorization of the LLMs, it raises\nserious concerns about using such data for LLM-involved evaluation in academic\ndevelopment. To deal with it, we propose a novel index associated with a\ndataset that roughly decides the feasibility of using such data for\nLLM-involved evaluation. Extensive empirical studies are tagged to support the\naforementioned claims.\n","authors":["Wentao Ye","Mingfeng Ou","Tianyi Li","Yipeng chen","Xuetao Ma","Yifan Yanggong","Sai Wu","Jie Fu","Gang Chen","Haobo Wang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2305.10235v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03188v2","updated":"2023-08-30T03:47:34Z","published":"2023-08-06T18:38:52Z","title":"Automatically Correcting Large Language Models: Surveying the landscape\n of diverse self-correction strategies","summary":" Large language models (LLMs) have demonstrated remarkable performance across\na wide array of NLP tasks. However, their efficacy is undermined by undesired\nand inconsistent behaviors, including hallucination, unfaithful reasoning, and\ntoxic content. A promising approach to rectify these flaws is self-correction,\nwhere the LLM itself is prompted or guided to fix problems in its own output.\nTechniques leveraging automated feedback -- either produced by the LLM itself\nor some external system -- are of particular interest as they are a promising\nway to make LLM-based solutions more practical and deployable with minimal\nhuman feedback. This paper presents a comprehensive review of this emerging\nclass of techniques. We analyze and taxonomize a wide array of recent work\nutilizing these strategies, including training-time, generation-time, and\npost-hoc correction. We also summarize the major applications of this strategy\nand conclude by discussing future directions and challenges.\n","authors":["Liangming Pan","Michael Saxon","Wenda Xu","Deepak Nathani","Xinyi Wang","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03188v2.pdf","comment":"Work in Progress. Version 2"},{"id":"http://arxiv.org/abs/2308.15734v1","updated":"2023-08-30T03:21:45Z","published":"2023-08-30T03:21:45Z","title":"Efficient and Explainable Graph Neural Architecture Search via\n Monte-Carlo Tree Search","summary":" Graph neural networks (GNNs) are powerful tools for performing data science\ntasks in various domains. Although we use GNNs in wide application scenarios,\nit is a laborious task for researchers and practitioners to design/select\noptimal GNN rchitectures in diverse graphs. To save human efforts and\ncomputational costs, graph neural architecture search (Graph NAS) has been used\nto search for a sub-optimal GNN architecture that combines existing components.\nHowever, there are no existing Graph NAS methods that satisfy explainability,\nefficiency, and adaptability to various graphs. Therefore, we propose an\nefficient and explainable Graph NAS method, called ExGNAS, which consists of\n(i) a simple search space that can adapt to various graphs and (ii) a search\nalgorithm that makes the decision process explainable. The search space\nincludes only fundamental functions that can handle homophilic and heterophilic\ngraphs. The search algorithm efficiently searches for the best GNN architecture\nvia Monte-Carlo tree search without neural models. The combination of our\nsearch space and algorithm achieves finding accurate GNN models and the\nimportant functions within the search space. We comprehensively evaluate our\nmethod compared with twelve hand-crafted GNN architectures and three Graph NAS\nmethods in four graphs. Our experimental results show that ExGNAS increases AUC\nup to 3.6 and reduces run time up to 78\\% compared with the state-of-the-art\nGraph NAS methods. Furthermore, we show ExGNAS is effective in analyzing the\ndifference between GNN architectures in homophilic and heterophilic graphs.\n","authors":["Yuya Sasaki"],"pdf_url":"https://arxiv.org/pdf/2308.15734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15730v1","updated":"2023-08-30T03:14:02Z","published":"2023-08-30T03:14:02Z","title":"Fully Embedded Time-Series Generative Adversarial Networks","summary":" Generative Adversarial Networks (GANs) should produce synthetic data that\nfits the underlying distribution of the data being modeled. For real valued\ntime-series data, this implies the need to simultaneously capture the static\ndistribution of the data, but also the full temporal distribution of the data\nfor any potential time horizon. This temporal element produces a more complex\nproblem that can potentially leave current solutions under-constrained,\nunstable during training, or prone to varying degrees of mode collapse. In\nFETSGAN, entire sequences are translated directly to the generator's sampling\nspace using a seq2seq style adversarial auto encoder (AAE), where adversarial\ntraining is used to match the training distribution in both the feature space\nand the lower dimensional sampling space. This additional constraint provides a\nloose assurance that the temporal distribution of the synthetic samples will\nnot collapse. In addition, the First Above Threshold (FAT) operator is\nintroduced to supplement the reconstruction of encoded sequences, which\nimproves training stability and the overall quality of the synthetic data being\ngenerated. These novel contributions demonstrate a significant improvement to\nthe current state of the art for adversarial learners in qualitative measures\nof temporal similarity and quantitative predictive ability of data generated\nthrough FETSGAN.\n","authors":["Joe Beck","Subhadeep Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2308.15730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07543v4","updated":"2023-08-30T03:12:34Z","published":"2023-03-14T00:13:57Z","title":"WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant\n Analysis","summary":" Deep neural networks are susceptible to generating overconfident yet\nerroneous predictions when presented with data beyond known concepts. This\nchallenge underscores the importance of detecting out-of-distribution (OOD)\nsamples in the open world. In this work, we propose a novel feature-space OOD\ndetection score based on class-specific and class-agnostic information.\nSpecifically, the approach utilizes Whitened Linear Discriminant Analysis to\nproject features into two subspaces - the discriminative and residual subspaces\n- for which the in-distribution (ID) classes are maximally separated and\nclosely clustered, respectively. The OOD score is then determined by combining\nthe deviation from the input data to the ID pattern in both subspaces. The\nefficacy of our method, named WDiscOOD, is verified on the large-scale\nImageNet-1k benchmark, with six OOD datasets that cover a variety of\ndistribution shifts. WDiscOOD demonstrates superior performance on deep\nclassifiers with diverse backbone architectures, including CNN and vision\ntransformer. Furthermore, we also show that WDiscOOD more effectively detects\nnovel concepts in representation spaces trained with contrastive objectives,\nincluding supervised contrastive loss and multi-modality contrastive loss.\n","authors":["Yiye Chen","Yunzhi Lin","Ruinian Xu","Patricio A. Vela"],"pdf_url":"https://arxiv.org/pdf/2303.07543v4.pdf","comment":"Accepted by ICCV 2023. Code is available at:\n https://github.com/ivalab/WDiscOOD.git"},{"id":"http://arxiv.org/abs/2308.15720v1","updated":"2023-08-30T02:50:54Z","published":"2023-08-30T02:50:54Z","title":"Surrogate-based Autotuning for Randomized Sketching Algorithms in\n Regression Problems","summary":" Algorithms from Randomized Numerical Linear Algebra (RandNLA) are known to be\neffective in handling high-dimensional computational problems, providing\nhigh-quality empirical performance as well as strong probabilistic guarantees.\nHowever, their practical application is complicated by the fact that the user\nneeds to set various algorithm-specific tuning parameters which are different\nthan those used in traditional NLA. This paper demonstrates how a\nsurrogate-based autotuning approach can be used to address fundamental problems\nof parameter selection in RandNLA algorithms. In particular, we provide a\ndetailed investigation of surrogate-based autotuning for\nsketch-and-precondition (SAP) based randomized least squares methods, which\nhave been one of the great success stories in modern RandNLA. Empirical results\nshow that our surrogate-based autotuning approach can achieve near-optimal\nperformance with much less tuning cost than a random search (up to about 4x\nfewer trials of different parameter configurations). Moreover, while our\nexperiments focus on least squares, our results demonstrate a general-purpose\nautotuning pipeline applicable to any kind of RandNLA algorithm.\n","authors":["Younghyun Cho","James W. Demmel","Michał Dereziński","Haoyun Li","Hengrui Luo","Michael W. Mahoney","Riley J. Murray"],"pdf_url":"https://arxiv.org/pdf/2308.15720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08108v2","updated":"2023-08-30T02:43:29Z","published":"2022-12-15T19:49:27Z","title":"Dataflow Analysis-Inspired Deep Learning for Efficient Vulnerability\n Detection","summary":" Deep learning-based vulnerability detection has shown great performance and,\nin some studies, outperformed static analysis tools. However, the\nhighest-performing approaches use token-based transformer models, which are not\nthe most efficient to capture code semantics required for vulnerability\ndetection. Classical program analysis techniques such as dataflow analysis can\ndetect many types of bugs based on their root causes. In this paper, we propose\nto combine such causal-based vulnerability detection algorithms with deep\nlearning, aiming to achieve more efficient and effective vulnerability\ndetection. Specifically, we designed DeepDFA, a dataflow analysis-inspired\ngraph learning framework and an embedding technique that enables graph learning\nto simulate dataflow computation. We show that DeepDFA is both performant and\nefficient. DeepDFA outperformed all non-transformer baselines. It was trained\nin 9 minutes, 75x faster than the highest-performing baseline model. When using\nonly 50+ vulnerable and several hundreds of total examples as training data,\nthe model retained the same performance as 100% of the dataset. DeepDFA also\ngeneralized to real-world vulnerabilities in DBGBench; it detected 8.7 out of\n17 vulnerabilities on average across folds and was able to distinguish between\npatched and buggy versions, while the highest-performing baseline models did\nnot detect any vulnerabilities. By combining DeepDFA with a large language\nmodel, we surpassed the state-of-the-art vulnerability detection performance on\nthe Big-Vul dataset with 96.46 F1 score, 97.82 precision, and 95.14 recall. Our\nreplication package is located at https://figshare.com/s/e7953b4d345b00990d17.\n","authors":["Benjamin Steenhoek","Hongyang Gao","Wei Le"],"pdf_url":"https://arxiv.org/pdf/2212.08108v2.pdf","comment":"11 pages, 9 figures. Accepted as a conference paper at ICSE 2024"},{"id":"http://arxiv.org/abs/2308.15712v1","updated":"2023-08-30T02:24:09Z","published":"2023-08-30T02:24:09Z","title":"Exploring Deep Learning for Full-disk Solar Flare Prediction with\n Empirical Insights from Guided Grad-CAM Explanations","summary":" This study progresses solar flare prediction research by presenting a\nfull-disk deep-learning model to forecast $\\geq$M-class solar flares and\nevaluating its efficacy on both central (within $\\pm$70$^\\circ$) and near-limb\n(beyond $\\pm$70$^\\circ$) events, showcasing qualitative assessment of post hoc\nexplanations for the model's predictions, and providing empirical findings from\nhuman-centered quantitative assessments of these explanations. Our model is\ntrained using hourly full-disk line-of-sight magnetogram images to predict\n$\\geq$M-class solar flares within the subsequent 24-hour prediction window.\nAdditionally, we apply the Guided Gradient-weighted Class Activation Mapping\n(Guided Grad-CAM) attribution method to interpret our model's predictions and\nevaluate the explanations. Our analysis unveils that full-disk solar flare\npredictions correspond with active region characteristics. The following points\nrepresent the most important findings of our study: (1) Our deep learning\nmodels achieved an average true skill statistic (TSS) of $\\sim$0.51 and a\nHeidke skill score (HSS) of $\\sim$0.38, exhibiting skill to predict solar\nflares where for central locations the average recall is $\\sim$0.75 (recall\nvalues for X- and M-class are 0.95 and 0.73 respectively) and for the near-limb\nflares the average recall is $\\sim$0.52 (recall values for X- and M-class are\n0.74 and 0.50 respectively); (2) qualitative examination of the model's\nexplanations reveals that it discerns and leverages features linked to active\nregions in both central and near-limb locations within full-disk magnetograms\nto produce respective predictions. In essence, our models grasp the shape and\ntexture-based properties of flaring active regions, even in proximity to limb\nareas -- a novel and essential capability with considerable significance for\noperational forecasting systems.\n","authors":["Chetraj Pandey","Anli Ji","Trisha Nandakumar","Rafal A. Angryk","Berkay Aydin"],"pdf_url":"https://arxiv.org/pdf/2308.15712v1.pdf","comment":"This is a preprint accepted at the 10th IEEE International Conference\n On Data Science And Advanced Analytics (DSAA 2023). The conference\n proceedings will be published by the IEEE Xplore Digital Library with ISBN:\n 979-8-3503-4503-2. 10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.15710v1","updated":"2023-08-30T02:14:49Z","published":"2023-08-30T02:14:49Z","title":"Speech Wikimedia: A 77 Language Multilingual Speech Dataset","summary":" The Speech Wikimedia Dataset is a publicly available compilation of audio\nwith transcriptions extracted from Wikimedia Commons. It includes 1780 hours\n(195 GB) of CC-BY-SA licensed transcribed speech from a diverse set of\nscenarios and speakers, in 77 different languages. Each audio file has one or\nmore transcriptions in different languages, making this dataset suitable for\ntraining speech recognition, speech translation, and machine translation\nmodels.\n","authors":["Rafael Mosquera Gómez","Julián Eusse","Juan Ciro","Daniel Galvez","Ryan Hileman","Kurt Bollacker","David Kanter"],"pdf_url":"https://arxiv.org/pdf/2308.15710v1.pdf","comment":"Data-Centric Machine Learning Workshop at the International Machine\n Learning Conference 2023 (ICML)"},{"id":"http://arxiv.org/abs/2308.15709v1","updated":"2023-08-30T02:12:00Z","published":"2023-08-30T02:12:00Z","title":"Threshold KNN-Shapley: A Linear-Time and Privacy-Friendly Approach to\n Data Valuation","summary":" Data valuation, a critical aspect of data-centric ML research, aims to\nquantify the usefulness of individual data sources in training machine learning\n(ML) models. However, data valuation faces significant yet frequently\noverlooked privacy challenges despite its importance. This paper studies these\nchallenges with a focus on KNN-Shapley, one of the most practical data\nvaluation methods nowadays. We first emphasize the inherent privacy risks of\nKNN-Shapley, and demonstrate the significant technical difficulties in adapting\nKNN-Shapley to accommodate differential privacy (DP). To overcome these\nchallenges, we introduce TKNN-Shapley, a refined variant of KNN-Shapley that is\nprivacy-friendly, allowing for straightforward modifications to incorporate DP\nguarantee (DP-TKNN-Shapley). We show that DP-TKNN-Shapley has several\nadvantages and offers a superior privacy-utility tradeoff compared to naively\nprivatized KNN-Shapley in discerning data quality. Moreover, even non-private\nTKNN-Shapley achieves comparable performance as KNN-Shapley. Overall, our\nfindings suggest that TKNN-Shapley is a promising alternative to KNN-Shapley,\nparticularly for real-world applications involving sensitive data.\n","authors":["Jiachen T. Wang","Yuqing Zhu","Yu-Xiang Wang","Ruoxi Jia","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2308.15709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11197v2","updated":"2023-08-30T02:07:28Z","published":"2023-08-22T05:14:42Z","title":"Toward Generalizable Machine Learning Models in Speech, Language, and\n Hearing Sciences: Sample Size Estimation and Reducing Overfitting","summary":" This study's first purpose is to provide quantitative evidence that would\nincentivize researchers to instead use the more robust method of nested\ncross-validation. The second purpose is to present methods and MATLAB codes for\ndoing power analysis for ML-based analysis during the design of a study. Monte\nCarlo simulations were used to quantify the interactions between the employed\ncross-validation method, the discriminative power of features, the\ndimensionality of the feature space, and the dimensionality of the model. Four\ndifferent cross-validations (single holdout, 10-fold, train-validation-test,\nand nested 10-fold) were compared based on the statistical power and\nstatistical confidence of the ML models. Distributions of the null and\nalternative hypotheses were used to determine the minimum required sample size\nfor obtaining a statistically significant outcome ({\\alpha}=0.05,\n1-\\b{eta}=0.8). Statistical confidence of the model was defined as the\nprobability of correct features being selected and hence being included in the\nfinal model. Our analysis showed that the model generated based on the single\nholdout method had very low statistical power and statistical confidence and\nthat it significantly overestimated the accuracy. Conversely, the nested\n10-fold cross-validation resulted in the highest statistical confidence and the\nhighest statistical power, while providing an unbiased estimate of the\naccuracy. The required sample size with a single holdout could be 50% higher\nthan what would be needed if nested cross-validation were used. Confidence in\nthe model based on nested cross-validation was as much as four times higher\nthan the confidence in the single holdout-based model. A computational model,\nMATLAB codes, and lookup tables are provided to assist researchers with\nestimating the sample size during the design of their future studies.\n","authors":["Hamzeh Ghasemzadeh","Robert E. Hillman","Daryush D. Mehta"],"pdf_url":"https://arxiv.org/pdf/2308.11197v2.pdf","comment":"Under review at JSLHR"},{"id":"http://arxiv.org/abs/2308.15704v1","updated":"2023-08-30T01:59:42Z","published":"2023-08-30T01:59:42Z","title":"Towards a Rigorous Analysis of Mutual Information in Contrastive\n Learning","summary":" Contrastive learning has emerged as a cornerstone in recent achievements of\nunsupervised representation learning. Its primary paradigm involves an instance\ndiscrimination task with a mutual information loss. The loss is known as\nInfoNCE and it has yielded vital insights into contrastive learning through the\nlens of mutual information analysis. However, the estimation of mutual\ninformation can prove challenging, creating a gap between the elegance of its\nmathematical foundation and the complexity of its estimation. As a result,\ndrawing rigorous insights or conclusions from mutual information analysis\nbecomes intricate. In this study, we introduce three novel methods and a few\nrelated theorems, aimed at enhancing the rigor of mutual information analysis.\nDespite their simplicity, these methods can carry substantial utility.\nLeveraging these approaches, we reassess three instances of contrastive\nlearning analysis, illustrating their capacity to facilitate deeper\ncomprehension or to rectify pre-existing misconceptions. Specifically, we\ninvestigate small batch size, mutual information as a measure, and the InfoMin\nprinciple.\n","authors":["Kyungeun Lee","Jaeill Kim","Suhyun Kang","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2308.15704v1.pdf","comment":"18 pages, 7 figures, Under review"},{"id":"http://arxiv.org/abs/2308.15703v1","updated":"2023-08-30T01:56:57Z","published":"2023-08-30T01:56:57Z","title":"Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling\n Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate\n Prediction","summary":" Spatial-temporal information has been proven to be of great significance for\nclick-through rate prediction tasks in online Location-Based Services (LBS),\nespecially in mainstream food ordering platforms such as DoorDash, Uber Eats,\nMeituan, and Ele.me. Modeling user spatial-temporal preferences with sequential\nbehavior data has become a hot topic in recommendation systems and online\nadvertising. However, most of existing methods either lack the representation\nof rich spatial-temporal information or only handle user behaviors with limited\nlength, e.g. 100. In this paper, we tackle these problems by designing a new\nspatial-temporal modeling paradigm named Fragment and Integrate Network (FIN).\nFIN consists of two networks: (i) Fragment Network (FN) extracts Multiple\nSub-Sequences (MSS) from lifelong sequential behavior data, and captures the\nspecific spatial-temporal representation by modeling each MSS respectively.\nHere both a simplified attention and a complicated attention are adopted to\nbalance the performance gain and resource consumption. (ii) Integrate Network\n(IN) builds a new integrated sequence by utilizing spatial-temporal interaction\non MSS and captures the comprehensive spatial-temporal representation by\nmodeling the integrated sequence with a complicated attention. Both public\ndatasets and production datasets have demonstrated the accuracy and scalability\nof FIN. Since 2022, FIN has been fully deployed in the recommendation\nadvertising system of Ele.me, one of the most popular online food ordering\nplatforms in China, obtaining 5.7% improvement on Click-Through Rate (CTR) and\n7.3% increase on Revenue Per Mille (RPM).\n","authors":["Jun Li","Jingjian Wang","Hongwei Wang","Xing Deng","Jielong Chen","Bing Cao","Zekun Wang","Guanjie Xu","Ge Zhang","Feng Shi","Hualei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.15703v1.pdf","comment":"Accepted by CIKM 2023 Applied Research Paper"},{"id":"http://arxiv.org/abs/2308.15700v1","updated":"2023-08-30T01:54:31Z","published":"2023-08-30T01:54:31Z","title":"Training Towards Critical Use: Learning to Situate AI Predictions\n Relative to Human Knowledge","summary":" A growing body of research has explored how to support humans in making\nbetter use of AI-based decision support, including via training and onboarding.\nExisting research has focused on decision-making tasks where it is possible to\nevaluate \"appropriate reliance\" by comparing each decision against a ground\ntruth label that cleanly maps to both the AI's predictive target and the human\ndecision-maker's goals. However, this assumption does not hold in many\nreal-world settings where AI tools are deployed today (e.g., social work,\ncriminal justice, and healthcare). In this paper, we introduce a\nprocess-oriented notion of appropriate reliance called critical use that\ncenters the human's ability to situate AI predictions against knowledge that is\nuniquely available to them but unavailable to the AI model. To explore how\ntraining can support critical use, we conduct a randomized online experiment in\na complex social decision-making setting: child maltreatment screening. We find\nthat, by providing participants with accelerated, low-stakes opportunities to\npractice AI-assisted decision-making in this setting, novices came to exhibit\npatterns of disagreement with AI that resemble those of experienced workers. A\nqualitative examination of participants' explanations for their AI-assisted\ndecisions revealed that they drew upon qualitative case narratives, to which\nthe AI model did not have access, to learn when (not) to rely on AI\npredictions. Our findings open new questions for the study and design of\ntraining for real-world AI-assisted decision-making.\n","authors":["Anna Kawakami","Luke Guerdan","Yanghuidi Cheng","Matthew Lee","Scott Carter","Nikos Arechiga","Kate Glazko","Haiyi Zhu","Kenneth Holstein"],"pdf_url":"https://arxiv.org/pdf/2308.15700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14945v2","updated":"2023-08-30T01:48:21Z","published":"2023-08-28T23:51:33Z","title":"Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals","summary":" We consider the problem of sampling from a distribution governed by a\npotential function. This work proposes an explicit score-based MCMC method that\nis deterministic, resulting in a deterministic evolution for particles rather\nthan a stochastic differential equation evolution. The score term is given in\nclosed form by a regularized Wasserstein proximal, using a kernel convolution\nthat is approximated by sampling. We demonstrate fast convergence on various\nproblems and show improved dimensional dependence of mixing time bounds for the\ncase of Gaussian distributions compared to the unadjusted Langevin algorithm\n(ULA) and the Metropolis-adjusted Langevin algorithm (MALA). We additionally\nderive closed form expressions for the distributions at each iterate for\nquadratic potential functions, characterizing the variance reduction. Empirical\nresults demonstrate that the particles behave in an organized manner, lying on\nlevel set contours of the potential. Moreover, the posterior mean estimator of\nthe proposed method is shown to be closer to the maximum a-posteriori estimator\ncompared to ULA and MALA, in the context of Bayesian logistic regression.\n","authors":["Hong Ye Tan","Stanley Osher","Wuchen Li"],"pdf_url":"https://arxiv.org/pdf/2308.14945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04150v4","updated":"2023-08-30T01:47:53Z","published":"2023-03-07T01:38:42Z","title":"Evolutionary Reinforcement Learning: A Survey","summary":" Reinforcement learning (RL) is a machine learning approach that trains agents\nto maximize cumulative rewards through interactions with environments. The\nintegration of RL with deep learning has recently resulted in impressive\nachievements in a wide range of challenging tasks, including board games,\narcade games, and robot control. Despite these successes, there remain several\ncrucial challenges, including brittle convergence properties caused by\nsensitive hyperparameters, difficulties in temporal credit assignment with long\ntime horizons and sparse rewards, a lack of diverse exploration, especially in\ncontinuous search space scenarios, difficulties in credit assignment in\nmulti-agent reinforcement learning, and conflicting objectives for rewards.\nEvolutionary computation (EC), which maintains a population of learning agents,\nhas demonstrated promising performance in addressing these limitations. This\narticle presents a comprehensive survey of state-of-the-art methods for\nintegrating EC into RL, referred to as evolutionary reinforcement learning\n(EvoRL). We categorize EvoRL methods according to key research fields in RL,\nincluding hyperparameter optimization, policy search, exploration, reward\nshaping, meta-RL, and multi-objective RL. We then discuss future research\ndirections in terms of efficient methods, benchmarks, and scalable platforms.\nThis survey serves as a resource for researchers and practitioners interested\nin the field of EvoRL, highlighting the important challenges and opportunities\nfor future research. With the help of this survey, researchers and\npractitioners can develop more efficient methods and tailored benchmarks for\nEvoRL, further advancing this promising cross-disciplinary research field.\n","authors":["Hui Bai","Ran Cheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2303.04150v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15697v1","updated":"2023-08-30T01:40:38Z","published":"2023-08-30T01:40:38Z","title":"Segmenting mechanically heterogeneous domains via unsupervised learning","summary":" From biological organs to soft robotics, highly deformable materials are\nessential components of natural and engineered systems. These highly deformable\nmaterials can have heterogeneous material properties, and can experience\nheterogeneous deformations with or without underlying material heterogeneity.\nMany recent works have established that computational modeling approaches are\nwell suited for understanding and predicting the consequences of material\nheterogeneity and for interpreting observed heterogeneous strain fields. In\nparticular, there has been significant work towards developing inverse analysis\napproaches that can convert observed kinematic quantities (e.g., displacement,\nstrain) to material properties and mechanical state. Despite the success of\nthese approaches, they are not necessarily generalizable and often rely on\ntight control and knowledge of boundary conditions. Here, we will build on the\nrecent advances (and ubiquity) of machine learning approaches to explore\nalternative approaches to detect patterns in heterogeneous material properties\nand mechanical behavior. Specifically, we will explore unsupervised learning\napproaches to clustering and ensemble clutering to identify heterogeneous\nregions. Overall, we find that these approaches are effective, yet limited in\ntheir abilities. Through this initial exploration (where all data and code is\npublished alongside this manuscript), we set the stage for future studies that\nmore specifically adapt these methods to mechanical data.\n","authors":["Quan Nguyen","Emma Lejeune"],"pdf_url":"https://arxiv.org/pdf/2308.15697v1.pdf","comment":"26 pages, 10 figures"},{"id":"http://arxiv.org/abs/2304.06767v3","updated":"2023-08-30T01:25:29Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to the successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects the high-quality samples, discarding those that exhibit\nundesired behavior, and subsequently enhancing the model by fine-tuning on\nthese filtered samples. Our studies show that RAFT can effectively improve the\nmodel performance in both reward learning and other automated metrics in both\nlarge language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v3.pdf","comment":"26 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.15690v1","updated":"2023-08-30T01:14:32Z","published":"2023-08-30T01:14:32Z","title":"CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts","summary":" We present 'CongNaMul', a comprehensive dataset designed for various tasks in\nsoybean sprouts image analysis. The CongNaMul dataset is curated to facilitate\ntasks such as image classification, semantic segmentation, decomposition, and\nmeasurement of length and weight. The classification task provides four classes\nto determine the quality of soybean sprouts: normal, broken, spotted, and\nbroken and spotted, for the development of AI-aided automatic quality\ninspection technology. For semantic segmentation, images with varying\ncomplexity, from single sprout images to images with multiple sprouts, along\nwith human-labelled mask images, are included. The label has 4 different\nclasses: background, head, body, tail. The dataset also provides images and\nmasks for the image decomposition task, including two separate sprout images\nand their combined form. Lastly, 5 physical features of sprouts (head length,\nbody length, body thickness, tail length, weight) are provided for image-based\nmeasurement tasks. This dataset is expected to be a valuable resource for a\nwide range of research and applications in the advanced analysis of images of\nsoybean sprouts. Also, we hope that this dataset can assist researchers\nstudying classification, semantic segmentation, decomposition, and physical\nfeature measurement in other industrial fields, in evaluating their models. The\ndataset is available at the authors' repository. (https://bhban.kr/data)\n","authors":["Byunghyun Ban","Donghun Ryu","Su-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15690v1.pdf","comment":"Accepted to International Conference on ICT Convergence 2023"},{"id":"http://arxiv.org/abs/2001.10474v3","updated":"2023-08-30T00:10:47Z","published":"2020-01-28T17:31:23Z","title":"Coagent Networks Revisited","summary":" Coagent networks formalize the concept of arbitrary networks of stochastic\nagents that collaborate to take actions in a reinforcement learning\nenvironment. Prominent examples of coagent networks in action include\napproaches to hierarchical reinforcement learning (HRL), such as those using\noptions, which attempt to address the exploration exploitation trade-off by\nintroducing abstract actions at different levels by sequencing multiple\nstochastic networks within the HRL agents. We first provide a unifying\nperspective on the many diverse examples that fall under coagent networks. We\ndo so by formalizing the rules of execution in a coagent network, enabled by\nthe novel and intuitive idea of execution paths in a coagent network. Motivated\nby parameter sharing in the hierarchical option-critic architecture, we revisit\nthe coagent network theory and achieve a much shorter proof of the policy\ngradient theorem using our idea of execution paths, without any assumption on\nhow parameters are shared among coagents. We then generalize our setting and\nproof to include the scenario where coagents act asynchronously. This new\nperspective and theorem also lead to more mathematically accurate and\nperformant algorithms than those in the existing literature. Lastly, by running\nnonstationary RL experiments, we survey the performance and properties of\ndifferent generalizations of option-critic models.\n","authors":["Modjtaba Shokrian Zini","Mohammad Pedramfar","Matthew Riemer","Ahmadreza Moradipari","Miao Liu"],"pdf_url":"https://arxiv.org/pdf/2001.10474v3.pdf","comment":"Reformatted paper significantly and clarified results on the\n asynchronous case"},{"id":"http://arxiv.org/abs/2305.10455v3","updated":"2023-08-30T00:05:26Z","published":"2023-05-17T02:53:58Z","title":"Towards Generalist Robots: A Promising Paradigm via Generative\n Simulation","summary":" This document serves as a position paper that outlines the authors' vision\nfor a potential pathway towards generalist robots. The purpose of this document\nis to share the excitement of the authors with the community and highlight a\npromising research direction in robotics and AI. The authors believe the\nproposed paradigm is a feasible path towards accomplishing the long-standing\ngoal of robotics research: deploying robots, or embodied AI agents more\nbroadly, in various non-factory real-world settings to perform diverse tasks.\nThis document presents a specific idea for mining knowledge in the latest\nlarge-scale foundation models for robotics research. Instead of directly using\nor adapting these models to produce low-level policies and actions, it\nadvocates for a fully automated generative pipeline (termed as generative\nsimulation), which uses these models to generate diversified tasks, scenes and\ntraining supervisions at scale, thereby scaling up low-level skill learning and\nultimately leading to a foundation model for robotics that empowers generalist\nrobots. The authors are actively pursuing this direction, but in the meantime,\nthey recognize that the ambitious goal of building generalist robots with\nlarge-scale policy training demands significant resources such as computing\npower and hardware, and research groups in academia alone may face severe\nresource constraints in implementing the entire vision. Therefore, the authors\nbelieve sharing their thoughts at this early stage could foster discussions,\nattract interest towards the proposed pathway and related topics from industry\ngroups, and potentially spur significant technical advancements in the field.\n","authors":["Zhou Xian","Theophile Gervet","Zhenjia Xu","Yi-Ling Qiao","Tsun-Hsuan Wang","Yian Wang"],"pdf_url":"https://arxiv.org/pdf/2305.10455v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15673v1","updated":"2023-08-30T00:03:03Z","published":"2023-08-30T00:03:03Z","title":"MDTD: A Multi Domain Trojan Detector for Deep Neural Networks","summary":" Machine learning models that use deep neural networks (DNNs) are vulnerable\nto backdoor attacks. An adversary carrying out a backdoor attack embeds a\npredefined perturbation called a trigger into a small subset of input samples\nand trains the DNN such that the presence of the trigger in the input results\nin an adversary-desired output class. Such adversarial retraining however needs\nto ensure that outputs for inputs without the trigger remain unaffected and\nprovide high classification accuracy on clean samples. In this paper, we\npropose MDTD, a Multi-Domain Trojan Detector for DNNs, which detects inputs\ncontaining a Trojan trigger at testing time. MDTD does not require knowledge of\ntrigger-embedding strategy of the attacker and can be applied to a pre-trained\nDNN model with image, audio, or graph-based inputs. MDTD leverages an insight\nthat input samples containing a Trojan trigger are located relatively farther\naway from a decision boundary than clean samples. MDTD estimates the distance\nto a decision boundary using adversarial learning methods and uses this\ndistance to infer whether a test-time input sample is Trojaned or not. We\nevaluate MDTD against state-of-the-art Trojan detection methods across five\nwidely used image-based datasets: CIFAR100, CIFAR10, GTSRB, SVHN, and\nFlowers102; four graph-based datasets: AIDS, WinMal, Toxicant, and COLLAB; and\nthe SpeechCommand audio dataset. MDTD effectively identifies samples that\ncontain different types of Trojan triggers. We evaluate MDTD against adaptive\nattacks where an adversary trains a robust DNN to increase (decrease) distance\nof benign (Trojan) inputs from a decision boundary.\n","authors":["Arezoo Rajabi","Surudhi Asokraj","Fengqing Jiang","Luyao Niu","Bhaskar Ramasubramanian","Jim Ritcey","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2308.15673v1.pdf","comment":"Accepted to ACM Conference on Computer and Communications Security\n (ACM CCS) 2023"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.14480v2","updated":"2023-08-30T15:33:01Z","published":"2023-08-28T10:40:16Z","title":"Priority-Centric Human Motion Generation in Discrete Latent Space","summary":" Text-to-motion generation is a formidable task, aiming to produce human\nmotions that align with the input text while also adhering to human\ncapabilities and physical laws. While there have been advancements in diffusion\nmodels, their application in discrete spaces remains underexplored. Current\nmethods often overlook the varying significance of different motions, treating\nthem uniformly. It is essential to recognize that not all motions hold the same\nrelevance to a particular textual description. Some motions, being more salient\nand informative, should be given precedence during generation. In response, we\nintroduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which\nutilizes a Transformer-based VQ-VAE to derive a concise, discrete motion\nrepresentation, incorporating a global self-attention mechanism and a\nregularization term to counteract code collapse. We also present a motion\ndiscrete diffusion model that employs an innovative noise schedule, determined\nby the significance of each motion token within the entire motion sequence.\nThis approach retains the most salient motions during the reverse diffusion\nprocess, leading to more semantically rich and varied motions. Additionally, we\nformulate two strategies to gauge the importance of motion tokens, drawing from\nboth textual and visual indicators. Comprehensive experiments on the HumanML3D\nand KIT-ML datasets confirm that our model surpasses existing techniques in\nfidelity and diversity, particularly for intricate textual descriptions.\n","authors":["Hanyang Kong","Kehong Gong","Dongze Lian","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14480v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.15851v1","updated":"2023-08-30T08:35:31Z","published":"2023-08-30T08:35:31Z","title":"Prompting Vision Language Model with Knowledge from Large Language Model\n for Knowledge-Based VQA","summary":" Knowledge-based visual question answering is a very challenging and widely\nconcerned task. Previous methods adopts the implicit knowledge in large\nlanguage models (LLM) to achieve excellent results, but we argue that existing\nmethods may suffer from biasing understanding of the image and insufficient\nknowledge to solve the problem. In this paper, we propose PROOFREAD -PROmpting\nvision language model with knOwledge From laRgE lAnguage moDel, a novel,\nlightweight and efficient kowledge-based VQA framework, which make the vision\nlanguage model and the large language model cooperate to give full play to\ntheir respective strengths and bootstrap each other. In detail, our proposed\nmethod uses LLM to obtain knowledge explicitly, uses the vision language model\nwhich can see the image to get the knowledge answer, and introduces knowledge\nperceiver to filter out knowledge that is harmful for getting the correct final\nanswer. Experimental results on two datasets prove the effectiveness of our\napproach. Our method outperforms all state-of-the-art methods on the A-OKVQA\ndataset in two settings and also achieves relatively good performance on the\nOKVQA dataset.\n","authors":["Yang Zhou","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.15851v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 44 + +
+
+
+ + ☆ Quantifying Uncertainty in Answers from any Language Model via Intrinsic + and Extrinsic Confidence Assessment + + +
+ We introduce BSDetector, a method for detecting bad and speculative answers +from a pretrained Large Language Model by estimating a numeric confidence score +for any output it generated. Our uncertainty quantification technique works for +any LLM accessible only via a black-box API, and combines intrinsic and +extrinsic assessments of confidence into a single trustworthiness estimate for +any LLM response to a given prompt. Our method is extremely general and can +applied to all of the best LLMs available today (whose training data remains +unknown). By expending a bit of extra computation, users of any LLM API can now +get the same response as they would ordinarily, as well as a confidence +estimate that caution when not to trust this response. Experiments on both +closed and open-form Question-Answer benchmarks reveal that BSDetector more +accurately identifies incorrect LLM responses than alternative uncertainty +estimation procedures (for both GPT-3 and ChatGPT). By sampling multiple +responses from the LLM and considering the one with the highest confidence +score, we can additionally obtain more accurate responses from the same LLM, +without any extra training steps. + +
+
+
+
+
+ + ☆ Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open + Generative Large Language Models + + +
+ We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric +foundation and instruction-tuned open generative large language models (LLMs). +The models are based on the GPT-3 decoder-only architecture and are pretrained +on a mixture of Arabic and English texts, including source code in various +programming languages. With 13 billion parameters, they demonstrate better +knowledge and reasoning capabilities in Arabic than any existing open Arabic +and multilingual models by a sizable margin, based on extensive evaluation. +Moreover, the models are competitive in English compared to English-centric +open models of similar size, despite being trained on much less English data. +We provide a detailed description of the training, the tuning, the safety +alignment, and the evaluation of the models. We release two open versions of +the model -- the foundation Jais model, and an instruction-tuned Jais-chat +variant -- with the aim of promoting research on Arabic LLMs. Available at +https://huggingface.co/inception-mbzuai/jais-13b-chat + +
+
+ comment: Arabic-centric, foundation model, large-language model, LLM, + generative model, instruction-tuned, Jais, Jais-chat +
+
+
+
+
+ + ☆ LM-Infinite: Simple On-the-Fly Length Generalization for Large Language + Models + + +
+ In recent years, there have been remarkable advancements in the performance +of Transformer-based Large Language Models (LLMs) across various domains. As +these LLMs are deployed for increasingly complex tasks, they often face the +needs to conduct longer reasoning processes or understanding larger contexts. +In these situations, the length generalization failure of LLMs on long +sequences become more prominent. Most pre-training schemes truncate training +sequences to a fixed length (such as 2048 for LLaMa). LLMs often struggle to +generate fluent texts, let alone carry out downstream tasks, after longer +contexts, even with relative positional encoding which is designed to cope with +this problem. Common solutions such as finetuning on longer corpora often +involves daunting hardware and time costs and requires careful training process +design. To more efficiently leverage the generation capacity of existing LLMs, +we theoretically and empirically investigate the main out-of-distribution (OOD) +factors contributing to this problem. Inspired by this diagnosis, we propose a +simple yet effective solution for on-the-fly length generalization, +LM-Infinite, which involves only a $\Lambda$-shaped attention mask and a +distance limit while requiring no parameter updates or learning. We find it +applicable to a variety of LLMs using relative-position encoding methods. +LM-Infinite is computational efficient with $O(n)$ time and space, and +demonstrates consistent fluency and generation quality to as long as 32k tokens +on ArXiv and OpenWebText2 datasets, with 2.72x decoding speedup. On downstream +task such as passkey retrieval, it continues to work on inputs much longer than +training lengths where vanilla models fail immediately. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Response: Emergent analogical reasoning in large language models + + +
+ In their recent Nature Human Behaviour paper, "Emergent analogical reasoning +in large language models," (Webb, Holyoak, and Lu, 2023) the authors argue that +"large language models such as GPT-3 have acquired an emergent ability to find +zero-shot solutions to a broad range of analogy problems." In this response, we +provide counterexamples of the letter string analogies. In our tests, GPT-3 +fails to solve even the easiest variants of the problems presented in the +original paper. Zero-shot reasoning is an extraordinary claim that requires +extraordinary evidence. We do not see that evidence in our experiments. To +strengthen claims of humanlike reasoning such as zero-shot reasoning, it is +important that the field develop approaches that rule out data memorization. + +
+
+ comment: Response to publication in Nature Human Behaviour titled "Emergent + analogical reasoning in large language models," (Webb, Holyoak, and Lu, 2023, + arXiv:2212.09196). 9 pages +
+
+
+
+
+ + ☆ Grandma Karl is 27 years old -- research agenda for pseudonymization of + research data + + +
+ Accessibility of research data is critical for advances in many research +fields, but textual data often cannot be shared due to the personal and +sensitive information which it contains, e.g names or political opinions. +General Data Protection Regulation (GDPR) suggests pseudonymization as a +solution to secure open access to research data, but we need to learn more +about pseudonymization as an approach before adopting it for manipulation of +research data. This paper outlines a research agenda within pseudonymization, +namely need of studies into the effects of pseudonymization on unstructured +data in relation to e.g. readability and language assessment, as well as the +effectiveness of pseudonymization as a way of protecting writer identity, while +also exploring different ways of developing context-sensitive algorithms for +detection, labelling and replacement of personal information in unstructured +data. The recently granted project on pseudonymization Grandma Karl is 27 years +old addresses exactly those challenges. + +
+
+ comment: Big DataService 2023 conference, 2023 Workshop on Big Data and + Machine Learning with Privacy Enhancing Tech, IEEE Catalog Number: + CFP23A91-ART, ISBN: 979-8-3503-3379-4 +
+
+
+
+
+ + ☆ Impact of Visual Context on Noisy Multimodal NMT: An Empirical Study for + English to Indian Languages + + +
+ The study investigates the effectiveness of utilizing multimodal information +in Neural Machine Translation (NMT). While prior research focused on using +multimodal data in low-resource scenarios, this study examines how image +features impact translation when added to a large-scale, pre-trained unimodal +NMT system. Surprisingly, the study finds that images might be redundant in +this context. Additionally, the research introduces synthetic noise to assess +whether images help the model deal with textual noise. Multimodal models +slightly outperform text-only models in noisy settings, even with random +images. The study's experiments translate from English to Hindi, Bengali, and +Malayalam, outperforming state-of-the-art benchmarks significantly. +Interestingly, the effect of visual context varies with source text noise: no +visual context works best for non-noisy translations, cropped image features +are optimal for low noise, and full image features work better in high-noise +scenarios. This sheds light on the role of visual context, especially in noisy +settings, opening up a new research direction for Noisy Neural Machine +Translation in multimodal setups. The research emphasizes the importance of +combining visual and textual information for improved translation in various +environments. + +
+
+
+
+
+ + ☆ Conti Inc.: Understanding the Internal Discussions of a large + Ransomware-as-a-Service Operator with Machine Learning + + +
+ Ransomware-as-a-service (RaaS) is increasing the scale and complexity of +ransomware attacks. Understanding the internal operations behind RaaS has been +a challenge due to the illegality of such activities. The recent chat leak of +the Conti RaaS operator, one of the most infamous ransomware operators on the +international scene, offers a key opportunity to better understand the inner +workings of such organizations. This paper analyzes the main topic discussions +in the Conti chat leak using machine learning techniques such as Natural +Language Processing (NLP) and Latent Dirichlet Allocation (LDA), as well as +visualization strategies. Five discussion topics are found: 1) Business, 2) +Technical, 3) Internal tasking/Management, 4) Malware, and 5) Customer +Service/Problem Solving. Moreover, the distribution of topics among Conti +members shows that only 4% of individuals have specialized discussions while +almost all individuals (96%) are all-rounders, meaning that their discussions +revolve around the five topics. The results also indicate that a significant +proportion of Conti discussions are non-tech related. This study thus +highlights that running such large RaaS operations requires a workforce skilled +beyond technical abilities, with individuals involved in various tasks, from +management to customer service or problem solving. The discussion topics also +show that the organization behind the Conti RaaS oper5086933ator shares +similarities with a large firm. We conclude that, although RaaS represents an +example of specialization in the cybercrime industry, only a few members are +specialized in one topic, while the rest runs and coordinates the RaaS +operation. + +
+
+
+
+
+ + ☆ Text-to-OverpassQL: A Natural Language Interface for Complex Geodata + Querying of OpenStreetMap + + +
+ We present Text-to-OverpassQL, a task designed to facilitate a natural +language interface for querying geodata from OpenStreetMap (OSM). The Overpass +Query Language (OverpassQL) allows users to formulate complex database queries +and is widely adopted in the OSM ecosystem. Generating Overpass queries from +natural language input serves multiple use-cases. It enables novice users to +utilize OverpassQL without prior knowledge, assists experienced users with +crafting advanced queries, and enables tool-augmented large language models to +access information stored in the OSM database. In order to assess the +performance of current sequence generation models on this task, we propose +OverpassNL, a dataset of 8,352 queries with corresponding natural language +inputs. We further introduce task specific evaluation metrics and ground the +evaluation of the Text-to-OverpassQL task by executing the queries against the +OSM database. We establish strong baselines by finetuning sequence-to-sequence +models and adapting large language models with in-context examples. The +detailed evaluation reveals strengths and weaknesses of the considered learning +strategies, laying the foundations for further research into the +Text-to-OverpassQL task. + +
+
+
+
+
+ + ☆ AsyncET: Asynchronous Learning for Knowledge Graph Entity Typing with + Auxiliary Relations + + +
+ Knowledge graph entity typing (KGET) is a task to predict the missing entity +types in knowledge graphs (KG). Previously, KG embedding (KGE) methods tried to +solve the KGET task by introducing an auxiliary relation, 'hasType', to model +the relationship between entities and their types. However, a single auxiliary +relation has limited expressiveness for diverse entity-type patterns. We +improve the expressiveness of KGE methods by introducing multiple auxiliary +relations in this work. Similar entity types are grouped to reduce the number +of auxiliary relations and improve their capability to model entity-type +patterns with different granularities. With the presence of multiple auxiliary +relations, we propose a method adopting an Asynchronous learning scheme for +Entity Typing, named AsyncET, which updates the entity and type embeddings +alternatively to keep the learned entity embedding up-to-date and informative +for entity type prediction. Experiments are conducted on two commonly used KGET +datasets to show that the performance of KGE methods on the KGET task can be +substantially improved by the proposed multiple auxiliary relations and +asynchronous embedding learning. Furthermore, our method has a significant +advantage over state-of-the-art methods in model sizes and time complexity. + +
+
+
+
+
+ + ☆ FPTQ: Fine-grained Post-Training Quantization for Large Language Models + + +
+ In the era of large-scale language models, the substantial parameter size +poses significant challenges for deployment. Being a prevalent compression +technique, quantization has emerged as the mainstream practice to tackle this +issue, which is mainly centered on two recipes W8A8 and W4A16 (i.e. weights and +activations in such bit widths). In this study, we propose a novel W4A8 +post-training quantization method for the available open-sourced LLMs, which +combines the advantages of both two recipes. Therefore, we can leverage the +benefit in the I/O utilization of 4-bit weight quantization and the +acceleration due to 8-bit matrix computation. Nevertheless, the W4A8 faces +notorious performance degradation. As a remedy, we involve layerwise activation +quantization strategies which feature a novel logarithmic equalization for most +intractable layers, and we combine them with fine-grained weight quantization. +Without whistles and bells, we eliminate the necessity for further fine-tuning +and obtain the state-of-the-art W4A8 quantized performance on BLOOM, LLaMA, and +LLaMA-2 on standard benchmarks. We confirm that the W4A8 quantization is +achievable for the deployment of large language models, fostering their +wide-spreading real-world applications. + +
+
+
+
+
+ + ☆ MerA: Merging Pretrained Adapters For Few-Shot Learning + + +
+ Adapter tuning, which updates only a few parameters, has become a mainstream +method for fine-tuning pretrained language models to downstream tasks. However, +it often yields subpar results in few-shot learning. AdapterFusion, which +assembles pretrained adapters using composition layers tailored to specific +tasks, is a possible solution but significantly increases trainable parameters +and deployment costs. Despite this, our preliminary study reveals that even +single adapters can outperform Adapterfusion in few-shot learning, urging us to +propose \textbf{\texttt{Merging Pretrained Adapters}} (MerA) that efficiently +incorporates pretrained adapters to a single model through model fusion. +Extensive experiments on two PLMs demonstrate that MerA achieves substantial +improvements compared to both single adapters and AdapterFusion. To further +enhance the capacity of MerA, we also introduce a simple yet effective +technique, referred to as the "\textit{same-track}" setting, that merges +adapters from the same track of pretraining tasks. With the implementation of +the "\textit{same-track}" setting, we observe even more impressive gains, +surpassing the performance of both full fine-tuning and adapter tuning by a +substantial margin, e.g., 3.5\% in MRPC and 5.0\% in MNLI. + +
+
+
+
+
+ + ☆ Finding-Aware Anatomical Tokens for Chest X-Ray Automated Reporting + + +
+ The task of radiology reporting comprises describing and interpreting the +medical findings in radiographic images, including description of their +location and appearance. Automated approaches to radiology reporting require +the image to be encoded into a suitable token representation for input to the +language model. Previous methods commonly use convolutional neural networks to +encode an image into a series of image-level feature map representations. +However, the generated reports often exhibit realistic style but imperfect +accuracy. Inspired by recent works for image captioning in the general domain +in which each visual token corresponds to an object detected in an image, we +investigate whether using local tokens corresponding to anatomical structures +can improve the quality of the generated reports. We introduce a novel +adaptation of Faster R-CNN in which finding detection is performed for the +candidate bounding boxes extracted during anatomical structure localisation. We +use the resulting bounding box feature representations as our set of +finding-aware anatomical tokens. This encourages the extracted anatomical +tokens to be informative about the findings they contain (required for the +final task of radiology reporting). Evaluating on the MIMIC-CXR dataset of +chest X-Ray images, we show that task-aware anatomical tokens give +state-of-the-art performance when integrated into an automated reporting +pipeline, yielding generated reports with improved clinical accuracy. + +
+
+
+
+
+ + ☆ Benchmarking Multilabel Topic Classification in the Kyrgyz Language + + +
+ Kyrgyz is a very underrepresented language in terms of modern natural +language processing resources. In this work, we present a new public benchmark +for topic classification in Kyrgyz, introducing a dataset based on collected +and annotated data from the news site 24.KG and presenting several baseline +models for news classification in the multilabel setting. We train and evaluate +both classical statistical and neural models, reporting the scores, discussing +the results, and proposing directions for future work. + +
+
+ comment: Accepted to AIST 2023 +
+
+
+
+
+ + ☆ LLaSM: Large Language and Speech Model + + +
+ Multi-modal large language models have garnered significant interest +recently. Though, most of the works focus on vision-language multi-modal models +providing strong capabilities in following vision-and-language instructions. +However, we claim that speech is also an important modality through which +humans interact with the world. Hence, it is crucial for a general-purpose +assistant to be able to follow multi-modal speech-and-language instructions. In +this work, we propose Large Language and Speech Model (LLaSM). LLaSM is an +end-to-end trained large multi-modal speech-language model with cross-modal +conversational abilities, capable of following speech-and-language +instructions. Our early experiments show that LLaSM demonstrates a more +convenient and natural way for humans to interact with artificial intelligence. +Specifically, we also release a large Speech Instruction Following dataset +LLaSM-Audio-Instructions. Code and demo are available at +https://github.com/LinkSoul-AI/LLaSM and +https://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions +dataset is available at +https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions. + +
+
+
+
+
+ + ☆ Is the U.S. Legal System Ready for AI's Challenges to Human Values? + + +
+ Our interdisciplinary study investigates how effectively U.S. laws confront +the challenges posed by Generative AI to human values. Through an analysis of +diverse hypothetical scenarios crafted during an expert workshop, we have +identified notable gaps and uncertainties within the existing legal framework +regarding the protection of fundamental values, such as autonomy, privacy, +dignity, diversity, equality, and physical/mental well-being. Constitutional +and civil rights, it appears, may not provide sufficient protection against +AI-generated discriminatory outputs. Furthermore, even if we exclude the +liability shield provided by Section 230, proving causation for defamation and +product liability claims is a challenging endeavor due to the intricate and +opaque nature of AI systems. To address the unique and unforeseeable threats +posed by Generative AI, we advocate for legal frameworks that evolve to +recognize new threat and provide proactive, auditable guidelines to industry +stakeholders. Addressing these issues requires deep interdisciplinary +collaborations to identify harms, values, and mitigation strategies. + +
+
+ comment: 26 pages, 7 figures +
+
+
+
+
+ + ☆ Towards One-Shot Learning for Text Classification using Inductive Logic + Programming + + +
+ With the ever-increasing potential of AI to perform personalised tasks, it is +becoming essential to develop new machine learning techniques which are +data-efficient and do not require hundreds or thousands of training data. In +this paper, we explore an Inductive Logic Programming approach for one-shot +text classification. In particular, we explore the framework of +Meta-Interpretive Learning (MIL), along with using common-sense background +knowledge extracted from ConceptNet. Results indicate that MIL can learn text +classification rules from a small number of training examples. Moreover, the +higher complexity of chosen examples, the higher accuracy of the outcome. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ Knowledge-grounded Natural Language Recommendation Explanation + + +
+ Explanations accompanied by a recommendation can assist users in +understanding the decision made by recommendation systems, which in turn +increases a user's confidence and trust in the system. Recently, research has +focused on generating natural language explanations in a human-readable format. +Thus far, the proposed approaches leverage item reviews written by users, which +are often subjective, sparse in language, and unable to account for new items +that have not been purchased or reviewed before. Instead, we aim to generate +fact-grounded recommendation explanations that are objectively described with +item features while implicitly considering a user's preferences, based on the +user's purchase history. To achieve this, we propose a knowledge graph (KG) +approach to natural language explainable recommendation. Our approach draws on +user-item features through a novel collaborative filtering-based KG +representation to produce fact-grounded, personalized explanations, while +jointly learning user-item representations for recommendation scoring. +Experimental results show that our approach consistently outperforms previous +state-of-the-art models on natural language explainable recommendation. + +
+
+
+
+
+ + ☆ Peering Through Preferences: Unraveling Feedback Acquisition for + Aligning Large Language Models + + +
+ Aligning large language models (LLMs) with human values and intents +critically involves the use of human or AI feedback. While dense feedback +annotations are expensive to acquire and integrate, sparse feedback presents a +structural design choice between ratings (e.g., score Response A on a scale of +1-7) and rankings (e.g., is Response A better than Response B?). In this work, +we analyze the effect of this design choice for the alignment and evaluation of +LLMs. We uncover an inconsistency problem wherein the preferences inferred from +ratings and rankings significantly disagree 60% for both human and AI +annotators. Our subsequent analysis identifies various facets of annotator +biases that explain this phenomena, such as human annotators would rate denser +responses higher while preferring accuracy during pairwise judgments. To our +surprise, we also observe that the choice of feedback protocol also has a +significant effect on the evaluation of aligned LLMs. In particular, we find +that LLMs that leverage rankings data for alignment (say model X) are preferred +over those that leverage ratings data (say model Y), with a rank-based +evaluation protocol (is X/Y's response better than reference response?) but not +with a rating-based evaluation protocol (score Rank X/Y's response on a scale +of 1-7). Our findings thus shed light on critical gaps in methods for +evaluating the real-world utility of language models and their strong +dependence on the feedback protocol used for alignment. Our code and data are +available at https://github.com/Hritikbansal/sparse_feedback. + +
+
+ comment: 24 pages, 12 Tables, 3 Figures +
+
+
+
+
+ + ☆ HAlf-MAsked Model for Named Entity Sentiment analysis + + +
+ Named Entity Sentiment analysis (NESA) is one of the most actively developing +application domains in Natural Language Processing (NLP). Social media NESA is +a significant field of opinion analysis since detecting and tracking sentiment +trends in the news flow is crucial for building various analytical systems and +monitoring the media image of specific people or companies. In this paper, we +study different transformers-based solutions NESA in RuSentNE-23 evaluation. +Despite the effectiveness of the BERT-like models, they can still struggle with +certain challenges, such as overfitting, which appeared to be the main obstacle +in achieving high accuracy on the RuSentNE-23 data. We present several +approaches to overcome this problem, among which there is a novel technique of +additional pass over given data with masked entity before making the final +prediction so that we can combine logits from the model when it knows the exact +entity it predicts sentiment for and when it does not. Utilizing this +technique, we ensemble multiple BERT- like models trained on different subsets +of data to improve overall performance. Our proposed model achieves the best +result on RuSentNE-23 evaluation data and demonstrates improved consistency in +entity-level sentiment analysis. + +
+
+
+
+
+ + ☆ Task-Based MoE for Multitask Multilingual Machine Translation + + +
+ Mixture-of-experts (MoE) architecture has been proven a powerful method for +diverse tasks in training deep models in many applications. However, current +MoE implementations are task agnostic, treating all tokens from different tasks +in the same manner. In this work, we instead design a novel method that +incorporates task information into MoE models at different granular levels with +shared dynamic task-based adapters. Our experiments and analysis show the +advantages of our approaches over the dense and canonical MoE models on +multi-task multilingual machine translations. With task-specific adapters, our +models can additionally generalize to new tasks efficiently. + +
+
+
+
+
+ + ☆ Cyberbullying Detection for Low-resource Languages and Dialects: Review + of the State of the Art + + +
+ The struggle of social media platforms to moderate content in a timely +manner, encourages users to abuse such platforms to spread vulgar or abusive +language, which, when performed repeatedly becomes cyberbullying a social +problem taking place in virtual environments, yet with real-world consequences, +such as depression, withdrawal, or even suicide attempts of its victims. +Systems for the automatic detection and mitigation of cyberbullying have been +developed but, unfortunately, the vast majority of them are for the English +language, with only a handful available for low-resource languages. To estimate +the present state of research and recognize the needs for further development, +in this paper we present a comprehensive systematic survey of studies done so +far for automatic cyberbullying detection in low-resource languages. We +analyzed all studies on this topic that were available. We investigated more +than seventy published studies on automatic detection of cyberbullying or +related language in low-resource languages and dialects that were published +between around 2017 and January 2023. There are 23 low-resource languages and +dialects covered by this paper, including Bangla, Hindi, Dravidian languages +and others. In the survey, we identify some of the research gaps of previous +studies, which include the lack of reliable definitions of cyberbullying and +its relevant subcategories, biases in the acquisition, and annotation of data. +Based on recognizing those research gaps, we provide some suggestions for +improving the general research conduct in cyberbullying detection, with a +primary focus on low-resource languages. Based on those proposed suggestions, +we collect and release a cyberbullying dataset in the Chittagonian dialect of +Bangla and propose a number of initial ML solutions trained on that dataset. In +addition, pre-trained transformer-based the BanglaBERT model was also +attempted. + +
+
+ comment: 52 Pages +
+
+
+
+
+ + ☆ Quantifying and Analyzing Entity-level Memorization in Large Language + Models + + +
+ Large language models (LLMs) have been proven capable of memorizing their +training data, which can be extracted through specifically designed prompts. As +the scale of datasets continues to grow, privacy risks arising from +memorization have attracted increasing attention. Quantifying language model +memorization helps evaluate potential privacy risks. However, prior works on +quantifying memorization require access to the precise original data or incur +substantial computational overhead, making it difficult for applications in +real-world language models. To this end, we propose a fine-grained, +entity-level definition to quantify memorization with conditions and metrics +closer to real-world scenarios. In addition, we also present an approach for +efficiently extracting sensitive entities from autoregressive language models. +We conduct extensive experiments based on the proposed, probing language +models' ability to reconstruct sensitive entities under different settings. We +find that language models have strong memorization at the entity level and are +able to reproduce the training data even with partial leakages. The results +demonstrate that LLMs not only memorize their training data but also understand +associations between entities. These findings necessitate that trainers of LLMs +exercise greater prudence regarding model memorization, adopting memorization +mitigation techniques to preclude privacy violations. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Optimizing Factual Accuracy in Text Generation through Dynamic Knowledge + Selection + + +
+ Language models (LMs) have revolutionized the way we interact with +information, but they often generate nonfactual text, raising concerns about +their reliability. Previous methods use external knowledge as references for +text generation to enhance factuality but often struggle with the knowledge +mix-up(e.g., entity mismatch) of irrelevant references. Besides,as the length +of the output text grows, the randomness of sampling can escalate, +detrimentally impacting the factual accuracy of the generated text. In this +paper, we present DKGen, which divide the text generation process into an +iterative process. In each iteration, DKGen takes the input query, the +previously generated text and a subset of the reference passages as input to +generate short text. During the process, the subset is dynamically selected +from the full passage set based on their relevance to the previously generated +text and the query, largely eliminating the irrelevant references from input. +To further enhance DKGen's ability to correctly use these external knowledge, +DKGen distills the relevance order of reference passages to the cross-attention +distribution of decoder. We train and evaluate DKGen on a large-scale benchmark +dataset. Experiment results show that DKGen outperforms all baseline models. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Large Language Models in Cryptocurrency Securities Cases: Can ChatGPT + Replace Lawyers? + + +
+ Large Language Models (LLMs) could enhance access to the legal system. +However, empirical research on their effectiveness in conducting legal tasks is +scant. We study securities cases involving cryptocurrencies as one of numerous +contexts where AI could support the legal process, studying LLMs' legal +reasoning and drafting capabilities. We examine whether a) an LLM can +accurately determine which laws are potentially being violated from a fact +pattern, and b) whether there is a difference in juror decision-making based on +complaints written by a lawyer compared to an LLM. We feed fact patterns from +real-life cases to GPT-3.5 and evaluate its ability to determine correct +potential violations from the scenario and exclude spurious violations. Second, +we had mock jurors assess complaints written by the LLM and lawyers. GPT-3.5's +legal reasoning skills proved weak, though we expect improvement in future +models, particularly given the violations it suggested tended to be correct (it +merely missed additional, correct violations). GPT-3.5 performed better at +legal drafting, and jurors' decisions were not statistically significantly +associated with the author of the document upon which they based their +decisions. Because LLMs cannot satisfactorily conduct legal reasoning tasks, +they would be unable to replace lawyers at this stage. However, their drafting +skills (though, perhaps, still inferior to lawyers), could provide access to +justice for more individuals by reducing the cost of legal services. Our +research is the first to systematically study LLMs' legal drafting and +reasoning capabilities in litigation, as well as in securities law and +cryptocurrency-related misconduct. + +
+
+
+
+
+ + ♻ ☆ Going Beyond Nouns With Vision & Language Models Using Synthetic Data ICCV 2023 + + +
+ Large-scale pre-trained Vision & Language (VL) models have shown remarkable +performance in many applications, enabling replacing a fixed set of supported +classes with zero-shot open vocabulary reasoning over (almost arbitrary) +natural language prompts. However, recent works have uncovered a fundamental +weakness of these models. For example, their difficulty to understand Visual +Language Concepts (VLC) that go 'beyond nouns' such as the meaning of +non-object words (e.g., attributes, actions, relations, states, etc.), or +difficulty in performing compositional reasoning such as understanding the +significance of the order of the words in a sentence. In this work, we +investigate to which extent purely synthetic data could be leveraged to teach +these models to overcome such shortcomings without compromising their zero-shot +capabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale +synthetic dataset and data generation codebase allowing to generate additional +suitable data to improve VLC understanding and compositional reasoning of VL +models. Additionally, we propose a general VL finetuning strategy for +effectively leveraging SyViC towards achieving these improvements. Our +extensive experiments and ablations on VL-Checklist, Winoground, and ARO +benchmarks demonstrate that it is possible to adapt strong pre-trained VL +models with synthetic data significantly enhancing their VLC understanding +(e.g. by 9.9% on ARO and 4.3% on VL-Checklist) with under 1% drop in their +zero-shot accuracy. + +
+
+ comment: Accepted to ICCV 2023. Project page: https://synthetic-vic.github.io/ +
+
+
+
+
+ + ♻ ☆ Evaluating GPT-3 Generated Explanations for Hateful Content Moderation IJCAI + + +
+ Recent research has focused on using large language models (LLMs) to generate +explanations for hate speech through fine-tuning or prompting. Despite the +growing interest in this area, these generated explanations' effectiveness and +potential limitations remain poorly understood. A key concern is that these +explanations, generated by LLMs, may lead to erroneous judgments about the +nature of flagged content by both users and content moderators. For instance, +an LLM-generated explanation might inaccurately convince a content moderator +that a benign piece of content is hateful. In light of this, we propose an +analytical framework for examining hate speech explanations and conducted an +extensive survey on evaluating such explanations. Specifically, we prompted +GPT-3 to generate explanations for both hateful and non-hateful content, and a +survey was conducted with 2,400 unique respondents to evaluate the generated +explanations. Our findings reveal that (1) human evaluators rated the +GPT-generated explanations as high quality in terms of linguistic fluency, +informativeness, persuasiveness, and logical soundness, (2) the persuasive +nature of these explanations, however, varied depending on the prompting +strategy employed, and (3) this persuasiveness may result in incorrect +judgments about the hatefulness of the content. Our study underscores the need +for caution in applying LLM-generated explanations for content moderation. Code +and results are available at https://github.com/Social-AI-Studio/GPT3-HateEval. + +
+
+ comment: 9 pages, 2 figures, Accepted by International Joint Conference on + Artificial Intelligence(IJCAI) +
+
+
+
+
+ + ♻ ☆ Effect of Attention and Self-Supervised Speech Embeddings on + Non-Semantic Speech Tasks + + +
+ Human emotion understanding is pivotal in making conversational technology +mainstream. We view speech emotion understanding as a perception task which is +a more realistic setting. With varying contexts (languages, demographics, etc.) +different share of people perceive the same speech segment as a non-unanimous +emotion. As part of the ACM Multimedia 2023 Computational Paralinguistics +ChallengE (ComParE) in the EMotion Share track, we leverage their rich dataset +of multilingual speakers and multi-label regression target of 'emotion share' +or perception of that emotion. We demonstrate that the training scheme of +different foundation models dictates their effectiveness for tasks beyond +speech recognition, especially for non-semantic speech tasks like emotion +understanding. This is a very complex task due to multilingual speakers, +variability in the target labels, and inherent imbalance in the regression +dataset. Our results show that HuBERT-Large with a self-attention-based +light-weight sequence model provides 4.6% improvement over the reported +baseline. + +
+
+ comment: Accepted to appear at ACM Multimedia 2023 Multimedia Grand Challenges + Track +
+
+
+
+
+ + ♻ ☆ Context-VQA: Towards Context-Aware and Purposeful Visual Question + Answering ICCV 2023 + + +
+ Visual question answering (VQA) has the potential to make the Internet more +accessible in an interactive way, allowing people who cannot see images to ask +questions about them. However, multiple studies have shown that people who are +blind or have low-vision prefer image explanations that incorporate the context +in which an image appears, yet current VQA datasets focus on images in +isolation. We argue that VQA models will not fully succeed at meeting people's +needs unless they take context into account. To further motivate and analyze +the distinction between different contexts, we introduce Context-VQA, a VQA +dataset that pairs images with contexts, specifically types of websites (e.g., +a shopping website). We find that the types of questions vary systematically +across contexts. For example, images presented in a travel context garner 2 +times more "Where?" questions, and images on social media and news garner 2.8 +and 1.8 times more "Who?" questions than the average. We also find that context +effects are especially important when participants can't see the image. These +results demonstrate that context affects the types of questions asked and that +VQA models should be context-sensitive to better meet people's needs, +especially in accessibility settings. + +
+
+ comment: Proceedings of ICCV 2023 Workshop on Closing the Loop Between Vision + and Language +
+
+
+
+
+ + ♻ ☆ MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with + Transformers + + +
+ Message Passing Interface (MPI) plays a crucial role in distributed memory +parallelization across multiple nodes. However, parallelizing MPI code +manually, and specifically, performing domain decomposition, is a challenging, +error-prone task. In this paper, we address this problem by developing +MPI-RICAL, a novel data-driven, programming-assistance tool that assists +programmers in writing domain decomposition based distributed memory +parallelization code. Specifically, we train a supervised language model to +suggest MPI functions and their proper locations in the code on the fly. We +also introduce MPICodeCorpus, the first publicly available corpus of MPI-based +parallel programs that is created by mining more than 15,000 open-source +repositories on GitHub. Experimental results have been done on MPICodeCorpus +and more importantly, on a compiled benchmark of MPI-based parallel programs +for numerical computations that represent real-world scientific applications. +MPI-RICAL achieves F1 scores between 0.87-0.91 on these programs, demonstrating +its accuracy in suggesting correct MPI functions at appropriate code +locations.. The source code used in this work, as well as other relevant +sources, are available at: +https://github.com/Scientific-Computing-Lab-NRCN/MPI-rical + +
+
+
+
+
+ + ♻ ☆ LibriSQA: Advancing Free-form and Open-ended Spoken Question Answering + with a Novel Dataset and Framework + + +
+ While Large Language Models (LLMs) have demonstrated commendable performance +across a myriad of domains and tasks, existing LLMs still exhibit a palpable +deficit in handling multimodal functionalities, especially for the Spoken +Question Answering (SQA) task which necessitates precise alignment and deep +interaction between speech and text features. To address the SQA challenge on +LLMs, we initially curated the free-form and open-ended LibriSQA dataset from +Librispeech, comprising Part I with natural conversational formats and Part II +encompassing multiple-choice questions followed by answers and analytical +segments. Both parts collectively include 107k SQA pairs that cover various +topics. Given the evident paucity of existing speech-text LLMs, we propose a +lightweight, end-to-end framework to execute the SQA task on the LibriSQA, +witnessing significant results. By reforming ASR into the SQA format, we +further substantiate our framework's capability in handling ASR tasks. Our +empirical findings bolster the LLMs' aptitude for aligning and comprehending +multimodal information, paving the way for the development of universal +multimodal LLMs. The dataset and demo can be found at +https://github.com/ZihanZhaoSJTU/LibriSQA. + +
+
+
+
+
+ + ♻ ☆ Large Language Models are not Fair Evaluators + + +
+ In this paper, we uncover a systematic bias in the evaluation paradigm of +adopting large language models~(LLMs), e.g., GPT-4, as a referee to score and +compare the quality of responses generated by candidate models. We find that +the quality ranking of candidate responses can be easily hacked by simply +altering their order of appearance in the context. This manipulation allows us +to skew the evaluation result, making one model appear considerably superior to +the other, e.g., Vicuna-13B could beat ChatGPT on 66 over 80 tested queries +with ChatGPT as an evaluator. To address this issue, we propose a calibration +framework with three simple yet effective strategies: 1) Multiple Evidence +Calibration, which requires the evaluator model to generate multiple evaluation +evidence before assigning ratings; 2) Balanced Position Calibration, which +aggregates results across various orders to determine the final score; 3) +Human-in-the-Loop Calibration, which introduces a balanced position diversity +entropy to measure the difficulty of each example and seeks human assistance +when needed. We also manually annotate the "win/tie/lose" outcomes of responses +from ChatGPT and Vicuna-13B in the Vicuna Benchmark's question prompt, and +extensive experiments demonstrate that our approach successfully mitigates +evaluation bias, resulting in closer alignment with human judgments. We release +our code and human annotation at \url{https://github.com/i-Eval/FairEval} to +facilitate future research. + +
+
+
+
+
+ + ♻ ☆ Formal specification terminology for demographic agent-based models of + fixed-step single-clocked simulations + + +
+ This document presents adequate formal terminology for the mathematical +specification of a subset of Agent Based Models (ABMs) in the field of +Demography. The simulation of the targeted ABMs follows a fixed-step +single-clocked pattern. The proposed terminology further improves the model +understanding and can act as a stand-alone methodology for the specification +and optionally the documentation of a significant set of (demographic) ABMs. +Nevertheless, it is imaginable the this terminology probably with further +extensions can be merged with the largely-informal widely-used model +documentation and communication O.D.D. protocol [Grimm and et al., 2020, +Amouroux et al., 2010] to reduce many sources of ambiguity, hindering model +replications by other modelers. A published demographic model documentation, +largely simplified version of the Lone Parent Model [Gostoli and Silverman, +2020] is separately published in [Elsheikh, 2023b] as illustration for the +formal terminology. The model was implemented in the Julia language [Elsheikh, +2023a] based on the Agents.jl julia package [Datseris et al., 2022]. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2307.16548 +
+
+
+
+
+ + ♻ ☆ FurChat: An Embodied Conversational Agent using LLMs, Combining Open and + Closed-Domain Dialogue with Facial Expressions SIGDIAL 2023 + + +
+ We demonstrate an embodied conversational agent that can function as a +receptionist and generate a mixture of open and closed-domain dialogue along +with facial expressions, by using a large language model (LLM) to develop an +engaging conversation. We deployed the system onto a Furhat robot, which is +highly expressive and capable of using both verbal and nonverbal cues during +interaction. The system was designed specifically for the National Robotarium +to interact with visitors through natural conversations, providing them with +information about the facilities, research, news, upcoming events, etc. The +system utilises the state-of-the-art GPT-3.5 model to generate such information +along with domain-general conversations and facial expressions based on prompt +engineering. + +
+
+ comment: 5 pages, 2 figures, Accepted at SIGDIAL 2023 (24th Meeting of the + Special Interest Group on Discourse and Dialogue), for the demo video, see + https://youtu.be/fwtUl1kl22s +
+
+
+
+
+ + ♻ ☆ CLSE: Corpus of Linguistically Significant Entities EMNLP 2022 + + +
+ One of the biggest challenges of natural language generation (NLG) is the +proper handling of named entities. Named entities are a common source of +grammar mistakes such as wrong prepositions, wrong article handling, or +incorrect entity inflection. Without factoring linguistic representation, such +errors are often underrepresented when evaluating on a small set of arbitrarily +picked argument values, or when translating a dataset from a linguistically +simpler language, like English, to a linguistically complex language, like +Russian. However, for some applications, broadly precise grammatical +correctness is critical -- native speakers may find entity-related grammar +errors silly, jarring, or even offensive. + To enable the creation of more linguistically diverse NLG datasets, we +release a Corpus of Linguistically Significant Entities (CLSE) annotated by +linguist experts. The corpus includes 34 languages and covers 74 different +semantic types to support various applications from airline ticketing to video +games. To demonstrate one possible use of CLSE, we produce an augmented version +of the Schema-Guided Dialog Dataset, SGD-CLSE. Using the CLSE's entities and a +small number of human translations, we create a linguistically representative +NLG evaluation benchmark in three languages: French (high-resource), Marathi +(low-resource), and Russian (highly inflected language). We establish quality +baselines for neural, template-based, and hybrid NLG systems and discuss the +strengths and weaknesses of each approach. + +
+
+ comment: Proceedings of the 2nd Workshop on Natural Language Generation, + Evaluation, and Metrics (GEM 2022) at EMNLP 2022 +
+
+
+
+
+ + ♻ ☆ Red-Teaming Large Language Models using Chain of Utterances for + Safety-Alignment + + +
+ Larger language models (LLMs) have taken the world by storm with their +massive multi-tasking capabilities simply by optimizing over a next-word +prediction objective. With the emergence of their properties and encoded +knowledge, the risk of LLMs producing harmful outputs increases, making them +unfit for scalable deployment for the public. In this work, we propose a new +safety evaluation benchmark RED-EVAL that carries out red-teaming. We show that +even widely deployed models are susceptible to the Chain of Utterances-based +(CoU) prompting, jailbreaking closed source LLM-based systems such as GPT-4 and +ChatGPT to unethically respond to more than 65% and 73% of harmful queries. We +also demonstrate the consistency of the RED-EVAL across 8 open-source LLMs in +generating harmful responses in more than 86% of the red-teaming attempts. +Next, we propose RED-INSTRUCT--An approach for the safety alignment of LLMs. It +constitutes two phases: 1) HARMFULQA data collection: Leveraging CoU prompting, +we collect a dataset that consists of 1.9K harmful questions covering a wide +range of topics, 9.5K safe and 7.3K harmful conversations from ChatGPT; 2) +SAFE-ALIGN: We demonstrate how the conversational dataset can be used for the +safety alignment of LLMs by minimizing the negative log-likelihood over helpful +responses and penalizing over harmful responses by gradient accent over sample +loss. Our model STARLING, a fine-tuned Vicuna-7B, is observed to be more safely +aligned when evaluated on RED-EVAL and HHH benchmarks while preserving the +utility of the baseline models (TruthfulQA, MMLU, and BBH). + +
+
+
+
+
+ + ♻ ☆ SpikeBERT: A Language Spikformer Trained with Two-Stage Knowledge + Distillation from BERT + + +
+ Spiking neural networks (SNNs) offer a promising avenue to implement deep +neural networks in a more energy-efficient way. However, the network +architectures of existing SNNs for language tasks are too simplistic, and deep +architectures have not been fully explored, resulting in a significant +performance gap compared to mainstream transformer-based networks such as BERT. +To this end, we improve a recently-proposed spiking transformer (i.e., +Spikformer) to make it possible to process language tasks and propose a +two-stage knowledge distillation method for training it, which combines +pre-training by distilling knowledge from BERT with a large collection of +unlabelled texts and fine-tuning with task-specific instances via knowledge +distillation again from the BERT fine-tuned on the same training examples. +Through extensive experimentation, we show that the models trained with our +method, named SpikeBERT, outperform state-of-the-art SNNs and even achieve +comparable results to BERTs on text classification tasks for both English and +Chinese with much less energy consumption. + +
+
+
+
+
+ + ♻ ☆ Reliable Natural Language Understanding with Large Language Models and + Answer Set Programming + + +
+ Humans understand language by extracting information (meaning) from +sentences, combining it with existing commonsense knowledge, and then +performing reasoning to draw conclusions. While large language models (LLMs) +such as GPT-3 and ChatGPT are able to leverage patterns in the text to solve a +variety of NLP tasks, they fall short in problems that require reasoning. They +also cannot reliably explain the answers generated for a given question. In +order to emulate humans better, we propose STAR, a framework that combines LLMs +with Answer Set Programming (ASP). We show how LLMs can be used to effectively +extract knowledge -- represented as predicates -- from language. Goal-directed +ASP is then employed to reliably reason over this knowledge. We apply the STAR +framework to three different NLU tasks requiring reasoning: qualitative +reasoning, mathematical reasoning, and goal-directed conversation. Our +experiments reveal that STAR is able to bridge the gap of reasoning in NLU +tasks, leading to significant performance improvements, especially for smaller +LLMs, i.e., LLMs with a smaller number of parameters. NLU applications +developed using the STAR framework are also explainable: along with the +predicates generated, a justification in the form of a proof tree can be +produced for a given output. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ♻ ☆ A Survey of Knowledge Enhanced Pre-trained Language Models + + +
+ Pre-trained Language Models (PLMs) which are trained on large text corpus via +self-supervised learning method, have yielded promising performance on various +tasks in Natural Language Processing (NLP). However, though PLMs with huge +parameters can effectively possess rich knowledge learned from massive training +text and benefit downstream tasks at the fine-tuning stage, they still have +some limitations such as poor reasoning ability due to the lack of external +knowledge. Research has been dedicated to incorporating knowledge into PLMs to +tackle these issues. In this paper, we present a comprehensive review of +Knowledge Enhanced Pre-trained Language Models (KE-PLMs) to provide a clear +insight into this thriving field. We introduce appropriate taxonomies +respectively for Natural Language Understanding (NLU) and Natural Language +Generation (NLG) to highlight these two main tasks of NLP. For NLU, we divide +the types of knowledge into four categories: linguistic knowledge, text +knowledge, knowledge graph (KG), and rule knowledge. The KE-PLMs for NLG are +categorized into KG-based and retrieval-based methods. Finally, we point out +some promising future directions of KE-PLMs. + +
+
+
+
+
+ + ♻ ☆ ONCE: Boosting Content-based Recommendation with Both Open- and + Closed-source Large Language Models + + +
+ Personalized content-based recommender systems have become indispensable +tools for users to navigate through the vast amount of content available on +platforms like daily news websites and book recommendation services. However, +existing recommenders face significant challenges in understanding the content +of items. Large language models (LLMs), which possess deep semantic +comprehension and extensive knowledge from pretraining, have proven to be +effective in various natural language processing tasks. In this study, we +explore the potential of leveraging both open- and closed-source LLMs to +enhance content-based recommendation. With open-source LLMs, we utilize their +deep layers as content encoders, enriching the representation of content at the +embedding level. For closed-source LLMs, we employ prompting techniques to +enrich the training data at the token level. Through comprehensive experiments, +we demonstrate the high effectiveness of both types of LLMs and show the +synergistic relationship between them. Notably, we observed a significant +relative improvement of up to 19.32% compared to existing state-of-the-art +recommendation models. These findings highlight the immense potential of both +open- and closed-source of LLMs in enhancing content-based recommendation +systems. We will make our code and LLM-generated data available for other +researchers to reproduce our results. + +
+
+
+
+
+ + ♻ ☆ Adapting Text-based Dialogue State Tracker for Spoken Dialogues SIGDIAL 2023 + + +
+ Although there have been remarkable advances in dialogue systems through the +dialogue systems technology competition (DSTC), it remains one of the key +challenges to building a robust task-oriented dialogue system with a speech +interface. Most of the progress has been made for text-based dialogue systems +since there are abundant datasets with written corpora while those with spoken +dialogues are very scarce. However, as can be seen from voice assistant systems +such as Siri and Alexa, it is of practical importance to transfer the success +to spoken dialogues. In this paper, we describe our engineering effort in +building a highly successful model that participated in the speech-aware +dialogue systems technology challenge track in DSTC11. Our model consists of +three major modules: (1) automatic speech recognition error correction to +bridge the gap between the spoken and the text utterances, (2) text-based +dialogue system (D3ST) for estimating the slots and values using slot +descriptions, and (3) post-processing for recovering the error of the estimated +slot value. Our experiments show that it is important to use an explicit +automatic speech recognition error correction module, post-processing, and data +augmentation to adapt a text-based dialogue state tracker for spoken dialogue +corpora. + +
+
+ comment: 8 pages, 5 figures, Accepted at the DSTC 11 Workshop to be located at + SIGDIAL 2023 +
+
+
+
+
+ + ♻ ☆ Marshall-Olkin Power-Law Distributions in Length-Frequency of Entities + + +
+ Entities involve important concepts with concrete meanings and play important +roles in numerous linguistic tasks. Entities have different forms in different +linguistic tasks and researchers treat those different forms as different +concepts. In this paper, we are curious to know whether there are some common +characteristics that connect those different forms of entities. Specifically, +we investigate the underlying distributions of entities from different types +and different languages, trying to figure out some common characteristics +behind those diverse entities. After analyzing twelve datasets about different +types of entities and eighteen datasets about entities in different languages, +we find that while these entities are dramatically diverse from each other in +many aspects, their length-frequencies can be well characterized by a family of +Marshall-Olkin power-law (MOPL) distributions. We conduct experiments on those +thirty datasets about entities in different types and different languages, and +experimental results demonstrate that MOPL models characterize the +length-frequencies of entities much better than two state-of-the-art power-law +models and an alternative log-normal model. Experimental results also +demonstrate that MOPL models are scalable to the length-frequency of entities +in large-scale real-world datasets. + +
+
+ comment: 33 pages, 3 figures (30 subfigures), 8 tables. To appear in + Knowledge-Based Systems +
+
+
+
+
+ + ♻ ☆ Automatically Correcting Large Language Models: Surveying the landscape + of diverse self-correction strategies + + +
+ Large language models (LLMs) have demonstrated remarkable performance across +a wide array of NLP tasks. However, their efficacy is undermined by undesired +and inconsistent behaviors, including hallucination, unfaithful reasoning, and +toxic content. A promising approach to rectify these flaws is self-correction, +where the LLM itself is prompted or guided to fix problems in its own output. +Techniques leveraging automated feedback -- either produced by the LLM itself +or some external system -- are of particular interest as they are a promising +way to make LLM-based solutions more practical and deployable with minimal +human feedback. This paper presents a comprehensive review of this emerging +class of techniques. We analyze and taxonomize a wide array of recent work +utilizing these strategies, including training-time, generation-time, and +post-hoc correction. We also summarize the major applications of this strategy +and conclude by discussing future directions and challenges. + +
+
+ comment: Work in Progress. Version 2 +
+
+
+
+
+ + ♻ ☆ WeaverBird: Empowering Financial Decision-Making with Large Language + Model, Knowledge Base, and Search Engine + + +
+ We present WeaverBird, an intelligent dialogue system designed specifically +for the finance domain. Our system harnesses a large language model of GPT +architecture that has been tuned using extensive corpora of finance-related +text. As a result, our system possesses the capability to understand complex +financial queries, such as "How should I manage my investments during +inflation?", and provide informed responses. Furthermore, our system +incorporates a local knowledge base and a search engine to retrieve relevant +information. The final responses are conditioned on the search results and +include proper citations to the sources, thus enjoying an enhanced credibility. +Through a range of finance-related questions, we have demonstrated the superior +performance of our system compared to other models. To experience our system +firsthand, users can interact with our live demo at +https://weaverbird.ttic.edu, as well as watch our 2-min video illustration at +https://www.youtube.com/watch?v=fyV2qQkX6Tc. + +
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can +arise from extensive unsupervised training data. Such biases can produce +suboptimal samples, skewed outcomes, and unfairness, with potentially serious +consequences. Consequently, aligning these models with human ethics and +preferences is an essential step toward ensuring their responsible and +effective deployment in real-world applications. Prior research has primarily +employed Reinforcement Learning from Human Feedback (RLHF) to address this +problem, where generative models are fine-tuned with RL algorithms guided by a +human-feedback-informed reward model. However, the inefficiencies and +instabilities associated with RL algorithms frequently present substantial +obstacles to the successful alignment, necessitating the development of a more +robust and streamlined approach. To this end, we introduce a new framework, +Reward rAnked FineTuning (RAFT), designed to align generative models +effectively. Utilizing a reward model and a sufficient number of samples, our +approach selects the high-quality samples, discarding those that exhibit +undesired behavior, and subsequently enhancing the model by fine-tuning on +these filtered samples. Our studies show that RAFT can effectively improve the +model performance in both reward learning and other automated metrics in both +large language models and diffusion models. + +
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 104 + +
+
+
+ + ☆ Boosting Detection in Crowd Analysis via Underutilized Output Features + + +
+ Detection-based methods have been viewed unfavorably in crowd analysis due to +their poor performance in dense crowds. However, we argue that the potential of +these methods has been underestimated, as they offer crucial information for +crowd analysis that is often ignored. Specifically, the area size and +confidence score of output proposals and bounding boxes provide insight into +the scale and density of the crowd. To leverage these underutilized features, +we propose Crowd Hat, a plug-and-play module that can be easily integrated with +existing detection models. This module uses a mixed 2D-1D compression technique +to refine the output features and obtain the spatial and numerical distribution +of crowd-specific information. Based on these features, we further propose +region-adaptive NMS thresholds and a decouple-then-align paradigm that address +the major limitations of detection-based methods. Our extensive evaluations on +various crowd analysis tasks, including crowd counting, localization, and +detection, demonstrate the effectiveness of utilizing output features and the +potential of detection-based methods in crowd analysis. + +
+
+ comment: project page: https://fredfyyang.github.io/Crowd-Hat/ +
+
+
+
+
+ + ☆ SAM-Med2D + + +
+ The Segment Anything Model (SAM) represents a state-of-the-art research +advancement in natural image segmentation, achieving impressive results with +input prompts such as points and bounding boxes. However, our evaluation and +recent research indicate that directly applying the pretrained SAM to medical +image segmentation does not yield satisfactory performance. This limitation +primarily arises from significant domain gap between natural images and medical +images. To bridge this gap, we introduce SAM-Med2D, the most comprehensive +studies on applying SAM to medical 2D images. Specifically, we first collect +and curate approximately 4.6M images and 19.7M masks from public and private +datasets, constructing a large-scale medical image segmentation dataset +encompassing various modalities and objects. Then, we comprehensively fine-tune +SAM on this dataset and turn it into SAM-Med2D. Unlike previous methods that +only adopt bounding box or point prompts as interactive segmentation approach, +we adapt SAM to medical image segmentation through more comprehensive prompts +involving bounding boxes, points, and masks. We additionally fine-tune the +encoder and decoder of the original SAM to obtain a well-performed SAM-Med2D, +leading to the most comprehensive fine-tuning strategies to date. Finally, we +conducted a comprehensive evaluation and analysis to investigate the +performance of SAM-Med2D in medical image segmentation across various +modalities, anatomical structures, and organs. Concurrently, we validated the +generalization capability of SAM-Med2D on 9 datasets from MICCAI 2023 +challenge. Overall, our approach demonstrated significantly superior +performance and generalization capability compared to SAM. + +
+
+
+
+
+ + ☆ GREC: Generalized Referring Expression Comprehension + + +
+ The objective of Classic Referring Expression Comprehension (REC) is to +produce a bounding box corresponding to the object mentioned in a given textual +description. Commonly, existing datasets and techniques in classic REC are +tailored for expressions that pertain to a single target, meaning a sole +expression is linked to one specific object. Expressions that refer to multiple +targets or involve no specific target have not been taken into account. This +constraint hinders the practical applicability of REC. This study introduces a +new benchmark termed as Generalized Referring Expression Comprehension (GREC). +This benchmark extends the classic REC by permitting expressions to describe +any number of target objects. To achieve this goal, we have built the first +large-scale GREC dataset named gRefCOCO. This dataset encompasses a range of +expressions: those referring to multiple targets, expressions with no specific +target, and the single-target expressions. The design of GREC and gRefCOCO +ensures smooth compatibility with classic REC. The proposed gRefCOCO dataset, a +GREC method implementation code, and GREC evaluation code are available at +https://github.com/henghuiding/gRefCOCO. + +
+
+ comment: GREC Technical Report, Project Page: + https://henghuiding.github.io/GRES +
+
+
+
+
+ + ☆ MMVP: Motion-Matrix-based Video Prediction ICCV 2023 + + +
+ A central challenge of video prediction lies where the system has to reason +the objects' future motions from image frames while simultaneously maintaining +the consistency of their appearances across frames. This work introduces an +end-to-end trainable two-stream video prediction framework, Motion-Matrix-based +Video Prediction (MMVP), to tackle this challenge. Unlike previous methods that +usually handle motion prediction and appearance maintenance within the same set +of modules, MMVP decouples motion and appearance information by constructing +appearance-agnostic motion matrices. The motion matrices represent the temporal +similarity of each and every pair of feature patches in the input frames, and +are the sole input of the motion prediction module in MMVP. This design +improves video prediction in both accuracy and efficiency, and reduces the +model size. Results of extensive experiments demonstrate that MMVP outperforms +state-of-the-art systems on public data sets by non-negligible large margins +(about 1 db in PSNR, UCF Sports) in significantly smaller model sizes (84% the +size or smaller). Please refer to +https://github.com/Kay1794/MMVP-motion-matrix-based-video-prediction for the +official code and the datasets used in this paper. + +
+
+ comment: ICCV 2023 (Oral) +
+
+
+
+
+ + ☆ Modality Cycles with Masked Conditional Diffusion for Unsupervised + Anomaly Segmentation in MRI MICCAI + 2023 + + +
+ Unsupervised anomaly segmentation aims to detect patterns that are distinct +from any patterns processed during training, commonly called abnormal or +out-of-distribution patterns, without providing any associated manual +segmentations. Since anomalies during deployment can lead to model failure, +detecting the anomaly can enhance the reliability of models, which is valuable +in high-risk domains like medical imaging. This paper introduces Masked +Modality Cycles with Conditional Diffusion (MMCCD), a method that enables +segmentation of anomalies across diverse patterns in multimodal MRI. The method +is based on two fundamental ideas. First, we propose the use of cyclic modality +translation as a mechanism for enabling abnormality detection. +Image-translation models learn tissue-specific modality mappings, which are +characteristic of tissue physiology. Thus, these learned mappings fail to +translate tissues or image patterns that have never been encountered during +training, and the error enables their segmentation. Furthermore, we combine +image translation with a masked conditional diffusion model, which attempts to +`imagine' what tissue exists under a masked area, further exposing unknown +patterns as the generative model fails to recreate them. We evaluate our method +on a proxy task by training on healthy-looking slices of BraTS2021 +multi-modality MRIs and testing on slices with tumors. We show that our method +compares favorably to previous unsupervised approaches based on image +reconstruction and denoising with autoencoders and diffusion models. + +
+
+ comment: Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI + 2023 +
+
+
+
+
+ + ☆ CircleFormer: Circular Nuclei Detection in Whole Slide Images with + Circle Queries and Attention MICCAI 2023 + + +
+ Both CNN-based and Transformer-based object detection with bounding box +representation have been extensively studied in computer vision and medical +image analysis, but circular object detection in medical images is still +underexplored. Inspired by the recent anchor free CNN-based circular object +detection method (CircleNet) for ball-shape glomeruli detection in renal +pathology, in this paper, we present CircleFormer, a Transformer-based circular +medical object detection with dynamic anchor circles. Specifically, queries +with circle representation in Transformer decoder iteratively refine the +circular object detection results, and a circle cross attention module is +introduced to compute the similarity between circular queries and image +features. A generalized circle IoU (gCIoU) is proposed to serve as a new +regression loss of circular object detection as well. Moreover, our approach is +easy to generalize to the segmentation task by adding a simple segmentation +branch to CircleFormer. We evaluate our method in circular nuclei detection and +segmentation on the public MoNuSeg dataset, and the experimental results show +that our method achieves promising performance compared with the +state-of-the-art approaches. The effectiveness of each component is validated +via ablation studies as well. Our code is released at: +\url{https://github.com/zhanghx-iim-ahu/CircleFormer}. + +
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones, +organs, vessels) and 3D surgical instrument models. Prior to the deep learning +era, the broad application of statistical shape models (SSMs) in medical image +analysis is evidence that shapes have been commonly used to describe medical +data. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in +medical imaging are predominantly voxel-based. In computer vision, on the +contrary, shapes (including, voxel occupancy grids, meshes, point clouds and +implicit surface models) are preferred data representations in 3D, as seen from +the numerous shape-related publications in premier vision conferences, such as +the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as +well as the increasing popularity of ShapeNet (about 51,300 models) and +Princeton ModelNet (127,915 models) in computer vision research. MedShapeNet is +created as an alternative to these commonly used shape benchmarks to facilitate +the translation of data-driven vision algorithms to medical applications, and +it extends the opportunities to adapt SOTA vision algorithms to solve critical +medical problems. Besides, the majority of the medical shapes in MedShapeNet +are modeled directly on the imaging data of real patients, and therefore it +complements well existing shape benchmarks comprising of computer-aided design +(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes, +and provides annotations in the form of paired data. It is therefore also a +freely available repository of 3D models for extended reality (virtual reality +- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This +white paper describes in detail the motivations behind MedShapeNet, the shape +acquisition procedures, the use cases, as well as the usage of the online shape +search portal: https://medshapenet.ikim.nrw/ + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ CorrEmbed: Evaluating Pre-trained Model Image Similarity Efficacy with a + Novel Metric + + +
+ Detecting visually similar images is a particularly useful attribute to look +to when calculating product recommendations. Embedding similarity, which +utilizes pre-trained computer vision models to extract high-level image +features, has demonstrated remarkable efficacy in identifying images with +similar compositions. However, there is a lack of methods for evaluating the +embeddings generated by these models, as conventional loss and performance +metrics do not adequately capture their performance in image similarity search +tasks. + In this paper, we evaluate the viability of the image embeddings from +numerous pre-trained computer vision models using a novel approach named +CorrEmbed. Our approach computes the correlation between distances in image +embeddings and distances in human-generated tag vectors. We extensively +evaluate numerous pre-trained Torchvision models using this metric, revealing +an intuitive relationship of linear scaling between ImageNet1k accuracy scores +and tag-correlation scores. Importantly, our method also identifies deviations +from this pattern, providing insights into how different models capture +high-level image features. + By offering a robust performance evaluation of these pre-trained models, +CorrEmbed serves as a valuable tool for researchers and practitioners seeking +to develop effective, data-driven approaches to similar item recommendations in +fashion retail. + +
+
+ comment: Accepted to AI-2023 Forty-third SGAI International Conference on + Artificial Intelligence +
+
+
+
+
+ + ☆ Improving Few-shot Image Generation by Structural Discrimination and + Textural Modulation ACM MM 2023 + + +
+ Few-shot image generation, which aims to produce plausible and diverse images +for one category given a few images from this category, has drawn extensive +attention. Existing approaches either globally interpolate different images or +fuse local representations with pre-defined coefficients. However, such an +intuitive combination of images/features only exploits the most relevant +information for generation, leading to poor diversity and coarse-grained +semantic fusion. To remedy this, this paper proposes a novel textural +modulation (TexMod) mechanism to inject external semantic signals into internal +local representations. Parameterized by the feedback from the discriminator, +our TexMod enables more fined-grained semantic injection while maintaining the +synthesis fidelity. Moreover, a global structural discriminator (StructD) is +developed to explicitly guide the model to generate images with reasonable +layout and outline. Furthermore, the frequency awareness of the model is +reinforced by encouraging the model to distinguish frequency signals. Together +with these techniques, we build a novel and effective model for few-shot image +generation. The effectiveness of our model is identified by extensive +experiments on three popular datasets and various settings. Besides achieving +state-of-the-art synthesis performance on these datasets, our proposed +techniques could be seamlessly integrated into existing models for a further +performance boost. + +
+
+ comment: To appear in ACM MM 2023, code is available at + https://github.com/kobeshegu/SDTM-GAN-ACMMM-2023 +
+
+
+
+
+ + ☆ Learned Image Reasoning Prior Penetrates Deep Unfolding Network for + Panchromatic and Multi-Spectral Image Fusion ICCV 2023 + + +
+ The success of deep neural networks for pan-sharpening is commonly in a form +of black box, lacking transparency and interpretability. To alleviate this +issue, we propose a novel model-driven deep unfolding framework with image +reasoning prior tailored for the pan-sharpening task. Different from existing +unfolding solutions that deliver the proximal operator networks as the +uncertain and vague priors, our framework is motivated by the content reasoning +ability of masked autoencoders (MAE) with insightful designs. Specifically, the +pre-trained MAE with spatial masking strategy, acting as intrinsic reasoning +prior, is embedded into unfolding architecture. Meanwhile, the pre-trained MAE +with spatial-spectral masking strategy is treated as the regularization term +within loss function to constrain the spatial-spectral consistency. Such +designs penetrate the image reasoning prior into deep unfolding networks while +improving its interpretability and representation capability. The uniqueness of +our framework is that the holistic learning process is explicitly integrated +with the inherent physical mechanism underlying the pan-sharpening task. +Extensive experiments on multiple satellite datasets demonstrate the +superiority of our method over the existing state-of-the-art approaches. Code +will be released at \url{https://manman1995.github.io/}. + +
+
+ comment: 10 pages; Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ SignDiff: Learning Diffusion Models for American Sign Language + Production + + +
+ The field of Sign Language Production (SLP) lacked a large-scale, pre-trained +model based on deep learning for continuous American Sign Language (ASL) +production in the past decade. This limitation hampers communication for all +individuals with disabilities relying on ASL. To address this issue, we +undertook the secondary development and utilization of How2Sign, one of the +largest publicly available ASL datasets. Despite its significance, prior +researchers in the field of sign language have not effectively employed this +corpus due to the intricacies involved in American Sign Language Production +(ASLP). + To conduct large-scale ASLP, we propose SignDiff based on the latest work in +related fields, which is a dual-condition diffusion pre-training model that can +generate human sign language speakers from a skeleton pose. SignDiff has a +novel Frame Reinforcement Network called FR-Net, similar to dense human pose +estimation work, which enhances the correspondence between text lexical symbols +and sign language dense pose frames reduce the occurrence of multiple fingers +in the diffusion model. In addition, our ASLP method proposes two new improved +modules and a new loss function to improve the accuracy and quality of sign +language skeletal posture and enhance the ability of the model to train on +large-scale data. + We propose the first baseline for ASL production and report the scores of +17.19 and 12.85 on BLEU-4 on the How2Sign dev/test sets. We also evaluated our +model on the previous mainstream dataset called PHOENIX14T, and the main +experiments achieved the results of SOTA. In addition, our image quality far +exceeds all previous results by 10 percentage points on the SSIM indicator. +Finally, we conducted ablation studies and qualitative evaluations for +discussion. + +
+
+
+
+
+ + ☆ Impact of Visual Context on Noisy Multimodal NMT: An Empirical Study for + English to Indian Languages + + +
+ The study investigates the effectiveness of utilizing multimodal information +in Neural Machine Translation (NMT). While prior research focused on using +multimodal data in low-resource scenarios, this study examines how image +features impact translation when added to a large-scale, pre-trained unimodal +NMT system. Surprisingly, the study finds that images might be redundant in +this context. Additionally, the research introduces synthetic noise to assess +whether images help the model deal with textual noise. Multimodal models +slightly outperform text-only models in noisy settings, even with random +images. The study's experiments translate from English to Hindi, Bengali, and +Malayalam, outperforming state-of-the-art benchmarks significantly. +Interestingly, the effect of visual context varies with source text noise: no +visual context works best for non-noisy translations, cropped image features +are optimal for low noise, and full image features work better in high-noise +scenarios. This sheds light on the role of visual context, especially in noisy +settings, opening up a new research direction for Noisy Neural Machine +Translation in multimodal setups. The research emphasizes the importance of +combining visual and textual information for improved translation in various +environments. + +
+
+
+
+
+ + ☆ Semantic Image Synthesis via Class-Adaptive Cross-Attention + + +
+ In semantic image synthesis, the state of the art is dominated by methods +that use spatially-adaptive normalization layers, which allow for excellent +visual generation quality and editing versatility. Granted their efficacy, +recent research efforts have focused toward finer-grained local style control +and multi-modal generation. By construction though, such layers tend to +overlook global image statistics leading to unconvincing local style editing +and causing global inconsistencies such as color or illumination distribution +shifts. Also, the semantic layout is required for mapping styles in the +generator, putting a strict alignment constraint over the features. In +response, we designed a novel architecture where cross-attention layers are +used in place of de-normalization ones for conditioning the image generation. +Our model inherits the advantages of both solutions, retaining state-of-the-art +reconstruction quality, as well as improved global and local style transfer. +Code and models available at https://github.com/TFonta/CA2SIS. + +
+
+
+
+
+ + ☆ From Pixels to Portraits: A Comprehensive Survey of Talking Head + Generation Techniques and Applications + + +
+ Recent advancements in deep learning and computer vision have led to a surge +of interest in generating realistic talking heads. This paper presents a +comprehensive survey of state-of-the-art methods for talking head generation. +We systematically categorises them into four main approaches: image-driven, +audio-driven, video-driven and others (including neural radiance fields (NeRF), +and 3D-based methods). We provide an in-depth analysis of each method, +highlighting their unique contributions, strengths, and limitations. +Furthermore, we thoroughly compare publicly available models, evaluating them +on key aspects such as inference time and human-rated quality of the generated +outputs. Our aim is to provide a clear and concise overview of the current +landscape in talking head generation, elucidating the relationships between +different approaches and identifying promising directions for future research. +This survey will serve as a valuable reference for researchers and +practitioners interested in this rapidly evolving field. + +
+
+
+
+
+ + ☆ Topology-aware MLP for Skeleton-based Action Recognition + + +
+ Graph convolution networks (GCNs) have achieved remarkable performance in +skeleton-based action recognition. However, existing previous GCN-based methods +have relied excessively on elaborate human body priors and constructed complex +feature aggregation mechanisms, which limits the generalizability of networks. +To solve these problems, we propose a novel Spatial Topology Gating Unit +(STGU), which is an MLP-based variant without extra priors, to capture the +co-occurrence topology features that encode the spatial dependency across all +joints. In STGU, to model the sample-specific and completely independent +point-wise topology attention, a new gate-based feature interaction mechanism +is introduced to activate the features point-to-point by the attention map +generated from the input. Based on the STGU, in this work, we propose the first +topology-aware MLP-based model, Ta-MLP, for skeleton-based action recognition. +In comparison with existing previous methods on three large-scale datasets, +Ta-MLP achieves competitive performance. In addition, Ta-MLP reduces the +parameters by up to 62.5% with favorable results. Compared with previous +state-of-the-art (SOAT) approaches, Ta-MLP pushes the frontier of real-time +action recognition. The code will be available at +https://github.com/BUPTSJZhang/Ta-MLP. + +
+
+
+
+
+ + ☆ DTrOCR: Decoder-only Transformer for Optical Character Recognition WACV2024 + + +
+ Typical text recognition methods rely on an encoder-decoder structure, in +which the encoder extracts features from an image, and the decoder produces +recognized text from these features. In this study, we propose a simpler and +more effective method for text recognition, known as the Decoder-only +Transformer for Optical Character Recognition (DTrOCR). This method uses a +decoder-only Transformer to take advantage of a generative language model that +is pre-trained on a large corpus. We examined whether a generative language +model that has been successful in natural language processing can also be +effective for text recognition in computer vision. Our experiments demonstrated +that DTrOCR outperforms current state-of-the-art methods by a large margin in +the recognition of printed, handwritten, and scene text in both English and +Chinese. + +
+
+ comment: Accepted to WACV2024 +
+
+
+
+
+ + ☆ DiffuVolume: Diffusion Model for Volume based Stereo Matching + + +
+ Stereo matching is a significant part in many computer vision tasks and +driving-based applications. Recently cost volume-based methods have achieved +great success benefiting from the rich geometry information in paired images. +However, the redundancy of cost volume also interferes with the model training +and limits the performance. To construct a more precise cost volume, we +pioneeringly apply the diffusion model to stereo matching. Our method, termed +DiffuVolume, considers the diffusion model as a cost volume filter, which will +recurrently remove the redundant information from the cost volume. Two main +designs make our method not trivial. Firstly, to make the diffusion model more +adaptive to stereo matching, we eschew the traditional manner of directly +adding noise into the image but embed the diffusion model into a task-specific +module. In this way, we outperform the traditional diffusion stereo matching +method by 22% EPE improvement and 240 times inference acceleration. Secondly, +DiffuVolume can be easily embedded into any volume-based stereo matching +network with boost performance but slight parameters rise (only 2%). By adding +the DiffuVolume into well-performed methods, we outperform all the published +methods on Scene Flow, KITTI2012, KITTI2015 benchmarks and zero-shot +generalization setting. It is worth mentioning that the proposed model ranks +1st on KITTI 2012 leader board, 2nd on KITTI 2015 leader board since 15, July +2023. + +
+
+ comment: 17 pages, 11 figures +
+
+
+
+
+ + ☆ Learning Structure-from-Motion with Graph Attention Networks + + +
+ In this paper we tackle the problem of learning Structure-from-Motion (SfM) +through the use of graph attention networks. SfM is a classic computer vision +problem that is solved though iterative minimization of reprojection errors, +referred to as Bundle Adjustment (BA), starting from a good initialization. In +order to obtain a good enough initialization to BA, conventional methods rely +on a sequence of sub-problems (such as pairwise pose estimation, pose averaging +or triangulation) which provides an initial solution that can then be refined +using BA. In this work we replace these sub-problems by learning a model that +takes as input the 2D keypoints detected across multiple views, and outputs the +corresponding camera poses and 3D keypoint coordinates. Our model takes +advantage of graph neural networks to learn SfM-specific primitives, and we +show that it can be used for fast inference of the reconstruction for new and +unseen sequences. The experimental results show that the proposed model +outperforms competing learning-based methods, and challenges COLMAP while +having lower runtime. + +
+
+
+
+
+ + ☆ RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation + + +
+ For robots to be useful outside labs and specialized factories we need a way +to teach them new useful behaviors quickly. Current approaches lack either the +generality to onboard new tasks without task-specific engineering, or else lack +the data-efficiency to do so in an amount of time that enables practical use. +In this work we explore dense tracking as a representational vehicle to allow +faster and more general learning from demonstration. Our approach utilizes +Track-Any-Point (TAP) models to isolate the relevant motion in a demonstration, +and parameterize a low-level controller to reproduce this motion across changes +in the scene configuration. We show this results in robust robot policies that +can solve complex object-arrangement tasks such as shape-matching, stacking, +and even full path-following tasks such as applying glue and sticking objects +together, all from demonstrations that can be collected in minutes. + +
+
+ comment: Project website: https://robotap.github.io +
+
+
+
+
+ + ☆ SHARP Challenge 2023: Solving CAD History and pArameters Recovery from + Point clouds and 3D scans. Overview, Datasets, Metrics, and Baselines + + +
+ Recent breakthroughs in geometric Deep Learning (DL) and the availability of +large Computer-Aided Design (CAD) datasets have advanced the research on +learning CAD modeling processes and relating them to real objects. In this +context, 3D reverse engineering of CAD models from 3D scans is considered to be +one of the most sought-after goals for the CAD industry. However, recent +efforts assume multiple simplifications limiting the applications in real-world +settings. The SHARP Challenge 2023 aims at pushing the research a step closer +to the real-world scenario of CAD reverse engineering through dedicated +datasets and tracks. In this paper, we define the proposed SHARP 2023 tracks, +describe the provided datasets, and propose a set of baseline methods along +with suitable evaluation metrics to assess the performance of the track +solutions. All proposed datasets along with useful routines and the evaluation +metrics are publicly available. + +
+
+
+
+
+ + ☆ Finding-Aware Anatomical Tokens for Chest X-Ray Automated Reporting + + +
+ The task of radiology reporting comprises describing and interpreting the +medical findings in radiographic images, including description of their +location and appearance. Automated approaches to radiology reporting require +the image to be encoded into a suitable token representation for input to the +language model. Previous methods commonly use convolutional neural networks to +encode an image into a series of image-level feature map representations. +However, the generated reports often exhibit realistic style but imperfect +accuracy. Inspired by recent works for image captioning in the general domain +in which each visual token corresponds to an object detected in an image, we +investigate whether using local tokens corresponding to anatomical structures +can improve the quality of the generated reports. We introduce a novel +adaptation of Faster R-CNN in which finding detection is performed for the +candidate bounding boxes extracted during anatomical structure localisation. We +use the resulting bounding box feature representations as our set of +finding-aware anatomical tokens. This encourages the extracted anatomical +tokens to be informative about the findings they contain (required for the +final task of radiology reporting). Evaluating on the MIMIC-CXR dataset of +chest X-Ray images, we show that task-aware anatomical tokens give +state-of-the-art performance when integrated into an automated reporting +pipeline, yielding generated reports with improved clinical accuracy. + +
+
+
+
+
+ + ☆ Fusing Pseudo Labels with Weak Supervision for Dynamic Traffic Scenarios ICCV + + +
+ Advanced Driver Assistance Systems (ADAS) have made significant strides, +capitalizing on computer vision to enhance perception and decision-making +capabilities. Nonetheless, the adaptation of these systems to diverse traffic +scenarios poses challenges due to shifts in data distribution stemming from +factors such as location, weather, and road infrastructure. To tackle this, we +introduce a weakly-supervised label unification pipeline that amalgamates +pseudo labels from a multitude of object detection models trained on +heterogeneous datasets. Our pipeline engenders a unified label space through +the amalgamation of labels from disparate datasets, rectifying bias and +enhancing generalization. We fine-tune multiple object detection models on +individual datasets, subsequently crafting a unified dataset featuring pseudo +labels, meticulously validated for precision. Following this, we retrain a +solitary object detection model using the merged label space, culminating in a +resilient model proficient in dynamic traffic scenarios. We put forth a +comprehensive evaluation of our approach, employing diverse datasets +originating from varied Asian countries, effectively demonstrating its efficacy +in challenging road conditions. Notably, our method yields substantial +enhancements in object detection performance, culminating in a model with +heightened resistance against domain shifts. + +
+
+ comment: This work was accepted as an extended abstract at the International + Conference on Computer Vision (ICCV) 2023 BRAVO Workshop, Paris, France +
+
+
+
+
+ + ☆ Latency-aware Unified Dynamic Networks for Efficient Image Recognition + + +
+ Dynamic computation has emerged as a promising avenue to enhance the +inference efficiency of deep networks. It allows selective activation of +computational units, leading to a reduction in unnecessary computations for +each input sample. However, the actual efficiency of these dynamic models can +deviate from theoretical predictions. This mismatch arises from: 1) the lack of +a unified approach due to fragmented research; 2) the focus on algorithm design +over critical scheduling strategies, especially in CUDA-enabled GPU contexts; +and 3) challenges in measuring practical latency, given that most libraries +cater to static operations. Addressing these issues, we unveil the +Latency-Aware Unified Dynamic Networks (LAUDNet), a framework that integrates +three primary dynamic paradigms-spatially adaptive computation, dynamic layer +skipping, and dynamic channel skipping. To bridge the theoretical and practical +efficiency gap, LAUDNet merges algorithmic design with scheduling optimization, +guided by a latency predictor that accurately gauges dynamic operator latency. +We've tested LAUDNet across multiple vision tasks, demonstrating its capacity +to notably reduce the latency of models like ResNet-101 by over 50% on +platforms such as V100, RTX3090, and TX2 GPUs. Notably, LAUDNet stands out in +balancing accuracy and efficiency. Code is available at: +https://www.github.com/LeapLabTHU/LAUDNet. + +
+
+
+
+
+ + ☆ Stage-by-stage Wavelet Optimization Refinement Diffusion Model for + Sparse-View CT Reconstruction + + +
+ Diffusion models have emerged as potential tools to tackle the challenge of +sparse-view CT reconstruction, displaying superior performance compared to +conventional methods. Nevertheless, these prevailing diffusion models +predominantly focus on the sinogram or image domains, which can lead to +instability during model training, potentially culminating in convergence +towards local minimal solutions. The wavelet trans-form serves to disentangle +image contents and features into distinct frequency-component bands at varying +scales, adeptly capturing diverse directional structures. Employing the Wavelet +transform as a guiding sparsity prior significantly enhances the robustness of +diffusion models. In this study, we present an innovative approach named the +Stage-by-stage Wavelet Optimization Refinement Diffusion (SWORD) model for +sparse-view CT reconstruction. Specifically, we establish a unified +mathematical model integrating low-frequency and high-frequency generative +models, achieving the solution with optimization procedure. Furthermore, we +perform the low-frequency and high-frequency generative models on wavelet's +decomposed components rather than sinogram or image domains, ensuring the +stability of model training. Our method rooted in established optimization +theory, comprising three distinct stages, including low-frequency generation, +high-frequency refinement and domain transform. Our experimental results +demonstrate that the proposed method outperforms existing state-of-the-art +methods both quantitatively and qualitatively. + +
+
+
+
+
+ + ☆ AnoVL: Adapting Vision-Language Models for Unified Zero-shot Anomaly + Localization + + +
+ Contrastive Language-Image Pre-training (CLIP) models have shown promising +performance on zero-shot visual recognition tasks by learning visual +representations under natural language supervision. Recent studies attempt the +use of CLIP to tackle zero-shot anomaly detection by matching images with +normal and abnormal state prompts. However, since CLIP focuses on building +correspondence between paired text prompts and global image-level +representations, the lack of patch-level vision to text alignment limits its +capability on precise visual anomaly localization. In this work, we introduce a +training-free adaptation (TFA) framework of CLIP for zero-shot anomaly +localization. In the visual encoder, we innovate a training-free value-wise +attention mechanism to extract intrinsic local tokens of CLIP for patch-level +local description. From the perspective of text supervision, we particularly +design a unified domain-aware contrastive state prompting template. On top of +the proposed TFA, we further introduce a test-time adaptation (TTA) mechanism +to refine anomaly localization results, where a layer of trainable parameters +in the adapter is optimized using TFA's pseudo-labels and synthetic +noise-corrupted tokens. With both TFA and TTA adaptation, we significantly +exploit the potential of CLIP for zero-shot anomaly localization and +demonstrate the effectiveness of our proposed methods on various datasets. + +
+
+
+
+
+ + ☆ Attention-based CT Scan Interpolation for Lesion Segmentation of + Colorectal Liver Metastases + + +
+ Small liver lesions common to colorectal liver metastases (CRLMs) are +challenging for convolutional neural network (CNN) segmentation models, +especially when we have a wide range of slice thicknesses in the computed +tomography (CT) scans. Slice thickness of CT images may vary by clinical +indication. For example, thinner slices are used for presurgical planning when +fine anatomic details of small vessels are required. While keeping the +effective radiation dose in patients as low as possible, various slice +thicknesses are employed in CRLMs due to their limitations. However, +differences in slice thickness across CTs lead to significant performance +degradation in CT segmentation models based on CNNs. This paper proposes a +novel unsupervised attention-based interpolation model to generate intermediate +slices from consecutive triplet slices in CT scans. We integrate segmentation +loss during the interpolation model's training to leverage segmentation labels +in existing slices to generate middle ones. Unlike common interpolation +techniques in CT volumes, our model highlights the regions of interest (liver +and lesions) inside the abdominal CT scans in the interpolated slice. Moreover, +our model's outputs are consistent with the original input slices while +increasing the segmentation performance in two cutting-edge 3D segmentation +pipelines. We tested the proposed model on the CRLM dataset to upsample +subjects with thick slices and create isotropic volume for our segmentation +model. The produced isotropic dataset increases the Dice score in the +segmentation of lesions and outperforms other interpolation approaches in terms +of interpolation metrics. + +
+
+
+
+
+ + ☆ Physics-Informed DeepMRI: Bridging the Gap from Heat Diffusion to + k-Space Interpolation + + +
+ In the field of parallel imaging (PI), alongside image-domain regularization +methods, substantial research has been dedicated to exploring $k$-space +interpolation. However, the interpretability of these methods remains an +unresolved issue. Furthermore, these approaches currently face acceleration +limitations that are comparable to those experienced by image-domain methods. +In order to enhance interpretability and overcome the acceleration limitations, +this paper introduces an interpretable framework that unifies both $k$-space +interpolation techniques and image-domain methods, grounded in the physical +principles of heat diffusion equations. Building upon this foundational +framework, a novel $k$-space interpolation method is proposed. Specifically, we +model the process of high-frequency information attenuation in $k$-space as a +heat diffusion equation, while the effort to reconstruct high-frequency +information from low-frequency regions can be conceptualized as a reverse heat +equation. However, solving the reverse heat equation poses a challenging +inverse problem. To tackle this challenge, we modify the heat equation to align +with the principles of magnetic resonance PI physics and employ the score-based +generative method to precisely execute the modified reverse heat diffusion. +Finally, experimental validation conducted on publicly available datasets +demonstrates the superiority of the proposed approach over traditional +$k$-space interpolation methods, deep learning-based $k$-space interpolation +methods, and conventional diffusion models in terms of reconstruction accuracy, +particularly in high-frequency regions. + +
+
+
+
+
+ + ☆ On the Potential of CLIP for Compositional Logical Reasoning + + +
+ In this paper we explore the possibility of using OpenAI's CLIP to perform +logically coherent grounded visual reasoning. To that end, we formalize our +terms and give a geometric analysis of how embeddings in CLIP's latent space +would need to be configured in order for the system to be logically coherent. +Our main conclusion is that, as usually configured, CLIP cannot perform such +reasoning. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ Interpretability-guided Data Augmentation for Robust Segmentation in + Multi-centre Colonoscopy Data MICCAI 2023 + + +
+ Multi-centre colonoscopy images from various medical centres exhibit distinct +complicating factors and overlays that impact the image content, contingent on +the specific acquisition centre. Existing Deep Segmentation networks struggle +to achieve adequate generalizability in such data sets, and the currently +available data augmentation methods do not effectively address these sources of +data variability. As a solution, we introduce an innovative data augmentation +approach centred on interpretability saliency maps, aimed at enhancing the +generalizability of Deep Learning models within the realm of multi-centre +colonoscopy image segmentation. The proposed augmentation technique +demonstrates increased robustness across different segmentation models and +domains. Thorough testing on a publicly available multi-centre dataset for +polyp detection demonstrates the effectiveness and versatility of our approach, +which is observed both in quantitative and qualitative results. The code is +publicly available at: +https://github.com/nki-radiology/interpretability_augmentation + +
+
+ comment: 10 pages, 4 figures, 1 table, accepted at MICCAI 2023 Workshop on + Machine Learning in Medical Imaging (MLMI) +
+
+
+
+
+ + ☆ Feature Attention Network (FA-Net): A Deep-Learning Based Approach for + Underwater Single Image Enhancement + + +
+ Underwater image processing and analysis have been a hotspot of study in +recent years, as more emphasis has been focused to underwater monitoring and +usage of marine resources. Compared with the open environment, underwater image +encountered with more complicated conditions such as light abortion, +scattering, turbulence, nonuniform illumination and color diffusion. Although +considerable advances and enhancement techniques achieved in resolving these +issues, they treat low-frequency information equally across the entire channel, +which results in limiting the network's representativeness. We propose a deep +learning and feature-attention-based end-to-end network (FA-Net) to solve this +problem. In particular, we propose a Residual Feature Attention Block (RFAB), +containing the channel attention, pixel attention, and residual learning +mechanism with long and short skip connections. RFAB allows the network to +focus on learning high-frequency information while skipping low-frequency +information on multi-hop connections. The channel and pixel attention mechanism +considers each channel's different features and the uneven distribution of haze +over different pixels in the image. The experimental results shows that the +FA-Net propose by us provides higher accuracy, quantitatively and qualitatively +and superiority to previous state-of-the-art methods. + +
+
+ comment: Fourteenth International Conference on Digital Image Processing + (ICDIP 2022), 2022, Wuhan, China, May 20-23, 2022.8 pages.5 Figures.doi: + 10.1117/12.2644516 +
+
+
+
+
+ + ☆ Semi-supervised Domain Adaptation with Inter and Intra-domain Mixing for + Semantic Segmentation + + +
+ Despite recent advances in semantic segmentation, an inevitable challenge is +the performance degradation caused by the domain shift in real application. +Current dominant approach to solve this problem is unsupervised domain +adaptation (UDA). However, the absence of labeled target data in UDA is overly +restrictive and limits performance. To overcome this limitation, a more +practical scenario called semi-supervised domain adaptation (SSDA) has been +proposed. Existing SSDA methods are derived from the UDA paradigm and primarily +focus on leveraging the unlabeled target data and source data. In this paper, +we highlight the significance of exploiting the intra-domain information +between the limited labeled target data and unlabeled target data, as it +greatly benefits domain adaptation. Instead of solely using the scarce labeled +data for supervision, we propose a novel SSDA framework that incorporates both +inter-domain mixing and intra-domain mixing, where inter-domain mixing +mitigates the source-target domain gap and intra-domain mixing enriches the +available target domain information. By simultaneously learning from +inter-domain mixing and intra-domain mixing, the network can capture more +domain-invariant features and promote its performance on the target domain. We +also explore different domain mixing operations to better exploit the target +domain information. Comprehensive experiments conducted on the GTA5toCityscapes +and SYNTHIA2Cityscapes benchmarks demonstrate the effectiveness of our method, +surpassing previous methods by a large margin. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Zero-shot Inversion Process for Image Attribute Editing with Diffusion + Models + + +
+ Denoising diffusion models have shown outstanding performance in image +editing. Existing works tend to use either image-guided methods, which provide +a visual reference but lack control over semantic coherence, or text-guided +methods, which ensure faithfulness to text guidance but lack visual quality. To +address the problem, we propose the Zero-shot Inversion Process (ZIP), a +framework that injects a fusion of generated visual reference and text guidance +into the semantic latent space of a \textit{frozen} pre-trained diffusion +model. Only using a tiny neural network, the proposed ZIP produces diverse +content and attributes under the intuitive control of the text prompt. +Moreover, ZIP shows remarkable robustness for both in-domain and out-of-domain +attribute manipulation on real images. We perform detailed experiments on +various benchmark datasets. Compared to state-of-the-art methods, ZIP produces +images of equivalent quality while providing a realistic editing effect. + +
+
+
+
+
+ + ☆ Exploring Multi-Modal Contextual Knowledge for Open-Vocabulary Object + Detection + + +
+ In this paper, we for the first time explore helpful multi-modal contextual +knowledge to understand novel categories for open-vocabulary object detection +(OVD). The multi-modal contextual knowledge stands for the joint relationship +across regions and words. However, it is challenging to incorporate such +multi-modal contextual knowledge into OVD. The reason is that previous +detection frameworks fail to jointly model multi-modal contextual knowledge, as +object detectors only support vision inputs and no caption description is +provided at test time. To this end, we propose a multi-modal contextual +knowledge distillation framework, MMC-Det, to transfer the learned contextual +knowledge from a teacher fusion transformer with diverse multi-modal masked +language modeling (D-MLM) to a student detector. The diverse multi-modal masked +language modeling is realized by an object divergence constraint upon +traditional multi-modal masked language modeling (MLM), in order to extract +fine-grained region-level visual contexts, which are vital to object detection. +Extensive experiments performed upon various detection datasets show the +effectiveness of our multi-modal context learning strategy, where our approach +well outperforms the recent state-of-the-art methods. + +
+
+
+
+
+ + ☆ Reconstructing Groups of People with Hypergraph Relational Reasoning ICCV2023 + + +
+ Due to the mutual occlusion, severe scale variation, and complex spatial +distribution, the current multi-person mesh recovery methods cannot produce +accurate absolute body poses and shapes in large-scale crowded scenes. To +address the obstacles, we fully exploit crowd features for reconstructing +groups of people from a monocular image. A novel hypergraph relational +reasoning network is proposed to formulate the complex and high-order relation +correlations among individuals and groups in the crowd. We first extract +compact human features and location information from the original +high-resolution image. By conducting the relational reasoning on the extracted +individual features, the underlying crowd collectiveness and interaction +relationship can provide additional group information for the reconstruction. +Finally, the updated individual features and the localization information are +used to regress human meshes in camera coordinates. To facilitate the network +training, we further build pseudo ground-truth on two crowd datasets, which may +also promote future research on pose estimation and human behavior +understanding in crowded scenes. The experimental results show that our +approach outperforms other baseline methods both in crowded and common +scenarios. The code and datasets are publicly available at +https://github.com/boycehbz/GroupRec. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Utilizing Task-Generic Motion Prior to Recover Full-Body Motion from + Very Sparse Signals + + +
+ The most popular type of devices used to track a user's posture in a virtual +reality experience consists of a head-mounted display and two controllers held +in both hands. However, due to the limited number of tracking sensors (three in +total), faithfully recovering the user in full-body is challenging, limiting +the potential for interactions among simulated user avatars within the virtual +world. Therefore, recent studies have attempted to reconstruct full-body poses +using neural networks that utilize previously learned human poses or accept a +series of past poses over a short period. In this paper, we propose a method +that utilizes information from a neural motion prior to improve the accuracy of +reconstructed user's motions. Our approach aims to reconstruct user's full-body +poses by predicting the latent representation of the user's overall motion from +limited input signals and integrating this information with tracking sensor +inputs. This is based on the premise that the ultimate goal of pose +reconstruction is to reconstruct the motion, which is a series of poses. Our +results show that this integration enables more accurate reconstruction of the +user's full-body motion, particularly enhancing the robustness of lower body +motion reconstruction from impoverished signals. Web: +https://https://mjsh34.github.io/mp-sspe/ + +
+
+
+
+
+ + ☆ Early Detection of Red Palm Weevil Infestations using Deep Learning + Classification of Acoustic Signals + + +
+ The Red Palm Weevil (RPW), also known as the palm weevil, is considered among +the world's most damaging insect pests of palms. Current detection techniques +include the detection of symptoms of RPW using visual or sound inspection and +chemical detection of volatile signatures generated by infested palm trees. +However, efficient detection of RPW diseases at an early stage is considered +one of the most challenging issues for cultivating date palms. In this paper, +an efficient approach to the early detection of RPW is proposed. The proposed +approach is based on RPW sound activities being recorded and analyzed. The +first step involves the conversion of sound data into images based on a +selected set of features. The second step involves the combination of images +from the same sound file but computed by different features into a single +image. The third step involves the application of different Deep Learning (DL) +techniques to classify resulting images into two classes: infested and not +infested. Experimental results show good performances of the proposed approach +for RPW detection using different DL techniques, namely MobileNetV2, +ResNet50V2, ResNet152V2, VGG16, VGG19, DenseNet121, DenseNet201, Xception, and +InceptionV3. The proposed approach outperformed existing techniques for public +datasets. + +
+
+
+
+
+ + ☆ Introducing Language Guidance in Prompt-based Continual Learning ICCV 2023 + + +
+ Continual Learning aims to learn a single model on a sequence of tasks +without having access to data from previous tasks. The biggest challenge in the +domain still remains catastrophic forgetting: a loss in performance on seen +classes of earlier tasks. Some existing methods rely on an expensive replay +buffer to store a chunk of data from previous tasks. This, while promising, +becomes expensive when the number of tasks becomes large or data can not be +stored for privacy reasons. As an alternative, prompt-based methods have been +proposed that store the task information in a learnable prompt pool. This +prompt pool instructs a frozen image encoder on how to solve each task. While +the model faces a disjoint set of classes in each task in this setting, we +argue that these classes can be encoded to the same embedding space of a +pre-trained language encoder. In this work, we propose Language Guidance for +Prompt-based Continual Learning (LGCL) as a plug-in for prompt-based methods. +LGCL is model agnostic and introduces language guidance at the task level in +the prompt pool and at the class level on the output feature of the vision +encoder. We show with extensive experimentation that LGCL consistently improves +the performance of prompt-based continual learning methods to set a new +state-of-the art. LGCL achieves these performance improvements without needing +any additional learnable parameters. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ AMDNet23: A combined deep Contour-based Convolutional Neural Network and + Long Short Term Memory system to diagnose Age-related Macular Degeneration + + +
+ In light of the expanding population, an automated framework of disease +detection can assist doctors in the diagnosis of ocular diseases, yields +accurate, stable, rapid outcomes, and improves the success rate of early +detection. The work initially intended the enhancing the quality of fundus +images by employing an adaptive contrast enhancement algorithm (CLAHE) and +Gamma correction. In the preprocessing techniques, CLAHE elevates the local +contrast of the fundus image and gamma correction increases the intensity of +relevant features. This study operates on a AMDNet23 system of deep learning +that combined the neural networks made up of convolutions (CNN) and short-term +and long-term memory (LSTM) to automatically detect aged macular degeneration +(AMD) disease from fundus ophthalmology. In this mechanism, CNN is utilized for +extracting features and LSTM is utilized to detect the extracted features. The +dataset of this research is collected from multiple sources and afterward +applied quality assessment techniques, 2000 experimental fundus images +encompass four distinct classes equitably. The proposed hybrid deep AMDNet23 +model demonstrates to detection of AMD ocular disease and the experimental +result achieved an accuracy 96.50%, specificity 99.32%, sensitivity 96.5%, and +F1-score 96.49.0%. The system achieves state-of-the-art findings on fundus +imagery datasets to diagnose AMD ocular disease and findings effectively +potential of our method. + +
+
+
+
+
+ + ☆ Improving Underwater Visual Tracking With a Large Scale Dataset and + Image Enhancement + + +
+ This paper presents a new dataset and general tracker enhancement method for +Underwater Visual Object Tracking (UVOT). Despite its significance, underwater +tracking has remained unexplored due to data inaccessibility. It poses distinct +challenges; the underwater environment exhibits non-uniform lighting +conditions, low visibility, lack of sharpness, low contrast, camouflage, and +reflections from suspended particles. Performance of traditional tracking +methods designed primarily for terrestrial or open-air scenarios drops in such +conditions. We address the problem by proposing a novel underwater image +enhancement algorithm designed specifically to boost tracking quality. The +method has resulted in a significant performance improvement, of up to 5.0% +AUC, of state-of-the-art (SOTA) visual trackers. To develop robust and accurate +UVOT methods, large-scale datasets are required. To this end, we introduce a +large-scale UVOT benchmark dataset consisting of 400 video segments and 275,000 +manually annotated frames enabling underwater training and evaluation of deep +trackers. The videos are labelled with several underwater-specific tracking +attributes including watercolor variation, target distractors, camouflage, +target relative size, and low visibility conditions. The UVOT400 dataset, +tracking results, and the code are publicly available on: +https://github.com/BasitAlawode/UWVOT400. + +
+
+
+
+
+ + ☆ ACNPU: A 4.75TOPS/W 1080P@30FPS Super Resolution Accelerator with + Decoupled Asymmetric Convolution + + +
+ Deep learning-driven superresolution (SR) outperforms traditional techniques +but also faces the challenge of high complexity and memory bandwidth. This +challenge leads many accelerators to opt for simpler and shallow models like +FSRCNN, compromising performance for real-time needs, especially for +resource-limited edge devices. This paper proposes an energy-efficient SR +accelerator, ACNPU, to tackle this challenge. The ACNPU enhances image quality +by 0.34dB with a 27-layer model, but needs 36\% less complexity than FSRCNN, +while maintaining a similar model size, with the \textit{decoupled asymmetric +convolution and split-bypass structure}. The hardware-friendly 17K-parameter +model enables \textit{holistic model fusion} instead of localized layer fusion +to remove external DRAM access of intermediate feature maps. The on-chip memory +bandwidth is further reduced with the \textit{input stationary flow} and +\textit{parallel-layer execution} to reduce power consumption. Hardware is +regular and easy to control to support different layers by \textit{processing +elements (PEs) clusters with reconfigurable input and uniform data flow}. The +implementation in the 40 nm CMOS process consumes 2333 K gate counts and 198KB +SRAMs. The ACNPU achieves 31.7 FPS and 124.4 FPS for x2 and x4 scales Full-HD +generation, respectively, which attains 4.75 TOPS/W energy efficiency. + +
+
+ comment: 9 pages, 14 figures +
+
+
+
+
+ + ☆ Occlusion-Aware Detection and Re-ID Calibrated Network for Multi-Object + Tracking + + +
+ Multi-Object Tracking (MOT) is a crucial computer vision task that aims to +predict the bounding boxes and identities of objects simultaneously. While +state-of-the-art methods have made remarkable progress by jointly optimizing +the multi-task problems of detection and Re-ID feature learning, yet, few +approaches explore to tackle the occlusion issue, which is a long-standing +challenge in the MOT field. Generally, occluded objects may hinder the detector +from estimating the bounding boxes, resulting in fragmented trajectories. And +the learned occluded Re-ID embeddings are less distinct since they contain +interferer. To this end, we propose an occlusion-aware detection and Re-ID +calibrated network for multi-object tracking, termed as ORCTrack. Specifically, +we propose an Occlusion-Aware Attention (OAA) module in the detector that +highlights the object features while suppressing the occluded background +regions. OAA can serve as a modulator that enhances the detector for some +potentially occluded objects. Furthermore, we design a Re-ID embedding matching +block based on the optimal transport problem, which focuses on enhancing and +calibrating the Re-ID representations through different adjacent frames +complementarily. To validate the effectiveness of the proposed method, +extensive experiments are conducted on two challenging VisDrone2021-MOT and +KITTI benchmarks. Experimental evaluations demonstrate the superiority of our +approach, which can achieve new state-of-the-art performance and enjoy high +run-time efficiency. + +
+
+
+
+
+ + ☆ Neural Video Compression with Temporal Layer-Adaptive Hierarchical + B-frame Coding + + +
+ Neural video compression (NVC) is a rapidly evolving video coding research +area, with some models achieving superior coding efficiency compared to the +latest video coding standard Versatile Video Coding (VVC). In conventional +video coding standards, the hierarchical B-frame coding, which utilizes a +bidirectional prediction structure for higher compression, had been +well-studied and exploited. In NVC, however, limited research has investigated +the hierarchical B scheme. In this paper, we propose an NVC model exploiting +hierarchical B-frame coding with temporal layer-adaptive optimization. We first +extend an existing unidirectional NVC model to a bidirectional model, which +achieves -21.13% BD-rate gain over the unidirectional baseline model. However, +this model faces challenges when applied to sequences with complex or large +motions, leading to performance degradation. To address this, we introduce +temporal layer-adaptive optimization, incorporating methods such as temporal +layer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent +scaling (TALS). The final model with the proposed methods achieves an +impressive BD-rate gain of -39.86% against the baseline. It also resolves the +challenges in sequences with large or complex motions with up to -49.13% more +BD-rate gains than the simple bidirectional extension. This improvement is +attributed to the allocation of more bits to lower temporal layers, thereby +enhancing overall reconstruction quality with smaller bits. Since our method +has little dependency on a specific NVC model architecture, it can serve as a +general tool for extending unidirectional NVC models to the ones with +hierarchical B-frame coding. + +
+
+
+
+
+ + ☆ Large-scale data extraction from the UNOS organ donor documents + + +
+ The scope of our study is all UNOS data of the USA organ donors since 2008. +The data is not analyzable in a large scale in the past because it was captured +in PDF documents known as "Attachments", whereby every donor is represented by +dozens of PDF documents in heterogenous formats. To make the data analyzable, +one needs to convert the content inside these PDFs to an analyzable data +format, such as a standard SQL database. In this paper we will focus on 2022 +UNOS data comprised of $\approx 400,000$ PDF documents spanning millions of +pages. The totality of UNOS data covers 15 years (2008--20022) and our results +will be quickly extended to the entire data. Our method captures a portion of +the data in DCD flowsheets, kidney perfusion data, and data captured during +patient hospital stay (e.g. vital signs, ventilator settings, etc.). The +current paper assumes that the reader is familiar with the content of the UNOS +data. The overview of the types of data and challenges they present is a +subject of another paper. Here we focus on demonstrating that the goal of +building a comprehensive, analyzable database from UNOS documents is an +attainable task, and we provide an overview of our methodology. The project +resulted in datasets by far larger than previously available even in this +preliminary phase. + +
+
+
+
+
+ + ☆ Beard Segmentation and Recognition Bias + + +
+ A person's facial hairstyle, such as presence and size of beard, can +significantly impact face recognition accuracy. There are publicly-available +deep networks that achieve reasonable accuracy at binary attribute +classification, such as beard / no beard, but few if any that segment the +facial hair region. To investigate the effect of facial hair in a rigorous +manner, we first created a set of fine-grained facial hair annotations to train +a segmentation model and evaluate its accuracy across African-American and +Caucasian face images. We then use our facial hair segmentations to categorize +image pairs according to the degree of difference or similarity in the facial +hairstyle. We find that the False Match Rate (FMR) for image pairs with +different categories of facial hairstyle varies by a factor of over 10 for +African-American males and over 25 for Caucasian males. To reduce the bias +across image pairs with different facial hairstyles, we propose a scheme for +adaptive thresholding based on facial hairstyle similarity. Evaluation on a +subject-disjoint set of images shows that adaptive similarity thresholding +based on facial hairstyles of the image pair reduces the ratio between the +highest and lowest FMR across facial hairstyle categories for African-American +from 10.7 to 1.8 and for Caucasians from 25.9 to 1.3. Facial hair annotations +and facial hair segmentation model will be publicly available. + +
+
+
+
+
+ + ☆ Drone-NeRF: Efficient NeRF Based 3D Scene Reconstruction for Large-Scale + Drone Survey + + +
+ Neural rendering has garnered substantial attention owing to its capacity for +creating realistic 3D scenes. However, its applicability to extensive scenes +remains challenging, with limitations in effectiveness. In this work, we +propose the Drone-NeRF framework to enhance the efficient reconstruction of +unbounded large-scale scenes suited for drone oblique photography using Neural +Radiance Fields (NeRF). Our approach involves dividing the scene into uniform +sub-blocks based on camera position and depth visibility. Sub-scenes are +trained in parallel using NeRF, then merged for a complete scene. We refine the +model by optimizing camera poses and guiding NeRF with a uniform sampler. +Integrating chosen samples enhances accuracy. A hash-coded fusion MLP +accelerates density representation, yielding RGB and Depth outputs. Our +framework accounts for sub-scene constraints, reduces parallel-training noise, +handles shadow occlusion, and merges sub-regions for a polished rendering +result. This Drone-NeRF framework demonstrates promising capabilities in +addressing challenges related to scene complexity, rendering efficiency, and +accuracy in drone-obtained imagery. + +
+
+ comment: 15 pages, 7 figures, in submission +
+
+
+
+
+ + ☆ Background Debiased SAR Target Recognition via Causal Interventional + Regularizer + + +
+ Recent studies have utilized deep learning (DL) techniques to automatically +extract features from synthetic aperture radar (SAR) images, which shows great +promise for enhancing the performance of SAR automatic target recognition +(ATR). However, our research reveals a previously overlooked issue: SAR images +to be recognized include not only the foreground (i.e., the target), but also a +certain size of the background area. When a DL-model is trained exclusively on +foreground data, its recognition performance is significantly superior to a +model trained on original data that includes both foreground and background. +This suggests that the presence of background impedes the ability of the +DL-model to learn additional semantic information about the target. To address +this issue, we construct a structural causal model (SCM) that incorporates the +background as a confounder. Based on the constructed SCM, we propose a causal +intervention based regularization method to eliminate the negative impact of +background on feature semantic learning and achieve background debiased +SAR-ATR. The proposed causal interventional regularizer can be integrated into +any existing DL-based SAR-ATR models to mitigate the impact of background +interference on the feature extraction and recognition accuracy. Experimental +results on the Moving and Stationary Target Acquisition and Recognition (MSTAR) +dataset indicate that the proposed method can enhance the efficiency of +existing DL-based methods in a plug-and-play manner. + +
+
+ comment: 38 pages, 8 figures +
+
+
+
+
+ + ☆ Towards Earlier Detection of Oral Diseases On Smartphones Using Oral and + Dental RGB Images + + +
+ Oral diseases such as periodontal (gum) diseases and dental caries (cavities) +affect billions of people across the world today. However, previous +state-of-the-art models have relied on X-ray images to detect oral diseases, +making them inaccessible to remote monitoring, developing countries, and +telemedicine. To combat this overuse of X-ray imagery, we propose a lightweight +machine learning model capable of detecting calculus (also known as hardened +plaque or tartar) in RGB images while running efficiently on low-end devices. +The model, a modified MobileNetV3-Small neural network transfer learned from +ImageNet, achieved an accuracy of 72.73% (which is comparable to +state-of-the-art solutions) while still being able to run on mobile devices due +to its reduced memory requirements and processing times. A ResNet34-based model +was also constructed and achieved an accuracy of 81.82%. Both of these models +were tested on a mobile app, demonstrating their potential to limit the number +of serious oral disease cases as their predictions can help patients schedule +appointments earlier without the need to go to the clinic. + +
+
+ comment: 10 pages, 6 figures, 1 formula. This research was conducted as a + mentored project performed for a college course and research program at the + University of California Santa Barbara's Summer Research Academies program +
+
+
+
+
+ + ☆ Intriguing Properties of Diffusion Models: A Large-Scale Dataset for + Evaluating Natural Attack Capability in Text-to-Image Generative Models + + +
+ Denoising probabilistic diffusion models have shown breakthrough performance +that can generate more photo-realistic images or human-level illustrations than +the prior models such as GANs. This high image-generation capability has +stimulated the creation of many downstream applications in various areas. +However, we find that this technology is indeed a double-edged sword: We +identify a new type of attack, called the Natural Denoising Diffusion (NDD) +attack based on the finding that state-of-the-art deep neural network (DNN) +models still hold their prediction even if we intentionally remove their robust +features, which are essential to the human visual system (HVS), by text +prompts. The NDD attack can generate low-cost, model-agnostic, and +transferrable adversarial attacks by exploiting the natural attack capability +in diffusion models. Motivated by the finding, we construct a large-scale +dataset, Natural Denoising Diffusion Attack (NDDA) dataset, to systematically +evaluate the risk of the natural attack capability of diffusion models with +state-of-the-art text-to-image diffusion models. We evaluate the natural attack +capability by answering 6 research questions. Through a user study to confirm +the validity of the NDD attack, we find that the NDD attack can achieve an 88% +detection rate while being stealthy to 93% of human subjects. We also find that +the non-robust features embedded by diffusion models contribute to the natural +attack capability. To confirm the model-agnostic and transferrable attack +capability, we perform the NDD attack against an AD vehicle and find that 73% +of the physically printed attacks can be detected as a stop sign. We hope that +our study and dataset can help our community to be aware of the risk of +diffusion models and facilitate further research toward robust DNN models. + +
+
+
+
+
+ + ☆ CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts + + +
+ We present 'CongNaMul', a comprehensive dataset designed for various tasks in +soybean sprouts image analysis. The CongNaMul dataset is curated to facilitate +tasks such as image classification, semantic segmentation, decomposition, and +measurement of length and weight. The classification task provides four classes +to determine the quality of soybean sprouts: normal, broken, spotted, and +broken and spotted, for the development of AI-aided automatic quality +inspection technology. For semantic segmentation, images with varying +complexity, from single sprout images to images with multiple sprouts, along +with human-labelled mask images, are included. The label has 4 different +classes: background, head, body, tail. The dataset also provides images and +masks for the image decomposition task, including two separate sprout images +and their combined form. Lastly, 5 physical features of sprouts (head length, +body length, body thickness, tail length, weight) are provided for image-based +measurement tasks. This dataset is expected to be a valuable resource for a +wide range of research and applications in the advanced analysis of images of +soybean sprouts. Also, we hope that this dataset can assist researchers +studying classification, semantic segmentation, decomposition, and physical +feature measurement in other industrial fields, in evaluating their models. The +dataset is available at the authors' repository. (https://bhban.kr/data) + +
+
+ comment: Accepted to International Conference on ICT Convergence 2023 +
+
+
+
+
+ + ♻ ☆ Going Beyond Nouns With Vision & Language Models Using Synthetic Data ICCV 2023 + + +
+ Large-scale pre-trained Vision & Language (VL) models have shown remarkable +performance in many applications, enabling replacing a fixed set of supported +classes with zero-shot open vocabulary reasoning over (almost arbitrary) +natural language prompts. However, recent works have uncovered a fundamental +weakness of these models. For example, their difficulty to understand Visual +Language Concepts (VLC) that go 'beyond nouns' such as the meaning of +non-object words (e.g., attributes, actions, relations, states, etc.), or +difficulty in performing compositional reasoning such as understanding the +significance of the order of the words in a sentence. In this work, we +investigate to which extent purely synthetic data could be leveraged to teach +these models to overcome such shortcomings without compromising their zero-shot +capabilities. We contribute Synthetic Visual Concepts (SyViC) - a million-scale +synthetic dataset and data generation codebase allowing to generate additional +suitable data to improve VLC understanding and compositional reasoning of VL +models. Additionally, we propose a general VL finetuning strategy for +effectively leveraging SyViC towards achieving these improvements. Our +extensive experiments and ablations on VL-Checklist, Winoground, and ARO +benchmarks demonstrate that it is possible to adapt strong pre-trained VL +models with synthetic data significantly enhancing their VLC understanding +(e.g. by 9.9% on ARO and 4.3% on VL-Checklist) with under 1% drop in their +zero-shot accuracy. + +
+
+ comment: Accepted to ICCV 2023. Project page: https://synthetic-vic.github.io/ +
+
+
+
+
+ + ♻ ☆ CartiMorph: a framework for automated knee articular cartilage + morphometrics + + +
+ We introduce CartiMorph, a framework for automated knee articular cartilage +morphometrics. It takes an image as input and generates quantitative metrics +for cartilage subregions, including the percentage of full-thickness cartilage +loss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the +power of deep learning models for hierarchical image feature representation. +Deep learning models were trained and validated for tissue segmentation, +template construction, and template-to-image registration. We established +methods for surface-normal-based cartilage thickness mapping, FCL estimation, +and rule-based cartilage parcellation. Our cartilage thickness map showed less +error in thin and peripheral regions. We evaluated the effectiveness of the +adopted segmentation model by comparing the quantitative metrics obtained from +model segmentation and those from manual segmentation. The root-mean-squared +deviation of the FCL measurements was less than 8%, and strong correlations +were observed for the mean thickness (Pearson's correlation coefficient $\rho +\in [0.82,0.97]$), surface area ($\rho \in [0.82,0.98]$) and volume ($\rho \in +[0.89,0.98]$) measurements. We compared our FCL measurements with those from a +previous study and found that our measurements deviated less from the ground +truths. We observed superior performance of the proposed rule-based cartilage +parcellation method compared with the atlas-based approach. CartiMorph has the +potential to promote imaging biomarkers discovery for knee osteoarthritis. + +
+
+ comment: To be published in Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Uncertainty-Aware Source-Free Adaptive Image Super-Resolution with + Wavelet Augmentation Transformer + + +
+ Unsupervised Domain Adaptation (UDA) can effectively address domain gap +issues in real-world image Super-Resolution (SR) by accessing both the source +and target data. Considering privacy policies or transmission restrictions of +source data in practical scenarios, we propose a SOurce-free Domain Adaptation +framework for image SR (SODA-SR) to address this issue, i.e., adapt a +source-trained model to a target domain with only unlabeled target data. +SODA-SR leverages the source-trained model to generate refined pseudo-labels +for teacher-student learning. To better utilize pseudo-labels, we propose a +novel wavelet-based augmentation method, named Wavelet Augmentation Transformer +(WAT), which can be flexibly incorporated with existing networks, to implicitly +produce useful augmented data. WAT learns low-frequency information of varying +levels across diverse samples, which is aggregated efficiently via deformable +attention. Furthermore, an uncertainty-aware self-training mechanism is +proposed to improve the accuracy of pseudo-labels, with inaccurate predictions +being rectified by uncertainty estimation. To acquire better SR results and +avoid overfitting pseudo-labels, several regularization losses are proposed to +constrain target LR and SR images in the frequency domain. Experiments show +that without accessing source data, SODA-SR outperforms state-of-the-art UDA +methods in both synthetic$\rightarrow$real and real$\rightarrow$real adaptation +settings, and is not constrained by specific network architectures. + +
+
+ comment: 9 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Conditioning Diffusion Models via Attributes and Semantic Masks for Face + Generation + + +
+ Deep generative models have shown impressive results in generating realistic +images of faces. GANs managed to generate high-quality, high-fidelity images +when conditioned on semantic masks, but they still lack the ability to +diversify their output. Diffusion models partially solve this problem and are +able to generate diverse samples given the same condition. In this paper, we +propose a multi-conditioning approach for diffusion models via cross-attention +exploiting both attributes and semantic masks to generate high-quality and +controllable face images. We also studied the impact of applying +perceptual-focused loss weighting into the latent space instead of the pixel +space. Our method extends the previous approaches by introducing conditioning +on more than one set of features, guaranteeing a more fine-grained control over +the generated face images. We evaluate our approach on the CelebA-HQ dataset, +and we show that it can generate realistic and diverse samples while allowing +for fine-grained control over multiple attributes and semantic regions. +Additionally, we perform an ablation study to evaluate the impact of different +conditioning strategies on the quality and diversity of the generated images. + +
+
+
+
+
+ + ♻ ☆ What You Hear Is What You See: Audio Quality Metrics From Image Quality + Metrics + + +
+ In this study, we investigate the feasibility of utilizing state-of-the-art +image perceptual metrics for evaluating audio signals by representing them as +spectrograms. The encouraging outcome of the proposed approach is based on the +similarity between the neural mechanisms in the auditory and visual pathways. +Furthermore, we customise one of the metrics which has a psychoacoustically +plausible architecture to account for the peculiarities of sound signals. We +evaluate the effectiveness of our proposed metric and several baseline metrics +using a music dataset, with promising results in terms of the correlation +between the metrics and the perceived quality of audio as rated by human +evaluators. + +
+
+
+
+
+ + ♻ ☆ Context-VQA: Towards Context-Aware and Purposeful Visual Question + Answering ICCV 2023 + + +
+ Visual question answering (VQA) has the potential to make the Internet more +accessible in an interactive way, allowing people who cannot see images to ask +questions about them. However, multiple studies have shown that people who are +blind or have low-vision prefer image explanations that incorporate the context +in which an image appears, yet current VQA datasets focus on images in +isolation. We argue that VQA models will not fully succeed at meeting people's +needs unless they take context into account. To further motivate and analyze +the distinction between different contexts, we introduce Context-VQA, a VQA +dataset that pairs images with contexts, specifically types of websites (e.g., +a shopping website). We find that the types of questions vary systematically +across contexts. For example, images presented in a travel context garner 2 +times more "Where?" questions, and images on social media and news garner 2.8 +and 1.8 times more "Who?" questions than the average. We also find that context +effects are especially important when participants can't see the image. These +results demonstrate that context affects the types of questions asked and that +VQA models should be context-sensitive to better meet people's needs, +especially in accessibility settings. + +
+
+ comment: Proceedings of ICCV 2023 Workshop on Closing the Loop Between Vision + and Language +
+
+
+
+
+ + ♻ ☆ Priority-Centric Human Motion Generation in Discrete Latent Space ICCV2023 + + +
+ Text-to-motion generation is a formidable task, aiming to produce human +motions that align with the input text while also adhering to human +capabilities and physical laws. While there have been advancements in diffusion +models, their application in discrete spaces remains underexplored. Current +methods often overlook the varying significance of different motions, treating +them uniformly. It is essential to recognize that not all motions hold the same +relevance to a particular textual description. Some motions, being more salient +and informative, should be given precedence during generation. In response, we +introduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which +utilizes a Transformer-based VQ-VAE to derive a concise, discrete motion +representation, incorporating a global self-attention mechanism and a +regularization term to counteract code collapse. We also present a motion +discrete diffusion model that employs an innovative noise schedule, determined +by the significance of each motion token within the entire motion sequence. +This approach retains the most salient motions during the reverse diffusion +process, leading to more semantically rich and varied motions. Additionally, we +formulate two strategies to gauge the importance of motion tokens, drawing from +both textual and visual indicators. Comprehensive experiments on the HumanML3D +and KIT-ML datasets confirm that our model surpasses existing techniques in +fidelity and diversity, particularly for intricate textual descriptions. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ NeXtQSM -- A complete deep learning pipeline for data-consistent + quantitative susceptibility mapping trained with hybrid data + + +
+ Deep learning based Quantitative Susceptibility Mapping (QSM) has shown great +potential in recent years, obtaining similar results to established +non-learning approaches. Many current deep learning approaches are not data +consistent, require in vivo training data or solve the QSM problem in +consecutive steps resulting in the propagation of errors. Here we aim to +overcome these limitations and developed a framework to solve the QSM +processing steps jointly. We developed a new hybrid training data generation +method that enables the end-to-end training for solving background field +correction and dipole inversion in a data-consistent fashion using a +variational network that combines the QSM model term and a learned regularizer. +We demonstrate that NeXtQSM overcomes the limitations of previous deep learning +methods. NeXtQSM offers a new deep learning based pipeline for computing +quantitative susceptibility maps that integrates each processing step into the +training and provides results that are robust and fast. + +
+
+
+
+
+ + ♻ ☆ TAPIR: Tracking Any Point with per-frame Initialization and temporal + Refinement ICCV 2023 + + +
+ We present a novel model for Tracking Any Point (TAP) that effectively tracks +any queried point on any physical surface throughout a video sequence. Our +approach employs two stages: (1) a matching stage, which independently locates +a suitable candidate point match for the query point on every other frame, and +(2) a refinement stage, which updates both the trajectory and query features +based on local correlations. The resulting model surpasses all baseline methods +by a significant margin on the TAP-Vid benchmark, as demonstrated by an +approximate 20% absolute average Jaccard (AJ) improvement on DAVIS. Our model +facilitates fast inference on long and high-resolution video sequences. On a +modern GPU, our implementation has the capacity to track points faster than +real-time, and can be flexibly extended to higher-resolution videos. Given the +high-quality trajectories extracted from a large dataset, we demonstrate a +proof-of-concept diffusion model which generates trajectories from static +images, enabling plausible animations. Visualizations, source code, and +pretrained models can be found on our project webpage. + +
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ TriangleNet: Edge Prior Augmented Network for Semantic Segmentation + through Cross-Task Consistency + + +
+ This paper addresses the task of semantic segmentation in computer vision, +aiming to achieve precise pixel-wise classification. We investigate the joint +training of models for semantic edge detection and semantic segmentation, which +has shown promise. However, implicit cross-task consistency learning in +multi-task networks is limited. To address this, we propose a novel "decoupled +cross-task consistency loss" that explicitly enhances cross-task consistency. +Our semantic segmentation network, TriangleNet, achieves a substantial 2.88\% +improvement over the Baseline in mean Intersection over Union (mIoU) on the +Cityscapes test set. Notably, TriangleNet operates at 77.4\% mIoU/46.2 FPS on +Cityscapes, showcasing real-time inference capabilities at full resolution. +With multi-scale inference, performance is further enhanced to 77.8\%. +Furthermore, TriangleNet consistently outperforms the Baseline on the FloodNet +dataset, demonstrating its robust generalization capabilities. The proposed +method underscores the significance of multi-task learning and explicit +cross-task consistency enhancement for advancing semantic segmentation and +highlights the potential of multitasking in real-time semantic segmentation. + +
+
+ comment: Accepted for publication in the journal "International Journal of + Intelligent Systems" +
+
+
+
+
+ + ♻ ☆ DREAM: Efficient Dataset Distillation by Representative Matching + + +
+ Dataset distillation aims to synthesize small datasets with little +information loss from original large-scale ones for reducing storage and +training costs. Recent state-of-the-art methods mainly constrain the sample +synthesis process by matching synthetic images and the original ones regarding +gradients, embedding distributions, or training trajectories. Although there +are various matching objectives, currently the strategy for selecting original +images is limited to naive random sampling. + We argue that random sampling overlooks the evenness of the selected sample +distribution, which may result in noisy or biased matching targets. + Besides, the sample diversity is also not constrained by random sampling. +These factors together lead to optimization instability in the distilling +process and degrade the training efficiency. Accordingly, we propose a novel +matching strategy named as \textbf{D}ataset distillation by +\textbf{RE}present\textbf{A}tive \textbf{M}atching (DREAM), where only +representative original images are selected for matching. DREAM is able to be +easily plugged into popular dataset distillation frameworks and reduce the +distilling iterations by more than 8 times without performance drop. Given +sufficient training time, DREAM further provides significant improvements and +achieves state-of-the-art performances. + +
+
+ comment: Efficient matching for dataset distillation +
+
+
+
+
+ + ♻ ☆ LAC -- Latent Action Composition for Skeleton-based Action Segmentation ICCV 2023 + + +
+ Skeleton-based action segmentation requires recognizing composable actions in +untrimmed videos. Current approaches decouple this problem by first extracting +local visual features from skeleton sequences and then processing them by a +temporal model to classify frame-wise actions. However, their performances +remain limited as the visual features cannot sufficiently express composable +actions. In this context, we propose Latent Action Composition (LAC), a novel +self-supervised framework aiming at learning from synthesized composable +motions for skeleton-based action segmentation. LAC is composed of a novel +generation module towards synthesizing new sequences. Specifically, we design a +linear latent space in the generator to represent primitive motion. New +composed motions can be synthesized by simply performing arithmetic operations +on latent representations of multiple input skeleton sequences. LAC leverages +such synthesized sequences, which have large diversity and complexity, for +learning visual representations of skeletons in both sequence and frame spaces +via contrastive learning. The resulting visual encoder has a high expressive +power and can be effectively transferred onto action segmentation tasks by +end-to-end fine-tuning without the need for additional temporal models. We +conduct a study focusing on transfer-learning and we show that representations +learned from pre-trained LAC outperform the state-of-the-art by a large margin +on TSU, Charades, PKU-MMD datasets. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Exploring the Benefits of Visual Prompting in Differential Privacy ICCV 2023 + + +
+ Visual Prompting (VP) is an emerging and powerful technique that allows +sample-efficient adaptation to downstream tasks by engineering a well-trained +frozen source model. In this work, we explore the benefits of VP in +constructing compelling neural network classifiers with differential privacy +(DP). We explore and integrate VP into canonical DP training methods and +demonstrate its simplicity and efficiency. In particular, we discover that VP +in tandem with PATE, a state-of-the-art DP training method that leverages the +knowledge transfer from an ensemble of teachers, achieves the state-of-the-art +privacy-utility trade-off with minimum expenditure of privacy budget. Moreover, +we conduct additional experiments on cross-domain image classification with a +sufficient domain gap to further unveil the advantage of VP in DP. Lastly, we +also conduct extensive ablation studies to validate the effectiveness and +contribution of VP under DP consideration. Our code is available at +(https://github.com/EzzzLi/Prompt-PATE). + +
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DeltaNN: Assessing the Impact of Computational Environment Parameters on + the Performance of Image Recognition Models + + +
+ Image recognition tasks typically use deep learning and require enormous +processing power, thus relying on hardware accelerators like GPUs and TPUs for +fast, timely processing. Failure in real-time image recognition tasks can occur +due to sub-optimal mapping on hardware accelerators during model deployment, +which may lead to timing uncertainty and erroneous behavior. Mapping on +hardware accelerators is done using multiple software components like deep +learning frameworks, compilers, and device libraries, that we refer to as the +computational environment. Owing to the increased use of image recognition +tasks in safety-critical applications like autonomous driving and medical +imaging, it is imperative to assess their robustness to changes in the +computational environment, as the impact of parameters like deep learning +frameworks, compiler optimizations, and hardware devices on model performance +and correctness is not yet well understood. + In this paper we present a differential testing framework, DeltaNN, that +allows us to assess the impact of different computational environment +parameters on the performance of image recognition models during deployment, +post training. DeltaNN generates different implementations of a given image +recognition model for variations in environment parameters, namely, deep +learning frameworks, compiler optimizations and hardware devices and analyzes +differences in model performance as a result. Using DeltaNN, we conduct an +empirical study of robustness analysis of three popular image recognition +models using the ImageNet dataset. We report the impact in terms of +misclassifications and inference time differences across different settings. In +total, we observed up to 72% output label differences across deep learning +frameworks, and up to 81% unexpected performance degradation in terms of +inference time, when applying compiler optimizations. + +
+
+ comment: 11 pages, 10 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Laughing Matters: Introducing Laughing-Face Generation using Diffusion + Models + + +
+ Speech-driven animation has gained significant traction in recent years, with +current methods achieving near-photorealistic results. However, the field +remains underexplored regarding non-verbal communication despite evidence +demonstrating its importance in human interaction. In particular, generating +laughter sequences presents a unique challenge due to the intricacy and nuances +of this behaviour. This paper aims to bridge this gap by proposing a novel +model capable of generating realistic laughter sequences, given a still +portrait and an audio clip containing laughter. We highlight the failure cases +of traditional facial animation methods and leverage recent advances in +diffusion models to produce convincing laughter videos. We train our model on a +diverse set of laughter datasets and introduce an evaluation metric +specifically designed for laughter. When compared with previous speech-driven +approaches, our model achieves state-of-the-art performance across all metrics, +even when these are re-trained for laughter generation. Our code and project +are publicly available + +
+
+
+
+
+ + ♻ ☆ Discriminator-free Unsupervised Domain Adaptation for Multi-label Image + Classification + + +
+ In this paper, a discriminator-free adversarial-based Unsupervised Domain +Adaptation (UDA) for Multi-Label Image Classification (MLIC) referred to as +DDA-MLIC is proposed. Recently, some attempts have been made for introducing +adversarial-based UDA methods in the context of MLIC. However, these methods +which rely on an additional discriminator subnet present one major shortcoming. +The learning of domain-invariant features may harm their task-specific +discriminative power, since the classification and discrimination tasks are +decoupled. Herein, we propose to overcome this issue by introducing a novel +adversarial critic that is directly deduced from the task-specific classifier. +Specifically, a two-component Gaussian Mixture Model (GMM) is fitted on the +source and target predictions in order to distinguish between two clusters. +This allows extracting a Gaussian distribution for each component. The +resulting Gaussian distributions are then used for formulating an adversarial +loss based on a Frechet distance. The proposed method is evaluated on several +multi-label image datasets covering three different types of domain shift. The +obtained results demonstrate that DDA-MLIC outperforms existing +state-of-the-art methods in terms of precision while requiring a lower number +of parameters. The code will be made publicly available online. + +
+
+
+
+
+ + ♻ ☆ Fault Localization for Buggy Deep Learning Framework Conversions in + Image Recognition + + +
+ When deploying Deep Neural Networks (DNNs), developers often convert models +from one deep learning framework to another (e.g., TensorFlow to PyTorch). +However, this process is error-prone and can impact target model accuracy. To +identify the extent of such impact, we perform and briefly present a +differential analysis against three DNNs widely used for image recognition +(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep +learning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which +revealed numerous model crashes and output label discrepancies of up to 72%. To +mitigate such errors, we present a novel approach towards fault localization +and repair of buggy deep learning framework conversions, focusing on +pre-trained image recognition models. Our technique consists of four stages of +analysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters, +and 4) graph representation. In addition, we propose various strategies towards +fault repair of the faults detected. We implement our technique on top of the +Apache TVM deep learning compiler, and we test it by conducting a preliminary +fault localization analysis for the conversion of InceptionV3 from TF to +TFLite. Our approach detected a fault in a common DNN converter tool, which +introduced precision errors in weights, reducing model accuracy. After our +fault localization, we repaired the issue, reducing our conversion error to +zero. + +
+
+ comment: 5 pages, 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Nonrigid Object Contact Estimation With Regional Unwrapping Transformer ICCV2023 + + +
+ Acquiring contact patterns between hands and nonrigid objects is a common +concern in the vision and robotics community. However, existing learning-based +methods focus more on contact with rigid ones from monocular images. When +adopting them for nonrigid contact, a major problem is that the existing +contact representation is restricted by the geometry of the object. +Consequently, contact neighborhoods are stored in an unordered manner and +contact features are difficult to align with image cues. At the core of our +approach lies a novel hand-object contact representation called RUPs (Region +Unwrapping Profiles), which unwrap the roughly estimated hand-object surfaces +as multiple high-resolution 2D regional profiles. The region grouping strategy +is consistent with the hand kinematic bone division because they are the +primitive initiators for a composite contact pattern. Based on this +representation, our Regional Unwrapping Transformer (RUFormer) learns the +correlation priors across regions from monocular inputs and predicts +corresponding contact and deformed transformations. Our experiments demonstrate +that the proposed framework can robustly estimate the deformed degrees and +deformed transformations, which makes it suitable for both nonrigid and rigid +contact. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ How Good is Google Bard's Visual Understanding? An Empirical Study on + Open Challenges + + +
+ Google's Bard has emerged as a formidable competitor to OpenAI's ChatGPT in +the field of conversational AI. Notably, Bard has recently been updated to +handle visual inputs alongside text prompts during conversations. Given Bard's +impressive track record in handling textual inputs, we explore its capabilities +in understanding and interpreting visual data (images) conditioned by text +questions. This exploration holds the potential to unveil new insights and +challenges for Bard and other forthcoming multi-modal Generative models, +especially in addressing complex computer vision problems that demand accurate +visual and language understanding. Specifically, in this study, we focus on 15 +diverse task scenarios encompassing regular, camouflaged, medical, under-water +and remote sensing data to comprehensively evaluate Bard's performance. Our +primary finding indicates that Bard still struggles in these vision scenarios, +highlighting the significant gap in vision-based understanding that needs to be +bridged in future developments. We expect that this empirical study will prove +valuable in advancing future models, leading to enhanced capabilities in +comprehending and interpreting fine-grained visual data. Our project is +released on https://github.com/htqin/GoogleBard-VisUnderstand + +
+
+
+
+
+ + ♻ ☆ MB-TaylorFormer: Multi-branch Efficient Transformer Expanded by Taylor + Formula for Image Dehazing ICCV 2023 + + +
+ In recent years, Transformer networks are beginning to replace pure +convolutional neural networks (CNNs) in the field of computer vision due to +their global receptive field and adaptability to input. However, the quadratic +computational complexity of softmax-attention limits the wide application in +image dehazing task, especially for high-resolution images. To address this +issue, we propose a new Transformer variant, which applies the Taylor expansion +to approximate the softmax-attention and achieves linear computational +complexity. A multi-scale attention refinement module is proposed as a +complement to correct the error of the Taylor expansion. Furthermore, we +introduce a multi-branch architecture with multi-scale patch embedding to the +proposed Transformer, which embeds features by overlapping deformable +convolution of different scales. The design of multi-scale patch embedding is +based on three key ideas: 1) various sizes of the receptive field; 2) +multi-level semantic information; 3) flexible shapes of the receptive field. +Our model, named Multi-branch Transformer expanded by Taylor formula +(MB-TaylorFormer), can embed coarse to fine features more flexibly at the patch +embedding stage and capture long-distance pixel interactions with limited +computational cost. Experimental results on several dehazing benchmarks show +that MB-TaylorFormer achieves state-of-the-art (SOTA) performance with a light +computational burden. The source code and pre-trained models are available at +https://github.com/FVL2020/ICCV-2023-MB-TaylorFormer. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SegViTv2: Exploring Efficient and Continual Semantic Segmentation with + Plain Vision Transformers + + +
+ This paper investigates the capability of plain Vision Transformers (ViTs) +for semantic segmentation using the encoder-decoder framework and introduces +\textbf{SegViTv2}. In this study, we introduce a novel Attention-to-Mask (\atm) +module to design a lightweight decoder effective for plain ViT. The proposed +ATM converts the global attention map into semantic masks for high-quality +segmentation results. Our decoder outperforms the popular decoder UPerNet using +various ViT backbones while consuming only about $5\%$ of the computational +cost. For the encoder, we address the concern of the relatively high +computational cost in the ViT-based encoders and propose a \emph{Shrunk++} +structure that incorporates edge-aware query-based down-sampling (EQD) and +query-based upsampling (QU) modules. The Shrunk++ structure reduces the +computational cost of the encoder by up to $50\%$ while maintaining competitive +performance. Furthermore, we propose to adapt SegViT for continual semantic +segmentation, demonstrating nearly zero forgetting of previously learned +knowledge. Experiments show that our proposed SegViTv2 surpasses recent +segmentation methods on three popular benchmarks including ADE20k, +COCO-Stuff-10k and PASCAL-Context datasets. The code is available through the +following link: \url{https://github.com/zbwxp/SegVit}. + +
+
+ comment: IJCV 2023 accepted, 21 pages, 8 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ BinaryViT: Towards Efficient and Accurate Binary Vision Transformers + + +
+ Vision Transformers (ViTs) have emerged as the fundamental architecture for +most computer vision fields, but the considerable memory and computation costs +hinders their application on resource-limited devices. As one of the most +powerful compression methods, binarization reduces the computation of the +neural network by quantizing the weights and activation values as $\pm$1. +Although existing binarization methods have demonstrated excellent performance +on Convolutional Neural Networks (CNNs), the full binarization of ViTs is still +under-studied and suffering a significant performance drop. In this paper, we +first argue empirically that the severe performance degradation is mainly +caused by the weight oscillation in the binarization training and the +information distortion in the activation of ViTs. Based on these analyses, we +propose $\textbf{BinaryViT}$, an accurate full binarization scheme for ViTs, +which pushes the quantization of ViTs to the limit. Specifically, we propose a +novel gradient regularization scheme (GRS) for driving a bimodal distribution +of the weights to reduce oscillation in binarization training. Moreover, we +design an activation shift module (ASM) to adaptively tune the activation +distribution to reduce the information distortion caused by binarization. +Extensive experiments on ImageNet dataset show that our BinaryViT consistently +surpasses the strong baseline by 2.05% and improve the accuracy of fully +binarized ViTs to a usable level. Furthermore, our method achieves impressive +savings of 16.2$\times$ and 17.7$\times$ in model size and OPs compared to the +full-precision DeiT-S. + +
+
+
+
+
+ + ♻ ☆ Evaluating the Quality and Diversity of DCGAN-based Generatively + Synthesized Diabetic Retinopathy Imagery + + +
+ Publicly available diabetic retinopathy (DR) datasets are imbalanced, +containing limited numbers of images with DR. This imbalance contributes to +overfitting when training machine learning classifiers. The impact of this +imbalance is exacerbated as the severity of the DR stage increases, affecting +the classifiers' diagnostic capacity. The imbalance can be addressed using +Generative Adversarial Networks (GANs) to augment the datasets with synthetic +images. Generating synthetic images is advantageous if high-quality and +diversified images are produced. To evaluate the quality and diversity of +synthetic images, several evaluation metrics, such as Multi-Scale Structural +Similarity Index (MS-SSIM), Cosine Distance (CD), and Fr\'echet Inception +Distance (FID) are used. Understanding the effectiveness of each metric in +evaluating the quality and diversity of GAN-based synthetic images is critical +to select images for augmentation. To date, there has been limited analysis of +the appropriateness of these metrics in the context of biomedical imagery. This +work contributes an empirical assessment of these evaluation metrics as applied +to synthetic Proliferative DR imagery generated by a Deep Convolutional GAN +(DCGAN). Furthermore, the metrics' capacity to indicate the quality and +diversity of synthetic images and a correlation with classifier performance is +undertaken. This enables a quantitative selection of synthetic imagery and an +informed augmentation strategy. Results indicate that FID is suitable for +evaluating the quality, while MS-SSIM and CD are suitable for evaluating the +diversity of synthetic imagery. Furthermore, the superior performance of +Convolutional Neural Network (CNN) and EfficientNet classifiers, as indicated +by the F1 and AUC scores, for the augmented datasets demonstrates the efficacy +of synthetic imagery to augment the imbalanced dataset. + +
+
+ comment: 29 Pages, 8 Figures, submitted to MEDAL23: Advances in Deep + Generative Models for Medical Artificial Intelligence (Springer Nature + series) +
+
+
+
+
+ + ♻ ☆ Food Classification using Joint Representation of Visual and Textual + Data + + +
+ Food classification is an important task in health care. In this work, we +propose a multimodal classification framework that uses the modified version of +EfficientNet with the Mish activation function for image classification, and +the traditional BERT transformer-based network is used for text classification. +The proposed network and the other state-of-the-art methods are evaluated on a +large open-source dataset, UPMC Food-101. The experimental results show that +the proposed network outperforms the other methods, a significant difference of +11.57% and 6.34% in accuracy is observed for image and text classification, +respectively, when compared with the second-best performing method. We also +compared the performance in terms of accuracy, precision, and recall for text +classification using both machine learning and deep learning-based models. The +comparative analysis from the prediction results of both images and text +demonstrated the efficiency and robustness of the proposed approach. + +
+
+ comment: Updated results and discussions to be posted and some sections needed + to be expanded +
+
+
+
+
+ + ♻ ☆ NBV-SC: Next Best View Planning based on Shape Completion for Fruit + Mapping and Reconstruction + + +
+ Active perception for fruit mapping and harvesting is a difficult task since +occlusions occur frequently and the location as well as size of fruits change +over time. State-of-the-art viewpoint planning approaches utilize +computationally expensive ray casting operations to find good viewpoints aiming +at maximizing information gain and covering the fruits in the scene. In this +paper, we present a novel viewpoint planning approach that explicitly uses +information about the predicted fruit shapes to compute targeted viewpoints +that observe as yet unobserved parts of the fruits. Furthermore, we formulate +the concept of viewpoint dissimilarity to reduce the sampling space for more +efficient selection of useful, dissimilar viewpoints. Our simulation +experiments with a UR5e arm equipped with an RGB-D sensor provide a +quantitative demonstration of the efficacy of our iterative next best view +planning method based on shape completion. In comparative experiments with a +state-of-the-art viewpoint planner, we demonstrate improvement not only in the +estimation of the fruit sizes, but also in their reconstruction, while +significantly reducing the planning time. Finally, we show the viability of our +approach for mapping sweet peppers plants with a real robotic system in a +commercial glasshouse. + +
+
+ comment: Agricultural Automation, Viewpoint Planning, Active Perception, Shape + Completion +
+
+
+
+
+ + ♻ ☆ Three-stage binarization of color document images based on discrete + wavelet transform and generative adversarial networks + + +
+ The efficient segmentation of foreground text information from the background +in degraded color document images is a critical challenge in the preservation +of ancient manuscripts. The imperfect preservation of ancient manuscripts over +time has led to various types of degradation, such as staining, yellowing, and +ink seepage, significantly affecting image binarization results. This work +proposes a three-stage method using Generative Adversarial Networks (GAN) for +enhancing and binarizing degraded color document images through Discrete +Wavelet Transform (DWT). Stage-1 involves applying DWT and retaining the +Low-Low (LL) subband images for image enhancement. In Stage-2, the original +input image is divided into four single-channel images (Red, Green, Blue, and +Gray), and each is trained with independent adversarial networks to extract +color foreground information. In Stage-3, the output image from Stage-2 and the +original input image are used to train independent adversarial networks for +document binarization, enabling the integration of global and local features. +The experimental results demonstrate that our proposed method outperforms other +classic and state-of-the-art (SOTA) methods on the Document Image Binarization +Contest (DIBCO) datasets. We have released our implementation code at +https://github.com/abcpp12383/ThreeStageBinarization. + +
+
+
+
+
+ + ♻ ☆ Implicit neural representation for change detection + + +
+ Identifying changes in a pair of 3D aerial LiDAR point clouds, obtained +during two distinct time periods over the same geographic region presents a +significant challenge due to the disparities in spatial coverage and the +presence of noise in the acquisition system. The most commonly used approaches +to detecting changes in point clouds are based on supervised methods which +necessitate extensive labelled data often unavailable in real-world +applications. To address these issues, we propose an unsupervised approach that +comprises two components: Implicit Neural Representation (INR) for continuous +shape reconstruction and a Gaussian Mixture Model for categorising changes. INR +offers a grid-agnostic representation for encoding bi-temporal point clouds, +with unmatched spatial support that can be regularised to enhance +high-frequency details and reduce noise. The reconstructions at each timestamp +are compared at arbitrary spatial scales, leading to a significant increase in +detection capabilities. We apply our method to a benchmark dataset comprising +simulated LiDAR point clouds for urban sprawling. This dataset encompasses +diverse challenging scenarios, varying in resolutions, input modalities and +noise levels. This enables a comprehensive multi-scenario evaluation, comparing +our method with the current state-of-the-art approach. We outperform the +previous methods by a margin of 10% in the intersection over union metric. In +addition, we put our techniques to practical use by applying them in a +real-world scenario to identify instances of illicit excavation of +archaeological sites and validate our results by comparing them with findings +from field experts. + +
+
+ comment: Main article is 10 pages + 6 pages of supplementary. Conference style + paper +
+
+
+
+
+ + ♻ ☆ What do neural networks learn in image classification? A frequency + shortcut perspective ICCV2023 + + +
+ Frequency analysis is useful for understanding the mechanisms of +representation learning in neural networks (NNs). Most research in this area +focuses on the learning dynamics of NNs for regression tasks, while little for +classification. This study empirically investigates the latter and expands the +understanding of frequency shortcuts. First, we perform experiments on +synthetic datasets, designed to have a bias in different frequency bands. Our +results demonstrate that NNs tend to find simple solutions for classification, +and what they learn first during training depends on the most distinctive +frequency characteristics, which can be either low- or high-frequencies. +Second, we confirm this phenomenon on natural images. We propose a metric to +measure class-wise frequency characteristics and a method to identify frequency +shortcuts. The results show that frequency shortcuts can be texture-based or +shape-based, depending on what best simplifies the objective. Third, we +validate the transferability of frequency shortcuts on out-of-distribution +(OOD) test sets. Our results suggest that frequency shortcuts can be +transferred across datasets and cannot be fully avoided by larger model +capacity and data augmentation. We recommend that future research should focus +on effective training schemes mitigating frequency shortcut learning. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Poincaré ResNet + + +
+ This paper introduces an end-to-end residual network that operates entirely +on the Poincar\'e ball model of hyperbolic space. Hyperbolic learning has +recently shown great potential for visual understanding, but is currently only +performed in the penultimate layer(s) of deep networks. All visual +representations are still learned through standard Euclidean networks. In this +paper we investigate how to learn hyperbolic representations of visual data +directly from the pixel-level. We propose Poincar\'e ResNet, a hyperbolic +counterpart of the celebrated residual network, starting from Poincar\'e 2D +convolutions up to Poincar\'e residual connections. We identify three +roadblocks for training convolutional networks entirely in hyperbolic space and +propose a solution for each: (i) Current hyperbolic network initializations +collapse to the origin, limiting their applicability in deeper networks. We +provide an identity-based initialization that preserves norms over many layers. +(ii) Residual networks rely heavily on batch normalization, which comes with +expensive Fr\'echet mean calculations in hyperbolic space. We introduce +Poincar\'e midpoint batch normalization as a faster and equally effective +alternative. (iii) Due to the many intermediate operations in Poincar\'e +layers, we lastly find that the computation graphs of deep learning libraries +blow up, limiting our ability to train on deep hyperbolic networks. We provide +manual backward derivations of core hyperbolic operations to maintain +manageable computation graphs. + +
+
+ comment: International Conference on Computer Vision 2023 +
+
+
+
+
+ + ♻ ☆ Dynamic Depth-Supervised NeRF for Multi-View RGB-D Operating Room Images + + +
+ The operating room (OR) is an environment of interest for the development of +sensing systems, enabling the detection of people, objects, and their semantic +relations. Due to frequent occlusions in the OR, these systems often rely on +input from multiple cameras. While increasing the number of cameras generally +increases algorithm performance, there are hard limitations to the number and +locations of cameras in the OR. Neural Radiance Fields (NeRF) can be used to +render synthetic views from arbitrary camera positions, virtually enlarging the +number of cameras in the dataset. In this work, we explore the use of NeRF for +view synthesis of dynamic scenes in the OR, and we show that regularisation +with depth supervision from RGB-D sensor data results in higher image quality. +We optimise a dynamic depth-supervised NeRF with up to six synchronised cameras +that capture the surgical field in five distinct phases before and during a +knee replacement surgery. We qualitatively inspect views rendered by a virtual +camera that moves 180 degrees around the surgical field at differing time +values. Quantitatively, we evaluate view synthesis from an unseen camera +position in terms of PSNR, SSIM and LPIPS for the colour channels and in MAE +and error percentage for the estimated depth. We find that NeRFs can be used to +generate geometrically consistent views, also from interpolated camera +positions and at interpolated time intervals. Views are generated from an +unseen camera pose with an average PSNR of 18.2 and a depth estimation error of +2.0%. Our results show the potential of a dynamic NeRF for view synthesis in +the OR and stress the relevance of depth supervision in a clinical setting. + +
+
+ comment: Accepted to the Workshop on Ambient Intelligence for HealthCare 2023 +
+
+
+
+
+ + ♻ ☆ NSF: Neural Surface Fields for Human Modeling from Monocular Depth ICCV 2023 + + +
+ Obtaining personalized 3D animatable avatars from a monocular camera has +several real world applications in gaming, virtual try-on, animation, and +VR/XR, etc. However, it is very challenging to model dynamic and fine-grained +clothing deformations from such sparse data. Existing methods for modeling 3D +humans from depth data have limitations in terms of computational efficiency, +mesh coherency, and flexibility in resolution and topology. For instance, +reconstructing shapes using implicit functions and extracting explicit meshes +per frame is computationally expensive and cannot ensure coherent meshes across +frames. Moreover, predicting per-vertex deformations on a pre-designed human +template with a discrete surface lacks flexibility in resolution and topology. +To overcome these limitations, we propose a novel method `\keyfeature: Neural +Surface Fields' for modeling 3D clothed humans from monocular depth. NSF +defines a neural field solely on the base surface which models a continuous and +flexible displacement field. NSF can be adapted to the base surface with +different resolution and topology without retraining at inference time. +Compared to existing approaches, our method eliminates the expensive per-frame +surface extraction while maintaining mesh coherency, and is capable of +reconstructing meshes with arbitrary resolution without retraining. To foster +research in this direction, we release our code in project page at: +https://yuxuan-xue.com/nsf. + +
+
+ comment: Accpted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf +
+
+
+
+
+ + ♻ ☆ Collaborative Perception in Autonomous Driving: Methods, Datasets and + Challenges + + +
+ Collaborative perception is essential to address occlusion and sensor failure +issues in autonomous driving. In recent years, theoretical and experimental +investigations of novel works for collaborative perception have increased +tremendously. So far, however, few reviews have focused on systematical +collaboration modules and large-scale collaborative perception datasets. This +work reviews recent achievements in this field to bridge this gap and motivate +future research. We start with a brief overview of collaboration schemes. After +that, we systematically summarize the collaborative perception methods for +ideal scenarios and real-world issues. The former focuses on collaboration +modules and efficiency, and the latter is devoted to addressing the problems in +actual application. Furthermore, we present large-scale public datasets and +summarize quantitative results on these benchmarks. Finally, we highlight gaps +and overlook challenges between current academic research and real-world +applications. The project page is +https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving + +
+
+ comment: 18 pages, 6 figures. Accepted by IEEE Intelligent Transportation + Systems Magazine. URL: + https://github.com/CatOneTwo/Collaborative-Perception-in-Autonomous-Driving +
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their 'exposure bias' problem, described as the input mismatch between training +and sampling, lacks in-depth exploration. In this paper, we systematically +investigate the exposure bias problem in diffusion models by first analytically +modelling the sampling distribution, based on which we then attribute the +prediction error at each sampling step as the root cause of the exposure bias +issue. Furthermore, we discuss potential solutions to this issue and propose an +intuitive metric for it. Along with the elucidation of exposure bias, we +propose a simple, yet effective, training-free method called Epsilon Scaling to +alleviate the exposure bias. We show that Epsilon Scaling explicitly moves the +sampling trajectory closer to the vector field learned in the training phase by +scaling down the network output (Epsilon), mitigating the input mismatch +between training and sampling. Experiments on various diffusion frameworks +(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and +deterministic vs. stochastic sampling verify the effectiveness of our method. + +
+
+ comment: 7 pages, code available soon +
+
+
+
+
+ + ♻ ☆ Case-Aware Adversarial Training + + +
+ The neural network (NN) becomes one of the most heated type of models in +various signal processing applications. However, NNs are extremely vulnerable +to adversarial examples (AEs). To defend AEs, adversarial training (AT) is +believed to be the most effective method while due to the intensive +computation, AT is limited to be applied in most applications. In this paper, +to resolve the problem, we design a generic and efficient AT improvement +scheme, namely case-aware adversarial training (CAT). Specifically, the +intuition stems from the fact that a very limited part of informative samples +can contribute to most of model performance. Alternatively, if only the most +informative AEs are used in AT, we can lower the computation complexity of AT +significantly as maintaining the defense effect. To achieve this, CAT achieves +two breakthroughs. First, a method to estimate the information degree of +adversarial examples is proposed for AE filtering. Second, to further enrich +the information that the NN can obtain from AEs, CAT involves a weight +estimation and class-level balancing based sampling strategy to increase the +diversity of AT at each iteration. Extensive experiments show that CAT is +faster than vanilla AT by up to 3x while achieving competitive defense effect. + +
+
+
+
+
+ + ♻ ☆ Tranfer Learning of Semantic Segmentation Methods for Identifying Buried + Archaeological Structures on LiDAR Data + + +
+ When applying deep learning to remote sensing data in archaeological +research, a notable obstacle is the limited availability of suitable datasets +for training models. The application of transfer learning is frequently +employed to mitigate this drawback. However, there is still a need to explore +its effectiveness when applied across different archaeological datasets. This +paper compares the performance of various transfer learning configurations +using two semantic segmentation deep neural networks on two LiDAR datasets. The +experimental results indicate that transfer learning-based approaches in +archaeology can lead to performance improvements, although a systematic +enhancement has not yet been observed. We provide specific insights about the +validity of such techniques that can serve as a baseline for future works. + +
+
+ comment: Accepted to IEEE International Geoscience and Remote Sensing + Symposium 2023 (IGARSS 2023) @IEEE copyright +
+
+
+
+
+ + ♻ ☆ Is Complexity Required for Neural Network Pruning? A Case Study on + Global Magnitude Pruning + + +
+ Pruning neural networks has become popular in the last decade when it was +shown that a large number of weights can be safely removed from modern neural +networks without compromising accuracy. Numerous pruning methods have been +proposed since then, each claiming to be better than the previous. Many +state-of-the-art (SOTA) techniques today rely on complex pruning methodologies +utilizing importance scores, getting feedback through back-propagation or +having heuristics-based pruning rules amongst others. In this work, we question +whether this pattern of introducing complexity is really necessary to achieve +better pruning results. We benchmark these SOTA techniques against a naive +pruning baseline, namely, Global Magnitude Pruning (Global MP). Global MP ranks +weights in order of their magnitudes and prunes the smallest ones. Hence, in +its vanilla form, it is one of the simplest pruning techniques. Surprisingly, +we find that vanilla Global MP outperforms all the other SOTA techniques and +achieves a new SOTA result. It also achieves promising performance on FLOPs +sparsification, which we find is enhanced, when pruning is conducted in a +gradual fashion. We also find that Global MP is generalizable across tasks, +datasets, and models with superior performance. Moreover, a common issue that +many pruning algorithms run into at high sparsity rates, namely, +layer-collapse, can be easily fixed in Global MP by setting a minimum threshold +of weights to be retained in each layer. Lastly, unlike many other SOTA +techniques, Global MP does not require any additional algorithm specific +hyper-parameters and is very straightforward to tune and implement. We showcase +our findings on various models (WRN-28-8, ResNet-32, ResNet-50, MobileNet-V1 +and FastGRNN) and multiple datasets (CIFAR-10, ImageNet and HAR-2). Code is +available at https://github.com/manasgupta-1/GlobalMP. + +
+
+
+
+
+ + ♻ ☆ HHTrack: Hyperspectral Object Tracking Using Hybrid Attention + + +
+ Hyperspectral imagery provides abundant spectral information beyond the +visible RGB bands, offering rich discriminative details about objects in a +scene. Leveraging such data has the potential to enhance visual tracking +performance. In this paper, we propose a hyperspectral object tracker based on +hybrid attention (HHTrack). The core of HHTrack is a hyperspectral hybrid +attention (HHA) module that unifies feature extraction and fusion within one +component through token interactions. A hyperspectral bands fusion (HBF) module +is also introduced to selectively aggregate spatial and spectral signatures +from the full hyperspectral input. Extensive experiments demonstrate the +state-of-the-art performance of HHTrack on benchmark Near Infrared (NIR), Red +Near Infrared (Red-NIR), and Visible (VIS) hyperspectral tracking datasets. Our +work provides new insights into harnessing the strengths of transformers and +hyperspectral fusion to advance robust object tracking. + +
+
+
+
+
+ + ♻ ☆ Semi-supervised Semantic Segmentation with Mutual Knowledge Distillation + + +
+ Consistency regularization has been widely studied in recent semisupervised +semantic segmentation methods, and promising performance has been achieved. In +this work, we propose a new consistency regularization framework, termed mutual +knowledge distillation (MKD), combined with data and feature augmentation. We +introduce two auxiliary mean-teacher models based on consistency +regularization. More specifically, we use the pseudo-labels generated by a mean +teacher to supervise the student network to achieve a mutual knowledge +distillation between the two branches. In addition to using image-level strong +and weak augmentation, we also discuss feature augmentation. This involves +considering various sources of knowledge to distill the student network. Thus, +we can significantly increase the diversity of the training samples. +Experiments on public benchmarks show that our framework outperforms previous +state-of-the-art (SOTA) methods under various semi-supervised settings. Code is +available at semi-mmseg. + +
+
+
+
+
+ + ♻ ☆ Efficient Adaptive Ensembling for Image Classification + + +
+ In recent times, with the exception of sporadic cases, the trend in Computer +Vision is to achieve minor improvements compared to considerable increases in +complexity. + To reverse this trend, we propose a novel method to boost image +classification performances without increasing complexity. + To this end, we revisited ensembling, a powerful approach, often not used +properly due to its more complex nature and the training time, so as to make it +feasible through a specific design choice. First, we trained two +EfficientNet-b0 end-to-end models (known to be the architecture with the best +overall accuracy/complexity trade-off for image classification) on disjoint +subsets of data (i.e. bagging). Then, we made an efficient adaptive ensemble by +performing fine-tuning of a trainable combination layer. In this way, we were +able to outperform the state-of-the-art by an average of 0.5$\%$ on the +accuracy, with restrained complexity both in terms of the number of parameters +(by 5-60 times), and the FLoating point Operations Per Second (FLOPS) by 10-100 +times on several major benchmark datasets. + +
+
+
+
+
+ + ♻ ☆ Scene Matters: Model-based Deep Video Compression + + +
+ Video compression has always been a popular research area, where many +traditional and deep video compression methods have been proposed. These +methods typically rely on signal prediction theory to enhance compression +performance by designing high efficient intra and inter prediction strategies +and compressing video frames one by one. In this paper, we propose a novel +model-based video compression (MVC) framework that regards scenes as the +fundamental units for video sequences. Our proposed MVC directly models the +intensity variation of the entire video sequence in one scene, seeking +non-redundant representations instead of reducing redundancy through +spatio-temporal predictions. To achieve this, we employ implicit neural +representation as our basic modeling architecture. To improve the efficiency of +video modeling, we first propose context-related spatial positional embedding +and frequency domain supervision in spatial context enhancement. For temporal +correlation capturing, we design the scene flow constrain mechanism and +temporal contrastive loss. Extensive experimental results demonstrate that our +method achieves up to a 20\% bitrate reduction compared to the latest video +coding standard H.266 and is more efficient in decoding than existing video +coding strategies. + +
+
+
+
+
+ + ♻ ☆ SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend + 3D Talking Faces ACM MM 2023 + + +
+ Speech-driven 3D face animation technique, extending its applications to +various multimedia fields. Previous research has generated promising realistic +lip movements and facial expressions from audio signals. However, traditional +regression models solely driven by data face several essential problems, such +as difficulties in accessing precise labels and domain gaps between different +modalities, leading to unsatisfactory results lacking precision and coherence. +To enhance the visual accuracy of generated lip movement while reducing the +dependence on labeled data, we propose a novel framework SelfTalk, by involving +self-supervision in a cross-modals network system to learn 3D talking faces. +The framework constructs a network system consisting of three modules: facial +animator, speech recognizer, and lip-reading interpreter. The core of SelfTalk +is a commutative training diagram that facilitates compatible features exchange +among audio, text, and lip shape, enabling our models to learn the intricate +connection between these factors. The proposed framework leverages the +knowledge learned from the lip-reading interpreter to generate more plausible +lip shapes. Extensive experiments and user studies demonstrate that our +proposed approach achieves state-of-the-art performance both qualitatively and +quantitatively. We recommend watching the supplementary video. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Estimating 3D Dental Structures using Simulated Panoramic Radiographs + and Neural Ray Tracing + + +
+ Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality +for dental examination. However, PX only provides a flattened 2D image, lacking +in a 3D view of the oral structure. In this paper, we propose a framework to +estimate 3D oral structures from real-world PX. Our framework tackles full 3D +reconstruction for varying subjects (patients) where each reconstruction is +based only on a single panoramic image. We create an intermediate +representation called simulated PX (SimPX) from 3D Cone-beam computed +tomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and +rotational principles of PX imaging. SimPX aims at not only truthfully +simulating PX, but also facilitates the reverting process back to 3D data. We +propose a novel neural model based on ray tracing which exploits both global +and local input features to convert SimPX to 3D output. At inference, a real PX +image is translated to a SimPX-style image with semantic regularization, and +the translated image is processed by generation module to produce high-quality +outputs. Experiments show that our method outperforms prior state-of-the-art in +reconstruction tasks both quantitatively and qualitatively. Unlike prior +methods, Our method does not require any prior information such as the shape of +dental arches, nor the matched PX-CBCT dataset for training, which is difficult +to obtain in clinical practice. + +
+
+ comment: 20 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Pre-trained transformer for adversarial purification + + +
+ With more and more deep neural networks being deployed as various daily +services, their reliability is essential. It's frightening that deep neural +networks are vulnerable and sensitive to adversarial attacks, the most common +one of which for the services is evasion-based. Recent works usually strengthen +the robustness by adversarial training or leveraging the knowledge of an amount +of clean data. However, in practical terms, retraining and redeploying the +model need a large computational budget, leading to heavy losses to the online +service. In addition, when adversarial examples of a certain attack are +detected, only limited adversarial examples are available for the service +provider, while much clean data may not be accessible. Given the mentioned +problems, we propose a new scenario, RaPiD (Rapid Plug-in Defender), which is +to rapidly defend against a certain attack for the frozen original service +model with limitations of few clean and adversarial examples. Motivated by the +generalization and the universal computation ability of pre-trained transformer +models, we come up with a new defender method, CeTaD, which stands for +Considering Pre-trained Transformers as Defenders. In particular, we evaluate +the effectiveness and the transferability of CeTaD in the case of one-shot +adversarial examples and explore the impact of different parts of CeTaD as well +as training data conditions. CeTaD is flexible, able to be embedded into an +arbitrary differentiable model, and suitable for various types of attacks. + +
+
+
+
+
+ + ♻ ☆ Human Motion Diffusion as a Generative Prior + + +
+ Recent work has demonstrated the significant potential of denoising diffusion +models for generating human motion, including text-to-motion capabilities. +However, these methods are restricted by the paucity of annotated motion data, +a focus on single-person motions, and a lack of detailed control. In this +paper, we introduce three forms of composition based on diffusion priors: +sequential, parallel, and model composition. Using sequential composition, we +tackle the challenge of long sequence generation. We introduce DoubleTake, an +inference-time method with which we generate long animations consisting of +sequences of prompted intervals and their transitions, using a prior trained +only for short clips. Using parallel composition, we show promising steps +toward two-person generation. Beginning with two fixed priors as well as a few +two-person training examples, we learn a slim communication block, ComMDM, to +coordinate interaction between the two resulting motions. Lastly, using model +composition, we first train individual priors to complete motions that realize +a prescribed motion for a given joint. We then introduce DiffusionBlending, an +interpolation mechanism to effectively blend several such models to enable +flexible and efficient fine-grained joint and trajectory-level control and +editing. We evaluate the composition methods using an off-the-shelf motion +diffusion model, and further compare the results to dedicated models trained +for these specific tasks. + +
+
+
+
+
+ + ♻ ☆ FineDance: A Fine-grained Choreography Dataset for 3D Full Body Dance + Generation ICCV 2023 + + +
+ Generating full-body and multi-genre dance sequences from given music is a +challenging task, due to the limitations of existing datasets and the inherent +complexity of the fine-grained hand motion and dance genres. To address these +problems, we propose FineDance, which contains 14.6 hours of music-dance paired +data, with fine-grained hand motions, fine-grained genres (22 dance genres), +and accurate posture. To the best of our knowledge, FineDance is the largest +music-dance paired dataset with the most dance genres. Additionally, to address +monotonous and unnatural hand movements existing in previous methods, we +propose a full-body dance generation network, which utilizes the diverse +generation capabilities of the diffusion model to solve monotonous problems, +and use expert nets to solve unreal problems. To further enhance the +genre-matching and long-term stability of generated dances, we propose a +Genre&Coherent aware Retrieval Module. Besides, we propose a novel metric named +Genre Matching Score to evaluate the genre-matching degree between dance and +music. Quantitative and qualitative experiments demonstrate the quality of +FineDance, and the state-of-the-art performance of FineNet. The FineDance +Dataset and more qualitative samples can be found at our website. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ iWarpGAN: Disentangling Identity and Style to Generate Synthetic Iris + Images + + +
+ Generative Adversarial Networks (GANs) have shown success in approximating +complex distributions for synthetic image generation. However, current +GAN-based methods for generating biometric images, such as iris, have certain +limitations: (a) the synthetic images often closely resemble images in the +training dataset; (b) the generated images lack diversity in terms of the +number of unique identities represented in them; and (c) it is difficult to +generate multiple images pertaining to the same identity. To overcome these +issues, we propose iWarpGAN that disentangles identity and style in the context +of the iris modality by using two transformation pathways: Identity +Transformation Pathway to generate unique identities from the training set, and +Style Transformation Pathway to extract the style code from a reference image +and output an iris image using this style. By concatenating the transformed +identity code and reference style code, iWarpGAN generates iris images with +both inter- and intra-class variations. The efficacy of the proposed method in +generating such iris DeepFakes is evaluated both qualitatively and +quantitatively using ISO/IEC 29794-6 Standard Quality Metrics and the VeriEye +iris matcher. Further, the utility of the synthetically generated images is +demonstrated by improving the performance of deep learning based iris matchers +that augment synthetic data with real data during the training process. + +
+
+
+
+
+ + ♻ ☆ EvHandPose: Event-based 3D Hand Pose Estimation with Sparse Supervision + + +
+ Event camera shows great potential in 3D hand pose estimation, especially +addressing the challenges of fast motion and high dynamic range in a low-power +way. However, due to the asynchronous differential imaging mechanism, it is +challenging to design event representation to encode hand motion information +especially when the hands are not moving (causing motion ambiguity), and it is +infeasible to fully annotate the temporally dense event stream. In this paper, +we propose EvHandPose with novel hand flow representations in Event-to-Pose +module for accurate hand pose estimation and alleviating the motion ambiguity +issue. To solve the problem under sparse annotation, we design contrast +maximization and hand-edge constraints in Pose-to-IWE (Image with Warped +Events) module and formulate EvHandPose in a weakly-supervision framework. We +further build EvRealHands, the first large-scale real-world event-based hand +pose dataset on several challenging scenes to bridge the real-synthetic domain +gap. Experiments on EvRealHands demonstrate that EvHandPose outperforms +previous event-based methods under all evaluation scenes, achieves accurate and +stable hand pose estimation with high temporal resolution in fast motion and +strong light scenes compared with RGB-based methods, generalizes well to +outdoor scenes and another type of event camera, and shows the potential for +the hand gesture recognition task. + +
+
+
+
+
+ + ♻ ☆ WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant + Analysis ICCV 2023 + + +
+ Deep neural networks are susceptible to generating overconfident yet +erroneous predictions when presented with data beyond known concepts. This +challenge underscores the importance of detecting out-of-distribution (OOD) +samples in the open world. In this work, we propose a novel feature-space OOD +detection score based on class-specific and class-agnostic information. +Specifically, the approach utilizes Whitened Linear Discriminant Analysis to +project features into two subspaces - the discriminative and residual subspaces +- for which the in-distribution (ID) classes are maximally separated and +closely clustered, respectively. The OOD score is then determined by combining +the deviation from the input data to the ID pattern in both subspaces. The +efficacy of our method, named WDiscOOD, is verified on the large-scale +ImageNet-1k benchmark, with six OOD datasets that cover a variety of +distribution shifts. WDiscOOD demonstrates superior performance on deep +classifiers with diverse backbone architectures, including CNN and vision +transformer. Furthermore, we also show that WDiscOOD more effectively detects +novel concepts in representation spaces trained with contrastive objectives, +including supervised contrastive loss and multi-modality contrastive loss. + +
+
+ comment: Accepted by ICCV 2023. Code is available at: + https://github.com/ivalab/WDiscOOD.git +
+
+
+
+
+ + ♻ ☆ Generalized Universal Domain Adaptation with Generative Flow Networks + + +
+ We introduce a new problem in unsupervised domain adaptation, termed as +Generalized Universal Domain Adaptation (GUDA), which aims to achieve precise +prediction of all target labels including unknown categories. GUDA bridges the +gap between label distribution shift-based and label space mismatch-based +variants, essentially categorizing them as a unified problem, guiding to a +comprehensive framework for thoroughly solving all the variants. The key +challenge of GUDA is developing and identifying novel target categories while +estimating the target label distribution. To address this problem, we take +advantage of the powerful exploration capability of generative flow networks +and propose an active domain adaptation algorithm named GFlowDA, which selects +diverse samples with probabilities proportional to a reward function. To +enhance the exploration capability and effectively perceive the target label +distribution, we tailor the states and rewards, and introduce an efficient +solution for parent exploration and state transition. We also propose a +training paradigm for GUDA called Generalized Universal Adversarial Network +(GUAN), which involves collaborative optimization between GUAN and GFlowNet. +Theoretical analysis highlights the importance of exploration, and extensive +experiments on benchmark datasets demonstrate the superiority of GFlowDA. + +
+
+
+
+
+ + ♻ ☆ Universal Domain Adaptation via Compressive Attention Matching + + +
+ Universal domain adaptation (UniDA) aims to transfer knowledge from the +source domain to the target domain without any prior knowledge about the label +set. The challenge lies in how to determine whether the target samples belong +to common categories. The mainstream methods make judgments based on the sample +features, which overemphasizes global information while ignoring the most +crucial local objects in the image, resulting in limited accuracy. To address +this issue, we propose a Universal Attention Matching (UniAM) framework by +exploiting the self-attention mechanism in vision transformer to capture the +crucial object information. The proposed framework introduces a novel +Compressive Attention Matching (CAM) approach to explore the core information +by compressively representing attentions. Furthermore, CAM incorporates a +residual-based measurement to determine the sample commonness. By utilizing the +measurement, UniAM achieves domain-wise and category-wise Common Feature +Alignment (CFA) and Target Class Separation (TCS). Notably, UniAM is the first +method utilizing the attention in vision transformer directly to perform +classification tasks. Extensive experiments show that UniAM outperforms the +current state-of-the-art methods on various benchmark datasets. + +
+
+
+
+
+ + ♻ ☆ Exploring the Relationship between Samples and Masks for Robust Defect + Localization + + +
+ Defect detection aims to detect and localize regions out of the normal +distribution.Previous approaches model normality and compare it with the input +to identify defective regions, potentially limiting their generalizability.This +paper proposes a one-stage framework that detects defective patterns directly +without the modeling process.This ability is adopted through the joint efforts +of three parties: a generative adversarial network (GAN), a newly proposed +scaled pattern loss, and a dynamic masked cycle-consistent auxiliary network. +Explicit information that could indicate the position of defects is +intentionally excluded to avoid learning any direct mapping.Experimental +results on the texture class of the challenging MVTec AD dataset show that the +proposed method is 2.9\% higher than the SOTA methods in F1-Score, while +substantially outperforming SOTA methods in generalizability. + +
+
+
+
+
+ + ♻ ☆ UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D + Representation for 3D Perception in Autonomous Driving + + +
+ Masked Autoencoders (MAE) play a pivotal role in learning potent +representations, delivering outstanding results across various 3D perception +tasks essential for autonomous driving. In real-world driving scenarios, it's +commonplace to deploy multiple sensors for comprehensive environment +perception. While integrating multi-modal features from these sensors can +produce rich and powerful features, there is a noticeable gap in MAE methods +addressing this integration. This research delves into multi-modal Masked +Autoencoders tailored for a unified representation space in autonomous driving, +aiming to pioneer a more efficient fusion of two distinct modalities. To +intricately marry the semantics inherent in images with the geometric +intricacies of LiDAR point clouds, the UniM$^2$AE is proposed. This model +stands as a potent yet straightforward, multi-modal self-supervised +pre-training framework, mainly consisting of two designs. First, it projects +the features from both modalities into a cohesive 3D volume space, ingeniously +expanded from the bird's eye view (BEV) to include the height dimension. The +extension makes it possible to back-project the informative features, obtained +by fusing features from both modalities, into their native modalities to +reconstruct the multiple masked inputs. Second, the Multi-modal 3D Interactive +Module (MMIM) is invoked to facilitate the efficient inter-modal interaction +during the interaction process. Extensive experiments conducted on the nuScenes +Dataset attest to the efficacy of UniM$^2$AE, indicating enhancements in 3D +object detection and BEV map segmentation by 1.2\%(NDS) and 6.5\% (mIoU), +respectively. Code is available at https://github.com/hollow-503/UniM2AE. + +
+
+ comment: Code available at https://github.com/hollow-503/UniM2AE +
+
+
+
+
+ + ♻ ☆ EA-LSS: Edge-aware Lift-splat-shot Framework for 3D BEV Object Detection + + +
+ In recent years, great progress has been made in the Lift-Splat-Shot-based +(LSS-based) 3D object detection method. However, inaccurate depth estimation +remains an important constraint to the accuracy of camera-only and multi-model +3D object detection models, especially in regions where the depth changes +significantly (i.e., the "depth jump" problem). In this paper, we proposed a +novel Edge-aware Lift-splat-shot (EA-LSS) framework. Specifically, edge-aware +depth fusion (EADF) module is proposed to alleviate the "depth jump" problem +and fine-grained depth (FGD) module to further enforce refined supervision on +depth. Our EA-LSS framework is compatible for any LSS-based 3D object detection +models, and effectively boosts their performances with negligible increment of +inference time. Experiments on nuScenes benchmarks demonstrate that EA-LSS is +effective in either camera-only or multi-model models. It is worth mentioning +that EA-LSS achieved the state-of-the-art performance on nuScenes test +benchmarks with mAP and NDS of 76.5% and 77.6%, respectively. + +
+
+
+
+
+ + ♻ ☆ Few-Shot Object Detection via Synthetic Features with Optimal Transport + + +
+ Few-shot object detection aims to simultaneously localize and classify the +objects in an image with limited training samples. However, most existing +few-shot object detection methods focus on extracting the features of a few +samples of novel classes that lack diversity. Hence, they may not be sufficient +to capture the data distribution. To address that limitation, in this paper, we +propose a novel approach in which we train a generator to generate synthetic +data for novel classes. Still, directly training a generator on the novel class +is not effective due to the lack of novel data. To overcome that issue, we +leverage the large-scale dataset of base classes. Our overarching goal is to +train a generator that captures the data variations of the base dataset. We +then transform the captured variations into novel classes by generating +synthetic data with the trained generator. To encourage the generator to +capture data variations on base classes, we propose to train the generator with +an optimal transport loss that minimizes the optimal transport distance between +the distributions of real and synthetic data. Extensive experiments on two +benchmark datasets demonstrate that the proposed method outperforms the state +of the art. Source code will be available. + +
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can +arise from extensive unsupervised training data. Such biases can produce +suboptimal samples, skewed outcomes, and unfairness, with potentially serious +consequences. Consequently, aligning these models with human ethics and +preferences is an essential step toward ensuring their responsible and +effective deployment in real-world applications. Prior research has primarily +employed Reinforcement Learning from Human Feedback (RLHF) to address this +problem, where generative models are fine-tuned with RL algorithms guided by a +human-feedback-informed reward model. However, the inefficiencies and +instabilities associated with RL algorithms frequently present substantial +obstacles to the successful alignment, necessitating the development of a more +robust and streamlined approach. To this end, we introduce a new framework, +Reward rAnked FineTuning (RAFT), designed to align generative models +effectively. Utilizing a reward model and a sufficient number of samples, our +approach selects the high-quality samples, discarding those that exhibit +undesired behavior, and subsequently enhancing the model by fine-tuning on +these filtered samples. Our studies show that RAFT can effectively improve the +model performance in both reward learning and other automated metrics in both +large language models and diffusion models. + +
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Adaptive Multi-Modalities Fusion in Sequential Recommendation Systems CIKM'2023 + + +
+ In sequential recommendation, multi-modal information (e.g., text or image) +can provide a more comprehensive view of an item's profile. The optimal stage +(early or late) to fuse modality features into item representations is still +debated. We propose a graph-based approach (named MMSR) to fuse modality +features in an adaptive order, enabling each modality to prioritize either its +inherent sequential nature or its interplay with other modalities. MMSR +represents each user's history as a graph, where the modality features of each +item in a user's history sequence are denoted by cross-linked nodes. The edges +between homogeneous nodes represent intra-modality sequential relationships, +and the ones between heterogeneous nodes represent inter-modality +interdependence relationships. During graph propagation, MMSR incorporates dual +attention, differentiating homogeneous and heterogeneous neighbors. To +adaptively assign nodes with distinct fusion orders, MMSR allows each node's +representation to be asynchronously updated through an update gate. In +scenarios where modalities exhibit stronger sequential relationships, the +update gate prioritizes updates among homogeneous nodes. Conversely, when the +interdependent relationships between modalities are more pronounced, the update +gate prioritizes updates among heterogeneous nodes. Consequently, MMSR +establishes a fusion order that spans a spectrum from early to late modality +fusion. In experiments across six datasets, MMSR consistently outperforms +state-of-the-art models, and our graph propagation methods surpass other graph +neural networks. Additionally, MMSR naturally manages missing modalities. + +
+
+ comment: CIKM'2023 +
+
+
+
+
+ + ☆ Denoising Attention for Query-aware User Modeling in Personalized Search + + +
+ The personalization of search results has gained increasing attention in the +past few years, thanks to the development of Neural Networks-based approaches +for Information Retrieval and the importance of personalization in many search +scenarios. Recent works have proposed to build user models at query time by +leveraging the Attention mechanism, which allows weighing the contribution of +the user-related information w.r.t. the current query. This approach allows +taking into account the diversity of the user's interests by giving more +importance to those related to the current search performed by the user. + In this paper, we first discuss some shortcomings of the standard Attention +formulation when employed for personalization. In particular, we focus on +issues related to its normalization mechanism and its inability to entirely +filter out noisy user-related information. Then, we introduce the Denoising +Attention mechanism: an Attention variant that directly tackles the above +shortcomings by adopting a robust normalization scheme and introducing a +filtering mechanism. The reported experimental evaluation shows the benefits of +the proposed approach over other Attention-based variants. + +
+
+
+
+
+ + ☆ DRGame: Diversified Recommendation for Multi-category Video Games with + Balanced Implicit Preferences + + +
+ The growing popularity of subscription services in video game consumption has +emphasized the importance of offering diversified recommendations. Providing +users with a diverse range of games is essential for ensuring continued +engagement and fostering long-term subscriptions. However, existing +recommendation models face challenges in effectively handling highly imbalanced +implicit feedback in gaming interactions. Additionally, they struggle to take +into account the distinctive characteristics of multiple categories and the +latent user interests associated with these categories. In response to these +challenges, we propose a novel framework, named DRGame, to obtain diversified +recommendation. It is centered on multi-category video games, consisting of two +{components}: Balance-driven Implicit Preferences Learning for data +pre-processing and Clustering-based Diversified Recommendation {Module} for +final prediction. The first module aims to achieve a balanced representation of +implicit feedback in game time, thereby discovering a comprehensive view of +player interests across different categories. The second module adopts +category-aware representation learning to cluster and select players and games +based on balanced implicit preferences, and then employs asymmetric neighbor +aggregation to achieve diversified recommendations. Experimental results on a +real-world dataset demonstrate the superiority of our proposed method over +existing approaches in terms of game diversity recommendations. + +
+
+
+
+
+ + ☆ Knowledge-grounded Natural Language Recommendation Explanation + + +
+ Explanations accompanied by a recommendation can assist users in +understanding the decision made by recommendation systems, which in turn +increases a user's confidence and trust in the system. Recently, research has +focused on generating natural language explanations in a human-readable format. +Thus far, the proposed approaches leverage item reviews written by users, which +are often subjective, sparse in language, and unable to account for new items +that have not been purchased or reviewed before. Instead, we aim to generate +fact-grounded recommendation explanations that are objectively described with +item features while implicitly considering a user's preferences, based on the +user's purchase history. To achieve this, we propose a knowledge graph (KG) +approach to natural language explainable recommendation. Our approach draws on +user-item features through a novel collaborative filtering-based KG +representation to produce fact-grounded, personalized explanations, while +jointly learning user-item representations for recommendation scoring. +Experimental results show that our approach consistently outperforms previous +state-of-the-art models on natural language explainable recommendation. + +
+
+
+
+
+ + ☆ Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling + Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate + Prediction CIKM 2023 + + +
+ Spatial-temporal information has been proven to be of great significance for +click-through rate prediction tasks in online Location-Based Services (LBS), +especially in mainstream food ordering platforms such as DoorDash, Uber Eats, +Meituan, and Ele.me. Modeling user spatial-temporal preferences with sequential +behavior data has become a hot topic in recommendation systems and online +advertising. However, most of existing methods either lack the representation +of rich spatial-temporal information or only handle user behaviors with limited +length, e.g. 100. In this paper, we tackle these problems by designing a new +spatial-temporal modeling paradigm named Fragment and Integrate Network (FIN). +FIN consists of two networks: (i) Fragment Network (FN) extracts Multiple +Sub-Sequences (MSS) from lifelong sequential behavior data, and captures the +specific spatial-temporal representation by modeling each MSS respectively. +Here both a simplified attention and a complicated attention are adopted to +balance the performance gain and resource consumption. (ii) Integrate Network +(IN) builds a new integrated sequence by utilizing spatial-temporal interaction +on MSS and captures the comprehensive spatial-temporal representation by +modeling the integrated sequence with a complicated attention. Both public +datasets and production datasets have demonstrated the accuracy and scalability +of FIN. Since 2022, FIN has been fully deployed in the recommendation +advertising system of Ele.me, one of the most popular online food ordering +platforms in China, obtaining 5.7% improvement on Click-Through Rate (CTR) and +7.3% increase on Revenue Per Mille (RPM). + +
+
+ comment: Accepted by CIKM 2023 Applied Research Paper +
+
+
+
+
+ + ☆ A Survey on Multi-Behavior Sequential Recommendation + + +
+ Recommender systems is set up to address the issue of information overload in +traditional information retrieval systems, which is focused on recommending +information that is of most interest to users from massive information. +Generally, there is a sequential nature and heterogeneity to the behavior of a +person interacting with a system, leading to the proposal of multi-behavior +sequential recommendation (MBSR). MBSR is a relatively new and worthy direction +for in-depth research, which can achieve state-of-the-art recommendation +through suitable modeling, and some related works have been proposed. This +survey aims to shed light on the MBSR problem. Firstly, we introduce MBSR in +detail, including its problem definition, application scenarios and challenges +faced. Secondly, we detail the classification of MBSR, including +neighborhood-based methods, matrix factorization-based methods and deep +learning-based methods, where we further classify the deep learning-based +methods into different learning architectures based on RNN, GNN, Transformer, +and generic architectures as well as architectures that integrate hybrid +techniques. In each method, we present related works based on the data +perspective and the modeling perspective, as well as analyze the strengths, +weaknesses and features of these works. Finally, we discuss some promising +future research directions to address the challenges and improve the current +status of MBSR. + +
+
+
+
+
+ + ♻ ☆ On the Consistency of Average Embeddings for Item Recommendation RecSys 2023 + + +
+ A prevalent practice in recommender systems consists in averaging item +embeddings to represent users or higher-level concepts in the same embedding +space. This paper investigates the relevance of such a practice. For this +purpose, we propose an expected precision score, designed to measure the +consistency of an average embedding relative to the items used for its +construction. We subsequently analyze the mathematical expression of this score +in a theoretical setting with specific assumptions, as well as its empirical +behavior on real-world data from music streaming services. Our results +emphasize that real-world averages are less consistent for recommendation, +which paves the way for future research to better align real-world embeddings +with assumptions from our theoretical setting. + +
+
+ comment: 17th ACM Conference on Recommender Systems (RecSys 2023) +
+
+
+
+
+ + ♻ ☆ Large Language Models are not Fair Evaluators + + +
+ In this paper, we uncover a systematic bias in the evaluation paradigm of +adopting large language models~(LLMs), e.g., GPT-4, as a referee to score and +compare the quality of responses generated by candidate models. We find that +the quality ranking of candidate responses can be easily hacked by simply +altering their order of appearance in the context. This manipulation allows us +to skew the evaluation result, making one model appear considerably superior to +the other, e.g., Vicuna-13B could beat ChatGPT on 66 over 80 tested queries +with ChatGPT as an evaluator. To address this issue, we propose a calibration +framework with three simple yet effective strategies: 1) Multiple Evidence +Calibration, which requires the evaluator model to generate multiple evaluation +evidence before assigning ratings; 2) Balanced Position Calibration, which +aggregates results across various orders to determine the final score; 3) +Human-in-the-Loop Calibration, which introduces a balanced position diversity +entropy to measure the difficulty of each example and seeks human assistance +when needed. We also manually annotate the "win/tie/lose" outcomes of responses +from ChatGPT and Vicuna-13B in the Vicuna Benchmark's question prompt, and +extensive experiments demonstrate that our approach successfully mitigates +evaluation bias, resulting in closer alignment with human judgments. We release +our code and human annotation at \url{https://github.com/i-Eval/FairEval} to +facilitate future research. + +
+
+
+
+
+ + ♻ ☆ ONCE: Boosting Content-based Recommendation with Both Open- and + Closed-source Large Language Models + + +
+ Personalized content-based recommender systems have become indispensable +tools for users to navigate through the vast amount of content available on +platforms like daily news websites and book recommendation services. However, +existing recommenders face significant challenges in understanding the content +of items. Large language models (LLMs), which possess deep semantic +comprehension and extensive knowledge from pretraining, have proven to be +effective in various natural language processing tasks. In this study, we +explore the potential of leveraging both open- and closed-source LLMs to +enhance content-based recommendation. With open-source LLMs, we utilize their +deep layers as content encoders, enriching the representation of content at the +embedding level. For closed-source LLMs, we employ prompting techniques to +enrich the training data at the token level. Through comprehensive experiments, +we demonstrate the high effectiveness of both types of LLMs and show the +synergistic relationship between them. Notably, we observed a significant +relative improvement of up to 19.32% compared to existing state-of-the-art +recommendation models. These findings highlight the immense potential of both +open- and closed-source of LLMs in enhancing content-based recommendation +systems. We will make our code and LLM-generated data available for other +researchers to reproduce our results. + +
+
+
+
+
+ + ♻ ☆ Voucher Abuse Detection with Prompt-based Fine-tuning on Graph Neural + Networks CIKM23 + + +
+ Voucher abuse detection is an important anomaly detection problem in +E-commerce. While many GNN-based solutions have emerged, the supervised +paradigm depends on a large quantity of labeled data. A popular alternative is +to adopt self-supervised pre-training using label-free data, and further +fine-tune on a downstream task with limited labels. Nevertheless, the +"pre-train, fine-tune" paradigm is often plagued by the objective gap between +pre-training and downstream tasks. Hence, we propose VPGNN, a prompt-based +fine-tuning framework on GNNs for voucher abuse detection. We design a novel +graph prompting function to reformulate the downstream task into a similar +template as the pretext task in pre-training, thereby narrowing the objective +gap. Extensive experiments on both proprietary and public datasets demonstrate +the strength of VPGNN in both few-shot and semi-supervised scenarios. Moreover, +an online deployment of VPGNN in a production environment shows a 23.4% +improvement over two existing deployed models. + +
+
+ comment: 7 pages, Accepted by CIKM23 Applied Research Track +
+
+
+
+
+
+
+
+ + Machine Learning 98 + +
+
+
+ + ☆ Algebraic, Topological, and Mereological Foundations of Existential + Granules + + +
+ In this research, new concepts of existential granules that determine +themselves are invented, and are characterized from algebraic, topological, and +mereological perspectives. Existential granules are those that determine +themselves initially, and interact with their environment subsequently. +Examples of the concept, such as those of granular balls, though inadequately +defined, algorithmically established, and insufficiently theorized in earlier +works by others, are already used in applications of rough sets and soft +computing. It is shown that they fit into multiple theoretical frameworks +(axiomatic, adaptive, and others) of granular computing. The characterization +is intended for algorithm development, application to classification problems +and possible mathematical foundations of generalizations of the approach. +Additionally, many open problems are posed and directions provided. + +
+
+ comment: 15 Pages +
+
+
+
+
+ + ☆ Modality Cycles with Masked Conditional Diffusion for Unsupervised + Anomaly Segmentation in MRI MICCAI + 2023 + + +
+ Unsupervised anomaly segmentation aims to detect patterns that are distinct +from any patterns processed during training, commonly called abnormal or +out-of-distribution patterns, without providing any associated manual +segmentations. Since anomalies during deployment can lead to model failure, +detecting the anomaly can enhance the reliability of models, which is valuable +in high-risk domains like medical imaging. This paper introduces Masked +Modality Cycles with Conditional Diffusion (MMCCD), a method that enables +segmentation of anomalies across diverse patterns in multimodal MRI. The method +is based on two fundamental ideas. First, we propose the use of cyclic modality +translation as a mechanism for enabling abnormality detection. +Image-translation models learn tissue-specific modality mappings, which are +characteristic of tissue physiology. Thus, these learned mappings fail to +translate tissues or image patterns that have never been encountered during +training, and the error enables their segmentation. Furthermore, we combine +image translation with a masked conditional diffusion model, which attempts to +`imagine' what tissue exists under a masked area, further exposing unknown +patterns as the generative model fails to recreate them. We evaluate our method +on a proxy task by training on healthy-looking slices of BraTS2021 +multi-modality MRIs and testing on slices with tumors. We show that our method +compares favorably to previous unsupervised approaches based on image +reconstruction and denoising with autoencoders and diffusion models. + +
+
+ comment: Accepted in Multiscale Multimodal Medical Imaging workshop in MICCAI + 2023 +
+
+
+
+
+ + ☆ Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open + Generative Large Language Models + + +
+ We introduce Jais and Jais-chat, new state-of-the-art Arabic-centric +foundation and instruction-tuned open generative large language models (LLMs). +The models are based on the GPT-3 decoder-only architecture and are pretrained +on a mixture of Arabic and English texts, including source code in various +programming languages. With 13 billion parameters, they demonstrate better +knowledge and reasoning capabilities in Arabic than any existing open Arabic +and multilingual models by a sizable margin, based on extensive evaluation. +Moreover, the models are competitive in English compared to English-centric +open models of similar size, despite being trained on much less English data. +We provide a detailed description of the training, the tuning, the safety +alignment, and the evaluation of the models. We release two open versions of +the model -- the foundation Jais model, and an instruction-tuned Jais-chat +variant -- with the aim of promoting research on Arabic LLMs. Available at +https://huggingface.co/inception-mbzuai/jais-13b-chat + +
+
+ comment: Arabic-centric, foundation model, large-language model, LLM, + generative model, instruction-tuned, Jais, Jais-chat +
+
+
+
+
+ + ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones, +organs, vessels) and 3D surgical instrument models. Prior to the deep learning +era, the broad application of statistical shape models (SSMs) in medical image +analysis is evidence that shapes have been commonly used to describe medical +data. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in +medical imaging are predominantly voxel-based. In computer vision, on the +contrary, shapes (including, voxel occupancy grids, meshes, point clouds and +implicit surface models) are preferred data representations in 3D, as seen from +the numerous shape-related publications in premier vision conferences, such as +the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as +well as the increasing popularity of ShapeNet (about 51,300 models) and +Princeton ModelNet (127,915 models) in computer vision research. MedShapeNet is +created as an alternative to these commonly used shape benchmarks to facilitate +the translation of data-driven vision algorithms to medical applications, and +it extends the opportunities to adapt SOTA vision algorithms to solve critical +medical problems. Besides, the majority of the medical shapes in MedShapeNet +are modeled directly on the imaging data of real patients, and therefore it +complements well existing shape benchmarks comprising of computer-aided design +(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes, +and provides annotations in the form of paired data. It is therefore also a +freely available repository of 3D models for extended reality (virtual reality +- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This +white paper describes in detail the motivations behind MedShapeNet, the shape +acquisition procedures, the use cases, as well as the usage of the online shape +search portal: https://medshapenet.ikim.nrw/ + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Spatial Graph Coarsening: Weather and Weekday Prediction with London's + Bike-Sharing Service using GNN + + +
+ This study introduced the use of Graph Neural Network (GNN) for predicting +the weather and weekday of a day in London, from the dataset of Santander +Cycles bike-sharing system as a graph classification task. The proposed GNN +models newly introduced (i) a concatenation operator of graph features with +trained node embeddings and (ii) a graph coarsening operator based on +geographical contiguity, namely "Spatial Graph Coarsening". With the node +features of land-use characteristics and number of households around the bike +stations and graph features of temperatures in the city, our proposed models +outperformed the baseline model in cross-entropy loss and accuracy of the +validation dataset. + +
+
+
+
+
+ + ☆ survex: an R package for explaining machine learning survival models + + +
+ Due to their flexibility and superior performance, machine learning models +frequently complement and outperform traditional statistical survival models. +However, their widespread adoption is hindered by a lack of user-friendly tools +to explain their internal operations and prediction rationales. To tackle this +issue, we introduce the survex R package, which provides a cohesive framework +for explaining any survival model by applying explainable artificial +intelligence techniques. The capabilities of the proposed software encompass +understanding and diagnosing survival models, which can lead to their +improvement. By revealing insights into the decision-making process, such as +variable effects and importances, survex enables the assessment of model +reliability and the detection of biases. Thus, transparency and responsibility +may be promoted in sensitive areas, such as biomedical research and healthcare +applications. + +
+
+
+
+
+ + ☆ Advanced Deep Regression Models for Forecasting Time Series Oil + Production + + +
+ Global oil demand is rapidly increasing and is expected to reach 106.3 +million barrels per day by 2040. Thus, it is vital for hydrocarbon extraction +industries to forecast their production to optimize their operations and avoid +losses. Big companies have realized that exploiting the power of deep learning +(DL) and the massive amount of data from various oil wells for this purpose can +save a lot of operational costs and reduce unwanted environmental impacts. In +this direction, researchers have proposed models using conventional machine +learning (ML) techniques for oil production forecasting. However, these +techniques are inappropriate for this problem as they can not capture +historical patterns found in time series data, resulting in inaccurate +predictions. This research aims to overcome these issues by developing advanced +data-driven regression models using sequential convolutions and long short-term +memory (LSTM) units. Exhaustive analyses are conducted to select the optimal +sequence length, model hyperparameters, and cross-well dataset formation to +build highly generalized robust models. A comprehensive experimental study on +Volve oilfield data validates the proposed models. It reveals that the +LSTM-based sequence learning model can predict oil production better than the +1-D convolutional neural network (CNN) with mean absolute error (MAE) and R2 +score of 111.16 and 0.98, respectively. It is also found that the LSTM-based +model performs better than all the existing state-of-the-art solutions and +achieves a 37% improvement compared to a standard linear regression, which is +considered the baseline model in this work. + +
+
+
+
+
+ + ☆ Application of Zone Method based Machine Learning and Physics-Informed + Neural Networks in Reheating Furnaces + + +
+ Despite the high economic relevance of Foundation Industries, certain +components like Reheating furnaces within their manufacturing chain are +energy-intensive. Notable energy consumption reduction could be obtained by +reducing the overall heating time in furnaces. Computer-integrated Machine +Learning (ML) and Artificial Intelligence (AI) powered control systems in +furnaces could be enablers in achieving the Net-Zero goals in Foundation +Industries for sustainable manufacturing. + In this work, due to the infeasibility of achieving good quality data in +scenarios like reheating furnaces, classical Hottel's zone method based +computational model has been used to generate data for ML and Deep Learning +(DL) based model training via regression. It should be noted that the zone +method provides an elegant way to model the physical phenomenon of Radiative +Heat Transfer (RHT), the dominating heat transfer mechanism in high-temperature +processes inside heating furnaces. Using this data, an extensive comparison +among a wide range of state-of-the-art, representative ML and DL methods has +been made against their temperature prediction performances in varying furnace +environments. Owing to their holistic balance among inference times and model +performance, DL stands out among its counterparts. To further enhance the +Out-Of-Distribution (OOD) generalization capability of the trained DL models, +we propose a Physics-Informed Neural Network (PINN) by incorporating prior +physical knowledge using a set of novel Energy-Balance regularizers. Our setup +is a generic framework, is geometry-agnostic of the 3D structure of the +underlying furnace, and as such could accommodate any standard ML regression +model, to serve as a Digital Twin of the underlying physical processes, for +transitioning Foundation Industries towards Industry 4.0. + +
+
+
+
+
+ + ☆ Consensus of state of the art mortality prediction models: From + all-cause mortality to sudden death prediction + + +
+ Worldwide, many millions of people die suddenly and unexpectedly each year, +either with or without a prior history of cardiovascular disease. Such events +are sparse (once in a lifetime), many victims will not have had prior +investigations for cardiac disease and many different definitions of sudden +death exist. Accordingly, sudden death is hard to predict. + This analysis used NHS Electronic Health Records (EHRs) for people aged +$\geq$50 years living in the Greater Glasgow and Clyde (GG\&C) region in 2010 +(n = 380,000) to try to overcome these challenges. We investigated whether +medical history, blood tests, prescription of medicines, and hospitalisations +might, in combination, predict a heightened risk of sudden death. + We compared the performance of models trained to predict either sudden death +or all-cause mortality. We built six models for each outcome of interest: three +taken from state-of-the-art research (BEHRT, Deepr and Deep Patient), and three +of our own creation. We trained these using two different data representations: +a language-based representation, and a sparse temporal matrix. + We used global interpretability to understand the most important features of +each model, and compare how much agreement there was amongst models using Rank +Biased Overlap. It is challenging to account for correlated variables without +increasing the complexity of the interpretability technique. We overcame this +by clustering features into groups and comparing the most important groups for +each model. We found the agreement between models to be much higher when +accounting for correlated variables. + Our analysis emphasises the challenge of predicting sudden death and +emphasises the need for better understanding and interpretation of machine +learning models applied to healthcare applications. + +
+
+
+
+
+ + ☆ Conti Inc.: Understanding the Internal Discussions of a large + Ransomware-as-a-Service Operator with Machine Learning + + +
+ Ransomware-as-a-service (RaaS) is increasing the scale and complexity of +ransomware attacks. Understanding the internal operations behind RaaS has been +a challenge due to the illegality of such activities. The recent chat leak of +the Conti RaaS operator, one of the most infamous ransomware operators on the +international scene, offers a key opportunity to better understand the inner +workings of such organizations. This paper analyzes the main topic discussions +in the Conti chat leak using machine learning techniques such as Natural +Language Processing (NLP) and Latent Dirichlet Allocation (LDA), as well as +visualization strategies. Five discussion topics are found: 1) Business, 2) +Technical, 3) Internal tasking/Management, 4) Malware, and 5) Customer +Service/Problem Solving. Moreover, the distribution of topics among Conti +members shows that only 4% of individuals have specialized discussions while +almost all individuals (96%) are all-rounders, meaning that their discussions +revolve around the five topics. The results also indicate that a significant +proportion of Conti discussions are non-tech related. This study thus +highlights that running such large RaaS operations requires a workforce skilled +beyond technical abilities, with individuals involved in various tasks, from +management to customer service or problem solving. The discussion topics also +show that the organization behind the Conti RaaS oper5086933ator shares +similarities with a large firm. We conclude that, although RaaS represents an +example of specialization in the cybercrime industry, only a few members are +specialized in one topic, while the rest runs and coordinates the RaaS +operation. + +
+
+
+
+
+ + ☆ A Parameter-Free Two-Bit Covariance Estimator with Improved Operator + Norm Error Rate + + +
+ A covariance matrix estimator using two bits per entry was recently developed +by Dirksen, Maly and Rauhut [Annals of Statistics, 50(6), pp. 3538-3562]. The +estimator achieves near minimax rate for general sub-Gaussian distributions, +but also suffers from two downsides: theoretically, there is an essential gap +on operator norm error between their estimator and sample covariance when the +diagonal of the covariance matrix is dominated by only a few entries; +practically, its performance heavily relies on the dithering scale, which needs +to be tuned according to some unknown parameters. In this work, we propose a +new 2-bit covariance matrix estimator that simultaneously addresses both +issues. Unlike the sign quantizer associated with uniform dither in Dirksen et +al., we adopt a triangular dither prior to a 2-bit quantizer inspired by the +multi-bit uniform quantizer. By employing dithering scales varying across +entries, our estimator enjoys an improved operator norm error rate that depends +on the effective rank of the underlying covariance matrix rather than the +ambient dimension, thus closing the theoretical gap. Moreover, our proposed +method eliminates the need of any tuning parameter, as the dithering scales are +entirely determined by the data. Experimental results under Gaussian samples +are provided to showcase the impressive numerical performance of our estimator. +Remarkably, by halving the dithering scales, our estimator oftentimes achieves +operator norm errors less than twice of the errors of sample covariance. + +
+
+ comment: 24 pages, 2 figures +
+
+
+
+
+ + ☆ Low-Rank Multitask Learning based on Tensorized SVMs and LSSVMs + + +
+ Multitask learning (MTL) leverages task-relatedness to enhance performance. +With the emergence of multimodal data, tasks can now be referenced by multiple +indices. In this paper, we employ high-order tensors, with each mode +corresponding to a task index, to naturally represent tasks referenced by +multiple indices and preserve their structural relations. Based on this +representation, we propose a general framework of low-rank MTL methods with +tensorized support vector machines (SVMs) and least square support vector +machines (LSSVMs), where the CP factorization is deployed over the coefficient +tensor. Our approach allows to model the task relation through a linear +combination of shared factors weighted by task-specific factors and is +generalized to both classification and regression problems. Through the +alternating optimization scheme and the Lagrangian function, each subproblem is +transformed into a convex problem, formulated as a quadratic programming or +linear system in the dual form. In contrast to previous MTL frameworks, our +decision function in the dual induces a weighted kernel function with a +task-coupling term characterized by the similarities of the task-specific +factors, better revealing the explicit relations across tasks in MTL. +Experimental results validate the effectiveness and superiority of our proposed +methods compared to existing state-of-the-art approaches in MTL. The code of +implementation will be available at https://github.com/liujiani0216/TSVM-MTL. + +
+
+
+
+
+ + ☆ PAVI: Plate-Amortized Variational Inference + + +
+ Given observed data and a probabilistic generative model, Bayesian inference +searches for the distribution of the model's parameters that could have yielded +the data. Inference is challenging for large population studies where millions +of measurements are performed over a cohort of hundreds of subjects, resulting +in a massive parameter space. This large cardinality renders off-the-shelf +Variational Inference (VI) computationally impractical. + In this work, we design structured VI families that efficiently tackle large +population studies. Our main idea is to share the parameterization and learning +across the different i.i.d. variables in a generative model, symbolized by the +model's \textit{plates}. We name this concept \textit{plate amortization}. +Contrary to off-the-shelf stochastic VI, which slows down inference, plate +amortization results in orders of magnitude faster to train variational +distributions. + Applied to large-scale hierarchical problems, PAVI yields expressive, +parsimoniously parameterized VI with an affordable training time. This faster +convergence effectively unlocks inference in those large regimes. We illustrate +the practical utility of PAVI through a challenging Neuroimaging example +featuring 400 million latent parameters, demonstrating a significant step +towards scalable and expressive Variational Inference. + +
+
+
+
+
+ + ☆ EnsembleFollower: A Hybrid Car-Following Framework Based On + Reinforcement Learning and Hierarchical Planning + + +
+ Car-following models have made significant contributions to our understanding +of longitudinal driving behavior. However, they often exhibit limited accuracy +and flexibility, as they cannot fully capture the complexity inherent in +car-following processes, or may falter in unseen scenarios due to their +reliance on confined driving skills present in training data. It is worth +noting that each car-following model possesses its own strengths and weaknesses +depending on specific driving scenarios. Therefore, we propose +EnsembleFollower, a hierarchical planning framework for achieving advanced +human-like car-following. The EnsembleFollower framework involves a high-level +Reinforcement Learning-based agent responsible for judiciously managing +multiple low-level car-following models according to the current state, either +by selecting an appropriate low-level model to perform an action or by +allocating different weights across all low-level components. Moreover, we +propose a jerk-constrained kinematic model for more convincing car-following +simulations. We evaluate the proposed method based on real-world driving data +from the HighD dataset. The experimental results illustrate that +EnsembleFollower yields improved accuracy of human-like behavior and achieves +effectiveness in combining hybrid models, demonstrating that our proposed +framework can handle diverse car-following conditions by leveraging the +strengths of various low-level models. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ FPTQ: Fine-grained Post-Training Quantization for Large Language Models + + +
+ In the era of large-scale language models, the substantial parameter size +poses significant challenges for deployment. Being a prevalent compression +technique, quantization has emerged as the mainstream practice to tackle this +issue, which is mainly centered on two recipes W8A8 and W4A16 (i.e. weights and +activations in such bit widths). In this study, we propose a novel W4A8 +post-training quantization method for the available open-sourced LLMs, which +combines the advantages of both two recipes. Therefore, we can leverage the +benefit in the I/O utilization of 4-bit weight quantization and the +acceleration due to 8-bit matrix computation. Nevertheless, the W4A8 faces +notorious performance degradation. As a remedy, we involve layerwise activation +quantization strategies which feature a novel logarithmic equalization for most +intractable layers, and we combine them with fine-grained weight quantization. +Without whistles and bells, we eliminate the necessity for further fine-tuning +and obtain the state-of-the-art W4A8 quantized performance on BLOOM, LLaMA, and +LLaMA-2 on standard benchmarks. We confirm that the W4A8 quantization is +achievable for the deployment of large language models, fostering their +wide-spreading real-world applications. + +
+
+
+
+
+ + ☆ Learning Structure-from-Motion with Graph Attention Networks + + +
+ In this paper we tackle the problem of learning Structure-from-Motion (SfM) +through the use of graph attention networks. SfM is a classic computer vision +problem that is solved though iterative minimization of reprojection errors, +referred to as Bundle Adjustment (BA), starting from a good initialization. In +order to obtain a good enough initialization to BA, conventional methods rely +on a sequence of sub-problems (such as pairwise pose estimation, pose averaging +or triangulation) which provides an initial solution that can then be refined +using BA. In this work we replace these sub-problems by learning a model that +takes as input the 2D keypoints detected across multiple views, and outputs the +corresponding camera poses and 3D keypoint coordinates. Our model takes +advantage of graph neural networks to learn SfM-specific primitives, and we +show that it can be used for fast inference of the reconstruction for new and +unseen sequences. The experimental results show that the proposed model +outperforms competing learning-based methods, and challenges COLMAP while +having lower runtime. + +
+
+
+
+
+ + ☆ Demo: A Digital Twin of the 5G Radio Access Network for Anomaly + Detection Functionality + + +
+ Recently, the concept of digital twins (DTs) has received significant +attention within the realm of 5G/6G. This demonstration shows an innovative DT +design and implementation framework tailored toward integration within the 5G +infrastructure. The proposed DT enables near real-time anomaly detection +capability pertaining to user connectivity. It empowers the 5G system to +proactively execute decisions for resource control and connection restoration. + +
+
+ comment: 2 pages, 2 figures. This paper has been accepted by the 31st IEEE + International Conference on Network Protocols (ICNP 2023) +
+
+
+
+
+ + ☆ Jaccard-constrained dense subgraph discovery + + +
+ Finding dense subgraphs is a core problem in graph mining with many +applications in diverse domains. At the same time many real-world networks vary +over time, that is, the dataset can be represented as a sequence of graph +snapshots. Hence, it is natural to consider the question of finding dense +subgraphs in a temporal network that are allowed to vary over time to a certain +degree. In this paper, we search for dense subgraphs that have large pairwise +Jaccard similarity coefficients. More formally, given a set of graph snapshots +and a weight $\lambda$, we find a collection of dense subgraphs such that the +sum of densities of the induced subgraphs plus the sum of Jaccard indices, +weighted by $\lambda$, is maximized. We prove that this problem is NP-hard. To +discover dense subgraphs with good objective value, we present an iterative +algorithm which runs in $\mathcal{O}(n^2k^2 + m \log n + k^3 n)$ time per +single iteration, and a greedy algorithm which runs in $\mathcal{O}(n^2k^2 + m +\log n + k^3 n)$ time, where $k$ is the length of the graph sequence and $n$ +and $m$ denote number of nodes and total number of edges respectively. We show +experimentally that our algorithms are efficient, they can find ground truth in +synthetic datasets and provide interpretable results from real-world datasets. +Finally, we present a case study that shows the usefulness of our problem. + +
+
+
+
+
+ + ☆ LLaSM: Large Language and Speech Model + + +
+ Multi-modal large language models have garnered significant interest +recently. Though, most of the works focus on vision-language multi-modal models +providing strong capabilities in following vision-and-language instructions. +However, we claim that speech is also an important modality through which +humans interact with the world. Hence, it is crucial for a general-purpose +assistant to be able to follow multi-modal speech-and-language instructions. In +this work, we propose Large Language and Speech Model (LLaSM). LLaSM is an +end-to-end trained large multi-modal speech-language model with cross-modal +conversational abilities, capable of following speech-and-language +instructions. Our early experiments show that LLaSM demonstrates a more +convenient and natural way for humans to interact with artificial intelligence. +Specifically, we also release a large Speech Instruction Following dataset +LLaSM-Audio-Instructions. Code and demo are available at +https://github.com/LinkSoul-AI/LLaSM and +https://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions +dataset is available at +https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions. + +
+
+
+
+
+ + ☆ Cyclophobic Reinforcement Learning + + +
+ In environments with sparse rewards, finding a good inductive bias for +exploration is crucial to the agent's success. However, there are two competing +goals: novelty search and systematic exploration. While existing approaches +such as curiosity-driven exploration find novelty, they sometimes do not +systematically explore the whole state space, akin to depth-first-search vs +breadth-first-search. In this paper, we propose a new intrinsic reward that is +cyclophobic, i.e., it does not reward novelty, but punishes redundancy by +avoiding cycles. Augmenting the cyclophobic intrinsic reward with a sequence of +hierarchical representations based on the agent's cropped observations we are +able to achieve excellent results in the MiniGrid and MiniHack environments. +Both are particularly hard, as they require complex interactions with different +objects in order to be solved. Detailed comparisons with previous approaches +and thorough ablation studies show that our newly proposed cyclophobic +reinforcement learning is more sample efficient than other state of the art +methods in a variety of tasks. + +
+
+ comment: Published in Transactions on Machine Learning Research (08/2023) +
+
+
+
+
+ + ☆ Thermodynamic Computing via Autonomous Quantum Thermal Machines + + +
+ We develop a physics-based model for classical computation based on +autonomous quantum thermal machines. These machines consist of few interacting +quantum bits (qubits) connected to several environments at different +temperatures. Heat flows through the machine are here exploited for computing. +The process starts by setting the temperatures of the environments according to +the logical input. The machine evolves, eventually reaching a non-equilibrium +steady state, from which the output of the computation can be determined via +the temperature of an auxilliary finite-size reservoir. Such a machine, which +we term a "thermodynamic neuron", can implement any linearly-separable +function, and we discuss explicitly the cases of NOT, 3-majority and NOR gates. +In turn, we show that a network of thermodynamic neurons can perform any +desired function. We discuss the close connection between our model and +artificial neurons (perceptrons), and argue that our model provides an +alternative physics-based analogue implementation of neural networks, and more +generally a platform for thermodynamic computing. + +
+
+ comment: 12 + 4 pages. Comments welcome! +
+
+
+
+
+ + ☆ Beyond Traditional Neural Networks: Toward adding Reasoning and Learning + Capabilities through Computational Logic Techniques + + +
+ Deep Learning (DL) models have become popular for solving complex problems, +but they have limitations such as the need for high-quality training data, lack +of transparency, and robustness issues. Neuro-Symbolic AI has emerged as a +promising approach combining the strengths of neural networks and symbolic +reasoning. Symbolic knowledge injection (SKI) techniques are a popular method +to incorporate symbolic knowledge into sub-symbolic systems. This work proposes +solutions to improve the knowledge injection process and integrate elements of +ML and logic into multi-agent systems (MAS). + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ On the Potential of CLIP for Compositional Logical Reasoning + + +
+ In this paper we explore the possibility of using OpenAI's CLIP to perform +logically coherent grounded visual reasoning. To that end, we formalize our +terms and give a geometric analysis of how embeddings in CLIP's latent space +would need to be configured in order for the system to be logically coherent. +Our main conclusion is that, as usually configured, CLIP cannot perform such +reasoning. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ Towards One-Shot Learning for Text Classification using Inductive Logic + Programming + + +
+ With the ever-increasing potential of AI to perform personalised tasks, it is +becoming essential to develop new machine learning techniques which are +data-efficient and do not require hundreds or thousands of training data. In +this paper, we explore an Inductive Logic Programming approach for one-shot +text classification. In particular, we explore the framework of +Meta-Interpretive Learning (MIL), along with using common-sense background +knowledge extracted from ConceptNet. Results indicate that MIL can learn text +classification rules from a small number of training examples. Moreover, the +higher complexity of chosen examples, the higher accuracy of the outcome. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ "Would life be more interesting if I were in AI?" Answering + Counterfactuals based on Probabilistic Inductive Logic Programming + + +
+ Probabilistic logic programs are logic programs where some facts hold with a +specified probability. Here, we investigate these programs with a causal +framework that allows counterfactual queries. Learning the program structure +from observational data is usually done through heuristic search relying on +statistical tests. However, these statistical tests lack information about the +causal mechanism generating the data, which makes it unfeasible to use the +resulting programs for counterfactual reasoning. To address this, we propose a +language fragment that allows reconstructing a program from its induced +distribution. This further enables us to learn programs supporting +counterfactual queries. + +
+
+ comment: In Proceedings ICLP 2023, arXiv:2308.14898 +
+
+
+
+
+ + ☆ Minimum Width for Deep, Narrow MLP: A Diffeomorphism and the Whitney + Embedding Theorem Approach + + +
+ Recently, there has been significant attention on determining the minimum +width for the universal approximation property of deep, narrow MLPs. Among +these challenges, approximating a continuous function under the uniform norm is +important and challenging, with the gap between its lower and upper bound being +hard to narrow. In this regard, we propose a novel upper bound for the minimum +width, given by $\operatorname{max}(2d_x+1, d_y) + \alpha(\sigma)$, to achieve +uniform approximation in deep narrow MLPs, where $0\leq \alpha(\sigma)\leq 2$ +represents the constant depending on the activation function. We demonstrate +this bound through two key proofs. First, we establish that deep, narrow MLPs +with little additional width can approximate diffeomorphisms. Secondly, we +utilize the Whitney embedding theorem to show that any continuous function can +be approximated by embeddings, further decomposed into linear transformations +and diffeomorphisms. + +
+
+
+
+
+ + ☆ Domain Generalization without Excess Empirical Risk NeurIPS 2022 + + +
+ Given data from diverse sets of distinct distributions, domain generalization +aims to learn models that generalize to unseen distributions. A common approach +is designing a data-driven surrogate penalty to capture generalization and +minimize the empirical risk jointly with the penalty. We argue that a +significant failure mode of this recipe is an excess risk due to an erroneous +penalty or hardness in joint optimization. We present an approach that +eliminates this problem. Instead of jointly minimizing empirical risk with the +penalty, we minimize the penalty under the constraint of optimality of the +empirical risk. This change guarantees that the domain generalization penalty +cannot impair optimization of the empirical risk, i.e., in-distribution +performance. To solve the proposed optimization problem, we demonstrate an +exciting connection to rate-distortion theory and utilize its tools to design +an efficient method. Our approach can be applied to any penalty-based domain +generalization method, and we demonstrate its effectiveness by applying it to +three examplar methods from the literature, showing significant improvements. + +
+
+ comment: Published at NeurIPS 2022 +
+
+
+
+
+ + ☆ MSGNN: Multi-scale Spatio-temporal Graph Neural Network for Epidemic + Forecasting + + +
+ Infectious disease forecasting has been a key focus and proved to be crucial +in controlling epidemic. A recent trend is to develop forecast-ing models based +on graph neural networks (GNNs). However, existing GNN-based methods suffer +from two key limitations: (1) Current models broaden receptive fields by +scaling the depth of GNNs, which is insuffi-cient to preserve the semantics of +long-range connectivity between distant but epidemic related areas. (2) +Previous approaches model epidemics within single spatial scale, while ignoring +the multi-scale epidemic pat-terns derived from different scales. To address +these deficiencies, we devise the Multi-scale Spatio-temporal Graph Neural +Network (MSGNN) based on an innovative multi-scale view. To be specific, in the +proposed MSGNN model, we first devise a novel graph learning module, which +directly captures long-range connectivity from trans-regional epidemic signals +and integrates them into a multi-scale graph. Based on the learned multi-scale +graph, we utilize a newly designed graph convolution module to exploit +multi-scale epidemic patterns. This module allows us to facilitate multi-scale +epidemic modeling by mining both scale-shared and scale-specific pat-terns. +Experimental results on forecasting new cases of COVID-19 in United State +demonstrate the superiority of our method over state-of-arts. Further analyses +and visualization also show that MSGNN offers not only accurate, but also +robust and interpretable forecasting result. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Adaptive Lasso, Transfer Lasso, and Beyond: An Asymptotic Perspective + + +
+ This paper presents a comprehensive exploration of the theoretical properties +inherent in the Adaptive Lasso and the Transfer Lasso. The Adaptive Lasso, a +well-established method, employs regularization divided by initial estimators +and is characterized by asymptotic normality and variable selection +consistency. In contrast, the recently proposed Transfer Lasso employs +regularization subtracted by initial estimators with the demonstrated capacity +to curtail non-asymptotic estimation errors. A pivotal question thus emerges: +Given the distinct ways the Adaptive Lasso and the Transfer Lasso employ +initial estimators, what benefits or drawbacks does this disparity confer upon +each method? This paper conducts a theoretical examination of the asymptotic +properties of the Transfer Lasso, thereby elucidating its differentiation from +the Adaptive Lasso. Informed by the findings of this analysis, we introduce a +novel method, one that amalgamates the strengths and compensates for the +weaknesses of both methods. The paper concludes with validations of our theory +and comparisons of the methods via simulation experiments. + +
+
+
+
+
+ + ☆ Federated Two Stage Decoupling With Adaptive Personalization Layers + + +
+ Federated learning has gained significant attention due to its groundbreaking +ability to enable distributed learning while maintaining privacy constraints. +However, as a consequence of data heterogeneity among decentralized devices, it +inherently experiences significant learning degradation and slow convergence +speed. Therefore, it is natural to employ the concept of clustering homogeneous +clients into the same group, allowing only the model weights within each group +to be aggregated. While most existing clustered federated learning methods +employ either model gradients or inference outputs as metrics for client +partitioning, with the goal of grouping similar devices together, may still +have heterogeneity within each cluster. Moreover, there is a scarcity of +research exploring the underlying reasons for determining the appropriate +timing for clustering, resulting in the common practice of assigning each +client to its own individual cluster, particularly in the context of highly non +independent and identically distributed (Non-IID) data. In this paper, we +introduce a two-stage decoupling federated learning algorithm with adaptive +personalization layers named FedTSDP, where client clustering is performed +twice according to inference outputs and model weights, respectively. Hopkins +amended sampling is adopted to determine the appropriate timing for clustering +and the sampling weight of public unlabeled data. In addition, a simple yet +effective approach is developed to adaptively adjust the personalization layers +based on varying degrees of data skew. Experimental results show that our +proposed method has reliable performance on both IID and non-IID scenarios. + +
+
+
+
+
+ + ☆ Peering Through Preferences: Unraveling Feedback Acquisition for + Aligning Large Language Models + + +
+ Aligning large language models (LLMs) with human values and intents +critically involves the use of human or AI feedback. While dense feedback +annotations are expensive to acquire and integrate, sparse feedback presents a +structural design choice between ratings (e.g., score Response A on a scale of +1-7) and rankings (e.g., is Response A better than Response B?). In this work, +we analyze the effect of this design choice for the alignment and evaluation of +LLMs. We uncover an inconsistency problem wherein the preferences inferred from +ratings and rankings significantly disagree 60% for both human and AI +annotators. Our subsequent analysis identifies various facets of annotator +biases that explain this phenomena, such as human annotators would rate denser +responses higher while preferring accuracy during pairwise judgments. To our +surprise, we also observe that the choice of feedback protocol also has a +significant effect on the evaluation of aligned LLMs. In particular, we find +that LLMs that leverage rankings data for alignment (say model X) are preferred +over those that leverage ratings data (say model Y), with a rank-based +evaluation protocol (is X/Y's response better than reference response?) but not +with a rating-based evaluation protocol (score Rank X/Y's response on a scale +of 1-7). Our findings thus shed light on critical gaps in methods for +evaluating the real-world utility of language models and their strong +dependence on the feedback protocol used for alignment. Our code and data are +available at https://github.com/Hritikbansal/sparse_feedback. + +
+
+ comment: 24 pages, 12 Tables, 3 Figures +
+
+
+
+
+ + ☆ HAlf-MAsked Model for Named Entity Sentiment analysis + + +
+ Named Entity Sentiment analysis (NESA) is one of the most actively developing +application domains in Natural Language Processing (NLP). Social media NESA is +a significant field of opinion analysis since detecting and tracking sentiment +trends in the news flow is crucial for building various analytical systems and +monitoring the media image of specific people or companies. In this paper, we +study different transformers-based solutions NESA in RuSentNE-23 evaluation. +Despite the effectiveness of the BERT-like models, they can still struggle with +certain challenges, such as overfitting, which appeared to be the main obstacle +in achieving high accuracy on the RuSentNE-23 data. We present several +approaches to overcome this problem, among which there is a novel technique of +additional pass over given data with masked entity before making the final +prediction so that we can combine logits from the model when it knows the exact +entity it predicts sentiment for and when it does not. Utilizing this +technique, we ensemble multiple BERT- like models trained on different subsets +of data to improve overall performance. Our proposed model achieves the best +result on RuSentNE-23 evaluation data and demonstrates improved consistency in +entity-level sentiment analysis. + +
+
+
+
+
+ + ☆ FedCiR: Client-Invariant Representation Learning for Federated Non-IID + Features + + +
+ Federated learning (FL) is a distributed learning paradigm that maximizes the +potential of data-driven models for edge devices without sharing their raw +data. However, devices often have non-independent and identically distributed +(non-IID) data, meaning their local data distributions can vary significantly. +The heterogeneity in input data distributions across devices, commonly referred +to as the feature shift problem, can adversely impact the training convergence +and accuracy of the global model. To analyze the intrinsic causes of the +feature shift problem, we develop a generalization error bound in FL, which +motivates us to propose FedCiR, a client-invariant representation learning +framework that enables clients to extract informative and client-invariant +features. Specifically, we improve the mutual information term between +representations and labels to encourage representations to carry essential +classification knowledge, and diminish the mutual information term between the +client set and representations conditioned on labels to promote representations +of clients to be client-invariant. We further incorporate two regularizers into +the FL framework to bound the mutual information terms with an approximate +global representation distribution to compensate for the absence of the +ground-truth global representation distribution, thus achieving informative and +client-invariant feature extraction. To achieve global representation +distribution approximation, we propose a data-free mechanism performed by the +server without compromising privacy. Extensive experiments demonstrate the +effectiveness of our approach in achieving client-invariant representation +learning and solving the data heterogeneity issue. + +
+
+
+
+
+ + ☆ Split Without a Leak: Reducing Privacy Leakage in Split Learning + + +
+ The popularity of Deep Learning (DL) makes the privacy of sensitive data more +imperative than ever. As a result, various privacy-preserving techniques have +been implemented to preserve user data privacy in DL. Among various +privacy-preserving techniques, collaborative learning techniques, such as Split +Learning (SL) have been utilized to accelerate the learning and prediction +process. Initially, SL was considered a promising approach to data privacy. +However, subsequent research has demonstrated that SL is susceptible to many +types of attacks and, therefore, it cannot serve as a privacy-preserving +technique. Meanwhile, countermeasures using a combination of SL and encryption +have also been introduced to achieve privacy-preserving deep learning. In this +work, we propose a hybrid approach using SL and Homomorphic Encryption (HE). +The idea behind it is that the client encrypts the activation map (the output +of the split layer between the client and the server) before sending it to the +server. Hence, during both forward and backward propagation, the server cannot +reconstruct the client's input data from the intermediate activation map. This +improvement is important as it reduces privacy leakage compared to other +SL-based works, where the server can gain valuable information about the +client's input. In addition, on the MIT-BIH dataset, our proposed hybrid +approach using SL and HE yields faster training time (about 6 times) and +significantly reduced communication overhead (almost 160 times) compared to +other HE-based approaches, thereby offering improved privacy protection for +sensitive data in DL. + +
+
+
+
+
+ + ☆ Efficient and Explainable Graph Neural Architecture Search via + Monte-Carlo Tree Search + + +
+ Graph neural networks (GNNs) are powerful tools for performing data science +tasks in various domains. Although we use GNNs in wide application scenarios, +it is a laborious task for researchers and practitioners to design/select +optimal GNN rchitectures in diverse graphs. To save human efforts and +computational costs, graph neural architecture search (Graph NAS) has been used +to search for a sub-optimal GNN architecture that combines existing components. +However, there are no existing Graph NAS methods that satisfy explainability, +efficiency, and adaptability to various graphs. Therefore, we propose an +efficient and explainable Graph NAS method, called ExGNAS, which consists of +(i) a simple search space that can adapt to various graphs and (ii) a search +algorithm that makes the decision process explainable. The search space +includes only fundamental functions that can handle homophilic and heterophilic +graphs. The search algorithm efficiently searches for the best GNN architecture +via Monte-Carlo tree search without neural models. The combination of our +search space and algorithm achieves finding accurate GNN models and the +important functions within the search space. We comprehensively evaluate our +method compared with twelve hand-crafted GNN architectures and three Graph NAS +methods in four graphs. Our experimental results show that ExGNAS increases AUC +up to 3.6 and reduces run time up to 78\% compared with the state-of-the-art +Graph NAS methods. Furthermore, we show ExGNAS is effective in analyzing the +difference between GNN architectures in homophilic and heterophilic graphs. + +
+
+
+
+
+ + ☆ Fully Embedded Time-Series Generative Adversarial Networks + + +
+ Generative Adversarial Networks (GANs) should produce synthetic data that +fits the underlying distribution of the data being modeled. For real valued +time-series data, this implies the need to simultaneously capture the static +distribution of the data, but also the full temporal distribution of the data +for any potential time horizon. This temporal element produces a more complex +problem that can potentially leave current solutions under-constrained, +unstable during training, or prone to varying degrees of mode collapse. In +FETSGAN, entire sequences are translated directly to the generator's sampling +space using a seq2seq style adversarial auto encoder (AAE), where adversarial +training is used to match the training distribution in both the feature space +and the lower dimensional sampling space. This additional constraint provides a +loose assurance that the temporal distribution of the synthetic samples will +not collapse. In addition, the First Above Threshold (FAT) operator is +introduced to supplement the reconstruction of encoded sequences, which +improves training stability and the overall quality of the synthetic data being +generated. These novel contributions demonstrate a significant improvement to +the current state of the art for adversarial learners in qualitative measures +of temporal similarity and quantitative predictive ability of data generated +through FETSGAN. + +
+
+
+
+
+ + ☆ Surrogate-based Autotuning for Randomized Sketching Algorithms in + Regression Problems + + +
+ Algorithms from Randomized Numerical Linear Algebra (RandNLA) are known to be +effective in handling high-dimensional computational problems, providing +high-quality empirical performance as well as strong probabilistic guarantees. +However, their practical application is complicated by the fact that the user +needs to set various algorithm-specific tuning parameters which are different +than those used in traditional NLA. This paper demonstrates how a +surrogate-based autotuning approach can be used to address fundamental problems +of parameter selection in RandNLA algorithms. In particular, we provide a +detailed investigation of surrogate-based autotuning for +sketch-and-precondition (SAP) based randomized least squares methods, which +have been one of the great success stories in modern RandNLA. Empirical results +show that our surrogate-based autotuning approach can achieve near-optimal +performance with much less tuning cost than a random search (up to about 4x +fewer trials of different parameter configurations). Moreover, while our +experiments focus on least squares, our results demonstrate a general-purpose +autotuning pipeline applicable to any kind of RandNLA algorithm. + +
+
+
+
+
+ + ☆ Exploring Deep Learning for Full-disk Solar Flare Prediction with + Empirical Insights from Guided Grad-CAM Explanations + + +
+ This study progresses solar flare prediction research by presenting a +full-disk deep-learning model to forecast $\geq$M-class solar flares and +evaluating its efficacy on both central (within $\pm$70$^\circ$) and near-limb +(beyond $\pm$70$^\circ$) events, showcasing qualitative assessment of post hoc +explanations for the model's predictions, and providing empirical findings from +human-centered quantitative assessments of these explanations. Our model is +trained using hourly full-disk line-of-sight magnetogram images to predict +$\geq$M-class solar flares within the subsequent 24-hour prediction window. +Additionally, we apply the Guided Gradient-weighted Class Activation Mapping +(Guided Grad-CAM) attribution method to interpret our model's predictions and +evaluate the explanations. Our analysis unveils that full-disk solar flare +predictions correspond with active region characteristics. The following points +represent the most important findings of our study: (1) Our deep learning +models achieved an average true skill statistic (TSS) of $\sim$0.51 and a +Heidke skill score (HSS) of $\sim$0.38, exhibiting skill to predict solar +flares where for central locations the average recall is $\sim$0.75 (recall +values for X- and M-class are 0.95 and 0.73 respectively) and for the near-limb +flares the average recall is $\sim$0.52 (recall values for X- and M-class are +0.74 and 0.50 respectively); (2) qualitative examination of the model's +explanations reveals that it discerns and leverages features linked to active +regions in both central and near-limb locations within full-disk magnetograms +to produce respective predictions. In essence, our models grasp the shape and +texture-based properties of flaring active regions, even in proximity to limb +areas -- a novel and essential capability with considerable significance for +operational forecasting systems. + +
+
+ comment: This is a preprint accepted at the 10th IEEE International Conference + On Data Science And Advanced Analytics (DSAA 2023). The conference + proceedings will be published by the IEEE Xplore Digital Library with ISBN: + 979-8-3503-4503-2. 10 pages, 6 figures +
+
+
+
+
+ + ☆ Speech Wikimedia: A 77 Language Multilingual Speech Dataset ICML + + +
+ The Speech Wikimedia Dataset is a publicly available compilation of audio +with transcriptions extracted from Wikimedia Commons. It includes 1780 hours +(195 GB) of CC-BY-SA licensed transcribed speech from a diverse set of +scenarios and speakers, in 77 different languages. Each audio file has one or +more transcriptions in different languages, making this dataset suitable for +training speech recognition, speech translation, and machine translation +models. + +
+
+ comment: Data-Centric Machine Learning Workshop at the International Machine + Learning Conference 2023 (ICML) +
+
+
+
+
+ + ☆ Threshold KNN-Shapley: A Linear-Time and Privacy-Friendly Approach to + Data Valuation + + +
+ Data valuation, a critical aspect of data-centric ML research, aims to +quantify the usefulness of individual data sources in training machine learning +(ML) models. However, data valuation faces significant yet frequently +overlooked privacy challenges despite its importance. This paper studies these +challenges with a focus on KNN-Shapley, one of the most practical data +valuation methods nowadays. We first emphasize the inherent privacy risks of +KNN-Shapley, and demonstrate the significant technical difficulties in adapting +KNN-Shapley to accommodate differential privacy (DP). To overcome these +challenges, we introduce TKNN-Shapley, a refined variant of KNN-Shapley that is +privacy-friendly, allowing for straightforward modifications to incorporate DP +guarantee (DP-TKNN-Shapley). We show that DP-TKNN-Shapley has several +advantages and offers a superior privacy-utility tradeoff compared to naively +privatized KNN-Shapley in discerning data quality. Moreover, even non-private +TKNN-Shapley achieves comparable performance as KNN-Shapley. Overall, our +findings suggest that TKNN-Shapley is a promising alternative to KNN-Shapley, +particularly for real-world applications involving sensitive data. + +
+
+
+
+
+ + ☆ Towards a Rigorous Analysis of Mutual Information in Contrastive + Learning + + +
+ Contrastive learning has emerged as a cornerstone in recent achievements of +unsupervised representation learning. Its primary paradigm involves an instance +discrimination task with a mutual information loss. The loss is known as +InfoNCE and it has yielded vital insights into contrastive learning through the +lens of mutual information analysis. However, the estimation of mutual +information can prove challenging, creating a gap between the elegance of its +mathematical foundation and the complexity of its estimation. As a result, +drawing rigorous insights or conclusions from mutual information analysis +becomes intricate. In this study, we introduce three novel methods and a few +related theorems, aimed at enhancing the rigor of mutual information analysis. +Despite their simplicity, these methods can carry substantial utility. +Leveraging these approaches, we reassess three instances of contrastive +learning analysis, illustrating their capacity to facilitate deeper +comprehension or to rectify pre-existing misconceptions. Specifically, we +investigate small batch size, mutual information as a measure, and the InfoMin +principle. + +
+
+ comment: 18 pages, 7 figures, Under review +
+
+
+
+
+ + ☆ Fragment and Integrate Network (FIN): A Novel Spatial-Temporal Modeling + Based on Long Sequential Behavior for Online Food Ordering Click-Through Rate + Prediction CIKM 2023 + + +
+ Spatial-temporal information has been proven to be of great significance for +click-through rate prediction tasks in online Location-Based Services (LBS), +especially in mainstream food ordering platforms such as DoorDash, Uber Eats, +Meituan, and Ele.me. Modeling user spatial-temporal preferences with sequential +behavior data has become a hot topic in recommendation systems and online +advertising. However, most of existing methods either lack the representation +of rich spatial-temporal information or only handle user behaviors with limited +length, e.g. 100. In this paper, we tackle these problems by designing a new +spatial-temporal modeling paradigm named Fragment and Integrate Network (FIN). +FIN consists of two networks: (i) Fragment Network (FN) extracts Multiple +Sub-Sequences (MSS) from lifelong sequential behavior data, and captures the +specific spatial-temporal representation by modeling each MSS respectively. +Here both a simplified attention and a complicated attention are adopted to +balance the performance gain and resource consumption. (ii) Integrate Network +(IN) builds a new integrated sequence by utilizing spatial-temporal interaction +on MSS and captures the comprehensive spatial-temporal representation by +modeling the integrated sequence with a complicated attention. Both public +datasets and production datasets have demonstrated the accuracy and scalability +of FIN. Since 2022, FIN has been fully deployed in the recommendation +advertising system of Ele.me, one of the most popular online food ordering +platforms in China, obtaining 5.7% improvement on Click-Through Rate (CTR) and +7.3% increase on Revenue Per Mille (RPM). + +
+
+ comment: Accepted by CIKM 2023 Applied Research Paper +
+
+
+
+
+ + ☆ Training Towards Critical Use: Learning to Situate AI Predictions + Relative to Human Knowledge + + +
+ A growing body of research has explored how to support humans in making +better use of AI-based decision support, including via training and onboarding. +Existing research has focused on decision-making tasks where it is possible to +evaluate "appropriate reliance" by comparing each decision against a ground +truth label that cleanly maps to both the AI's predictive target and the human +decision-maker's goals. However, this assumption does not hold in many +real-world settings where AI tools are deployed today (e.g., social work, +criminal justice, and healthcare). In this paper, we introduce a +process-oriented notion of appropriate reliance called critical use that +centers the human's ability to situate AI predictions against knowledge that is +uniquely available to them but unavailable to the AI model. To explore how +training can support critical use, we conduct a randomized online experiment in +a complex social decision-making setting: child maltreatment screening. We find +that, by providing participants with accelerated, low-stakes opportunities to +practice AI-assisted decision-making in this setting, novices came to exhibit +patterns of disagreement with AI that resemble those of experienced workers. A +qualitative examination of participants' explanations for their AI-assisted +decisions revealed that they drew upon qualitative case narratives, to which +the AI model did not have access, to learn when (not) to rely on AI +predictions. Our findings open new questions for the study and design of +training for real-world AI-assisted decision-making. + +
+
+
+
+
+ + ☆ Segmenting mechanically heterogeneous domains via unsupervised learning + + +
+ From biological organs to soft robotics, highly deformable materials are +essential components of natural and engineered systems. These highly deformable +materials can have heterogeneous material properties, and can experience +heterogeneous deformations with or without underlying material heterogeneity. +Many recent works have established that computational modeling approaches are +well suited for understanding and predicting the consequences of material +heterogeneity and for interpreting observed heterogeneous strain fields. In +particular, there has been significant work towards developing inverse analysis +approaches that can convert observed kinematic quantities (e.g., displacement, +strain) to material properties and mechanical state. Despite the success of +these approaches, they are not necessarily generalizable and often rely on +tight control and knowledge of boundary conditions. Here, we will build on the +recent advances (and ubiquity) of machine learning approaches to explore +alternative approaches to detect patterns in heterogeneous material properties +and mechanical behavior. Specifically, we will explore unsupervised learning +approaches to clustering and ensemble clutering to identify heterogeneous +regions. Overall, we find that these approaches are effective, yet limited in +their abilities. Through this initial exploration (where all data and code is +published alongside this manuscript), we set the stage for future studies that +more specifically adapt these methods to mechanical data. + +
+
+ comment: 26 pages, 10 figures +
+
+
+
+
+ + ☆ CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts + + +
+ We present 'CongNaMul', a comprehensive dataset designed for various tasks in +soybean sprouts image analysis. The CongNaMul dataset is curated to facilitate +tasks such as image classification, semantic segmentation, decomposition, and +measurement of length and weight. The classification task provides four classes +to determine the quality of soybean sprouts: normal, broken, spotted, and +broken and spotted, for the development of AI-aided automatic quality +inspection technology. For semantic segmentation, images with varying +complexity, from single sprout images to images with multiple sprouts, along +with human-labelled mask images, are included. The label has 4 different +classes: background, head, body, tail. The dataset also provides images and +masks for the image decomposition task, including two separate sprout images +and their combined form. Lastly, 5 physical features of sprouts (head length, +body length, body thickness, tail length, weight) are provided for image-based +measurement tasks. This dataset is expected to be a valuable resource for a +wide range of research and applications in the advanced analysis of images of +soybean sprouts. Also, we hope that this dataset can assist researchers +studying classification, semantic segmentation, decomposition, and physical +feature measurement in other industrial fields, in evaluating their models. The +dataset is available at the authors' repository. (https://bhban.kr/data) + +
+
+ comment: Accepted to International Conference on ICT Convergence 2023 +
+
+
+
+
+ + ☆ MDTD: A Multi Domain Trojan Detector for Deep Neural Networks CCS + + +
+ Machine learning models that use deep neural networks (DNNs) are vulnerable +to backdoor attacks. An adversary carrying out a backdoor attack embeds a +predefined perturbation called a trigger into a small subset of input samples +and trains the DNN such that the presence of the trigger in the input results +in an adversary-desired output class. Such adversarial retraining however needs +to ensure that outputs for inputs without the trigger remain unaffected and +provide high classification accuracy on clean samples. In this paper, we +propose MDTD, a Multi-Domain Trojan Detector for DNNs, which detects inputs +containing a Trojan trigger at testing time. MDTD does not require knowledge of +trigger-embedding strategy of the attacker and can be applied to a pre-trained +DNN model with image, audio, or graph-based inputs. MDTD leverages an insight +that input samples containing a Trojan trigger are located relatively farther +away from a decision boundary than clean samples. MDTD estimates the distance +to a decision boundary using adversarial learning methods and uses this +distance to infer whether a test-time input sample is Trojaned or not. We +evaluate MDTD against state-of-the-art Trojan detection methods across five +widely used image-based datasets: CIFAR100, CIFAR10, GTSRB, SVHN, and +Flowers102; four graph-based datasets: AIDS, WinMal, Toxicant, and COLLAB; and +the SpeechCommand audio dataset. MDTD effectively identifies samples that +contain different types of Trojan triggers. We evaluate MDTD against adaptive +attacks where an adversary trains a robust DNN to increase (decrease) distance +of benign (Trojan) inputs from a decision boundary. + +
+
+ comment: Accepted to ACM Conference on Computer and Communications Security + (ACM CCS) 2023 +
+
+
+
+
+ + ♻ ☆ Policy composition in reinforcement learning via multi-objective policy + optimization + + +
+ We enable reinforcement learning agents to learn successful behavior policies +by utilizing relevant pre-existing teacher policies. The teacher policies are +introduced as objectives, in addition to the task objective, in a +multi-objective policy optimization setting. Using the Multi-Objective Maximum +a Posteriori Policy Optimization algorithm (Abdolmaleki et al. 2020), we show +that teacher policies can help speed up learning, particularly in the absence +of shaping rewards. In two domains with continuous observation and action +spaces, our agents successfully compose teacher policies in sequence and in +parallel, and are also able to further extend the policies of the teachers in +order to solve the task. + Depending on the specified combination of task and teacher(s), teacher(s) may +naturally act to limit the final performance of an agent. The extent to which +agents are required to adhere to teacher policies are determined by +hyperparameters which determine both the effect of teachers on learning speed +and the eventual performance of the agent on the task. In the humanoid domain +(Tassa et al. 2018), we also equip agents with the ability to control the +selection of teachers. With this ability, agents are able to meaningfully +compose from the teacher policies to achieve a superior task reward on the walk +task than in cases without access to the teacher policies. We show the +resemblance of composed task policies with the corresponding teacher policies +through videos. + +
+
+
+
+
+ + ♻ ☆ Walking in the Shadow: A New Perspective on Descent Directions for + Constrained Minimization + + +
+ Descent directions such as movement towards Descent directions, including +movement towards Frank-Wolfe vertices, away-steps, in-face away-steps and +pairwise directions, have been an important design consideration in conditional +gradient descent (CGD) variants. In this work, we attempt to demystify the +impact of the movement in these directions towards attaining constrained +minimizers. The optimal local direction of descent is the directional +derivative (i.e., shadow) of the projection of the negative gradient. We show +that this direction is the best away-step possible, and the continuous-time +dynamics of moving in the shadow is equivalent to the dynamics of projected +gradient descent (PGD), although it's non-trivial to discretize. We also show +that Frank-Wolfe (FW) vertices correspond to projecting onto the polytope using +an "infinite" step in the direction of the negative gradient, thus providing a +new perspective on these steps. We combine these insights into a novel +Shadow-CG method that uses FW and shadow steps, while enjoying linear +convergence, with a rate that depends on the number of breakpoints in its +projection curve, rather than the pyramidal width. We provide a linear bound on +the number of breakpoints for simple polytopes and present scaling-invariant +upper bounds for general polytopes based on the number of facets. We exemplify +the benefit of using Shadow-CG computationally for various applications, while +raising an open question about tightening the bound on the number of +breakpoints for general polytopes. + +
+
+
+
+
+ + ♻ ☆ CartiMorph: a framework for automated knee articular cartilage + morphometrics + + +
+ We introduce CartiMorph, a framework for automated knee articular cartilage +morphometrics. It takes an image as input and generates quantitative metrics +for cartilage subregions, including the percentage of full-thickness cartilage +loss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the +power of deep learning models for hierarchical image feature representation. +Deep learning models were trained and validated for tissue segmentation, +template construction, and template-to-image registration. We established +methods for surface-normal-based cartilage thickness mapping, FCL estimation, +and rule-based cartilage parcellation. Our cartilage thickness map showed less +error in thin and peripheral regions. We evaluated the effectiveness of the +adopted segmentation model by comparing the quantitative metrics obtained from +model segmentation and those from manual segmentation. The root-mean-squared +deviation of the FCL measurements was less than 8%, and strong correlations +were observed for the mean thickness (Pearson's correlation coefficient $\rho +\in [0.82,0.97]$), surface area ($\rho \in [0.82,0.98]$) and volume ($\rho \in +[0.89,0.98]$) measurements. We compared our FCL measurements with those from a +previous study and found that our measurements deviated less from the ground +truths. We observed superior performance of the proposed rule-based cartilage +parcellation method compared with the atlas-based approach. CartiMorph has the +potential to promote imaging biomarkers discovery for knee osteoarthritis. + +
+
+ comment: To be published in Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Distributionally Robust Statistical Verification with Imprecise Neural + Networks + + +
+ A particularly challenging problem in AI safety is providing guarantees on +the behavior of high-dimensional autonomous systems. Verification approaches +centered around reachability analysis fail to scale, and purely statistical +approaches are constrained by the distributional assumptions about the sampling +process. Instead, we pose a distributionally robust version of the statistical +verification problem for black-box systems, where our performance guarantees +hold over a large family of distributions. This paper proposes a novel approach +based on a combination of active learning, uncertainty quantification, and +neural network verification. A central piece of our approach is an ensemble +technique called Imprecise Neural Networks, which provides the uncertainty to +guide active learning. The active learning uses an exhaustive neural-network +verification tool Sherlock to collect samples. An evaluation on multiple +physical simulators in the openAI gym Mujoco environments with +reinforcement-learned controllers demonstrates that our approach can provide +useful and scalable guarantees for high-dimensional systems. + +
+
+
+
+
+ + ♻ ☆ On progressive sharpening, flat minima and generalisation + + +
+ We present a new approach to understanding the relationship between loss +curvature and input-output model behaviour in deep learning. Specifically, we +use existing empirical analyses of the spectrum of deep network loss Hessians +to ground an ansatz tying together the loss Hessian and the input-output +Jacobian of a deep neural network over training samples throughout training. We +then prove a series of theoretical results which quantify the degree to which +the input-output Jacobian of a model approximates its Lipschitz norm over a +data distribution, and deduce a novel generalisation bound in terms of the +empirical Jacobian. We use our ansatz, together with our theoretical results, +to give a new account of the recently observed progressive sharpening +phenomenon, as well as the generalisation properties of flat minima. +Experimental evidence is provided to validate our claims. + +
+
+
+
+
+ + ♻ ☆ What You Hear Is What You See: Audio Quality Metrics From Image Quality + Metrics + + +
+ In this study, we investigate the feasibility of utilizing state-of-the-art +image perceptual metrics for evaluating audio signals by representing them as +spectrograms. The encouraging outcome of the proposed approach is based on the +similarity between the neural mechanisms in the auditory and visual pathways. +Furthermore, we customise one of the metrics which has a psychoacoustically +plausible architecture to account for the peculiarities of sound signals. We +evaluate the effectiveness of our proposed metric and several baseline metrics +using a music dataset, with promising results in terms of the correlation +between the metrics and the perceived quality of audio as rated by human +evaluators. + +
+
+
+
+
+ + ♻ ☆ Cancellation-Free Regret Bounds for Lagrangian Approaches in Constrained + Markov Decision Processes + + +
+ Constrained Markov Decision Processes (CMDPs) are one of the common ways to +model safe reinforcement learning problems, where constraint functions model +the safety objectives. Lagrangian-based dual or primal-dual algorithms provide +efficient methods for learning in CMDPs. For these algorithms, the currently +known regret bounds in the finite-horizon setting allow for a "cancellation of +errors"; one can compensate for a constraint violation in one episode with a +strict constraint satisfaction in another. However, we do not consider such a +behavior safe in practical applications. In this paper, we overcome this +weakness by proposing a novel model-based dual algorithm OptAug-CMDP for +tabular finite-horizon CMDPs. Our algorithm is motivated by the augmented +Lagrangian method and can be performed efficiently. We show that during $K$ +episodes of exploring the CMDP, our algorithm obtains a regret of +$\tilde{O}(\sqrt{K})$ for both the objective and the constraint violation. +Unlike existing Lagrangian approaches, our algorithm achieves this regret +without the need for the cancellation of errors. + +
+
+
+
+
+ + ♻ ☆ On the Consistency of Average Embeddings for Item Recommendation RecSys 2023 + + +
+ A prevalent practice in recommender systems consists in averaging item +embeddings to represent users or higher-level concepts in the same embedding +space. This paper investigates the relevance of such a practice. For this +purpose, we propose an expected precision score, designed to measure the +consistency of an average embedding relative to the items used for its +construction. We subsequently analyze the mathematical expression of this score +in a theoretical setting with specific assumptions, as well as its empirical +behavior on real-world data from music streaming services. Our results +emphasize that real-world averages are less consistent for recommendation, +which paves the way for future research to better align real-world embeddings +with assumptions from our theoretical setting. + +
+
+ comment: 17th ACM Conference on Recommender Systems (RecSys 2023) +
+
+
+
+
+ + ♻ ☆ Rule Generation for Classification: Scalability, Interpretability, and + Fairness + + +
+ We introduce a new rule-based optimization method for classification with +constraints. The proposed method leverages column generation for linear +programming, and hence, is scalable to large datasets. The resulting pricing +subproblem is shown to be NP-Hard. We recourse to a decision tree-based +heuristic and solve a proxy pricing subproblem for acceleration. The method +returns a set of rules along with their optimal weights indicating the +importance of each rule for learning. We address interpretability and fairness +by assigning cost coefficients to the rules and introducing additional +constraints. In particular, we focus on local interpretability and generalize +separation criterion in fairness to multiple sensitive attributes and classes. +We test the performance of the proposed methodology on a collection of datasets +and present a case study to elaborate on its different aspects. The proposed +rule-based learning method exhibits a good compromise between local +interpretability and fairness on the one side, and accuracy on the other side. + +
+
+
+
+
+ + ♻ ☆ Tensor train completion: local recovery guarantees via Riemannian + optimization + + +
+ In this work, we estimate the number of randomly selected elements of a +tensor that with high probability guarantees local convergence of Riemannian +gradient descent for tensor train completion. We derive a new bound for the +orthogonal projections onto the tangent spaces based on the harmonic mean of +the unfoldings' singular values and introduce a notion of core coherence for +tensor trains. We also extend the results to tensor train completion with +auxiliary subspace information and obtain the corresponding local convergence +guarantees. + +
+
+ comment: 1 figure added; Accepted version +
+
+
+
+
+ + ♻ ☆ Quantized Low-Rank Multivariate Regression with Random Dithering + + +
+ Low-rank multivariate regression (LRMR) is an important statistical learning +model that combines highly correlated tasks as a multiresponse regression +problem with low-rank priori on the coefficient matrix. In this paper, we study +quantized LRMR, a practical setting where the responses and/or the covariates +are discretized to finite precision. We focus on the estimation of the +underlying coefficient matrix. To make consistent estimator that could achieve +arbitrarily small error possible, we employ uniform quantization with random +dithering, i.e., we add appropriate random noise to the data before +quantization. Specifically, uniform dither and triangular dither are used for +responses and covariates, respectively. Based on the quantized data, we propose +the constrained Lasso and regularized Lasso estimators, and derive the +non-asymptotic error bounds. With the aid of dithering, the estimators achieve +minimax optimal rate, while quantization only slightly worsens the +multiplicative factor in the error rate. Moreover, we extend our results to a +low-rank regression model with matrix responses. We corroborate and demonstrate +our theoretical results via simulations on synthetic data or image restoration. + +
+
+ comment: 16 pages (Submitted) +
+
+
+
+
+ + ♻ ☆ MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with + Transformers + + +
+ Message Passing Interface (MPI) plays a crucial role in distributed memory +parallelization across multiple nodes. However, parallelizing MPI code +manually, and specifically, performing domain decomposition, is a challenging, +error-prone task. In this paper, we address this problem by developing +MPI-RICAL, a novel data-driven, programming-assistance tool that assists +programmers in writing domain decomposition based distributed memory +parallelization code. Specifically, we train a supervised language model to +suggest MPI functions and their proper locations in the code on the fly. We +also introduce MPICodeCorpus, the first publicly available corpus of MPI-based +parallel programs that is created by mining more than 15,000 open-source +repositories on GitHub. Experimental results have been done on MPICodeCorpus +and more importantly, on a compiled benchmark of MPI-based parallel programs +for numerical computations that represent real-world scientific applications. +MPI-RICAL achieves F1 scores between 0.87-0.91 on these programs, demonstrating +its accuracy in suggesting correct MPI functions at appropriate code +locations.. The source code used in this work, as well as other relevant +sources, are available at: +https://github.com/Scientific-Computing-Lab-NRCN/MPI-rical + +
+
+
+
+
+ + ♻ ☆ Modeling Moral Choices in Social Dilemmas with Multi-Agent Reinforcement + Learning IJCAI 2023 + + +
+ Practical uses of Artificial Intelligence (AI) in the real world have +demonstrated the importance of embedding moral choices into intelligent agents. +They have also highlighted that defining top-down ethical constraints on AI +according to any one type of morality is extremely challenging and can pose +risks. A bottom-up learning approach may be more appropriate for studying and +developing ethical behavior in AI agents. In particular, we believe that an +interesting and insightful starting point is the analysis of emergent behavior +of Reinforcement Learning (RL) agents that act according to a predefined set of +moral rewards in social dilemmas. + In this work, we present a systematic analysis of the choices made by +intrinsically-motivated RL agents whose rewards are based on moral theories. We +aim to design reward structures that are simplified yet representative of a set +of key ethical systems. Therefore, we first define moral reward functions that +distinguish between consequence- and norm-based agents, between morality based +on societal norms or internal virtues, and between single- and mixed-virtue +(e.g., multi-objective) methodologies. Then, we evaluate our approach by +modeling repeated dyadic interactions between learning moral agents in three +iterated social dilemma games (Prisoner's Dilemma, Volunteer's Dilemma and Stag +Hunt). We analyze the impact of different types of morality on the emergence of +cooperation, defection or exploitation, and the corresponding social outcomes. +Finally, we discuss the implications of these findings for the development of +moral agents in artificial and mixed human-AI societies. + +
+
+ comment: Accepted at IJCAI 2023 (32nd International Joint Conference on + Artificial Intelligence - Macao, S.A.R.) +
+
+
+
+
+ + ♻ ☆ NeXtQSM -- A complete deep learning pipeline for data-consistent + quantitative susceptibility mapping trained with hybrid data + + +
+ Deep learning based Quantitative Susceptibility Mapping (QSM) has shown great +potential in recent years, obtaining similar results to established +non-learning approaches. Many current deep learning approaches are not data +consistent, require in vivo training data or solve the QSM problem in +consecutive steps resulting in the propagation of errors. Here we aim to +overcome these limitations and developed a framework to solve the QSM +processing steps jointly. We developed a new hybrid training data generation +method that enables the end-to-end training for solving background field +correction and dipole inversion in a data-consistent fashion using a +variational network that combines the QSM model term and a learned regularizer. +We demonstrate that NeXtQSM overcomes the limitations of previous deep learning +methods. NeXtQSM offers a new deep learning based pipeline for computing +quantitative susceptibility maps that integrates each processing step into the +training and provides results that are robust and fast. + +
+
+
+
+
+ + ♻ ☆ An exponentially-growing family of universal quantum circuits + + +
+ Quantum machine learning has become an area of growing interest but has +certain theoretical and hardware-specific limitations. Notably, the problem of +vanishing gradients, or barren plateaus, renders the training impossible for +circuits with high qubit counts, imposing a limit on the number of qubits that +data scientists can use for solving problems. Independently, angle-embedded +supervised quantum neural networks were shown to produce truncated Fourier +series with a degree directly dependent on two factors: the depth of the +encoding and the number of parallel qubits the encoding applied to. The degree +of the Fourier series limits the model expressivity. This work introduces two +new architectures whose Fourier degrees grow exponentially: the sequential and +parallel exponential quantum machine learning architectures. This is done by +efficiently using the available Hilbert space when encoding, increasing the +expressivity of the quantum encoding. Therefore, the exponential growth allows +staying at the low-qubit limit to create highly expressive circuits avoiding +barren plateaus. Practically, parallel exponential architecture was shown to +outperform the existing linear architectures by reducing their final mean +square error value by up to 44.7% in a one-dimensional test problem. +Furthermore, the feasibility of this technique was also shown on a trapped ion +quantum processing unit. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Exploring the Benefits of Visual Prompting in Differential Privacy ICCV 2023 + + +
+ Visual Prompting (VP) is an emerging and powerful technique that allows +sample-efficient adaptation to downstream tasks by engineering a well-trained +frozen source model. In this work, we explore the benefits of VP in +constructing compelling neural network classifiers with differential privacy +(DP). We explore and integrate VP into canonical DP training methods and +demonstrate its simplicity and efficiency. In particular, we discover that VP +in tandem with PATE, a state-of-the-art DP training method that leverages the +knowledge transfer from an ensemble of teachers, achieves the state-of-the-art +privacy-utility trade-off with minimum expenditure of privacy budget. Moreover, +we conduct additional experiments on cross-domain image classification with a +sufficient domain gap to further unveil the advantage of VP in DP. Lastly, we +also conduct extensive ablation studies to validate the effectiveness and +contribution of VP under DP consideration. Our code is available at +(https://github.com/EzzzLi/Prompt-PATE). + +
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DeltaNN: Assessing the Impact of Computational Environment Parameters on + the Performance of Image Recognition Models + + +
+ Image recognition tasks typically use deep learning and require enormous +processing power, thus relying on hardware accelerators like GPUs and TPUs for +fast, timely processing. Failure in real-time image recognition tasks can occur +due to sub-optimal mapping on hardware accelerators during model deployment, +which may lead to timing uncertainty and erroneous behavior. Mapping on +hardware accelerators is done using multiple software components like deep +learning frameworks, compilers, and device libraries, that we refer to as the +computational environment. Owing to the increased use of image recognition +tasks in safety-critical applications like autonomous driving and medical +imaging, it is imperative to assess their robustness to changes in the +computational environment, as the impact of parameters like deep learning +frameworks, compiler optimizations, and hardware devices on model performance +and correctness is not yet well understood. + In this paper we present a differential testing framework, DeltaNN, that +allows us to assess the impact of different computational environment +parameters on the performance of image recognition models during deployment, +post training. DeltaNN generates different implementations of a given image +recognition model for variations in environment parameters, namely, deep +learning frameworks, compiler optimizations and hardware devices and analyzes +differences in model performance as a result. Using DeltaNN, we conduct an +empirical study of robustness analysis of three popular image recognition +models using the ImageNet dataset. We report the impact in terms of +misclassifications and inference time differences across different settings. In +total, we observed up to 72% output label differences across deep learning +frameworks, and up to 81% unexpected performance degradation in terms of +inference time, when applying compiler optimizations. + +
+
+ comment: 11 pages, 10 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Laughing Matters: Introducing Laughing-Face Generation using Diffusion + Models + + +
+ Speech-driven animation has gained significant traction in recent years, with +current methods achieving near-photorealistic results. However, the field +remains underexplored regarding non-verbal communication despite evidence +demonstrating its importance in human interaction. In particular, generating +laughter sequences presents a unique challenge due to the intricacy and nuances +of this behaviour. This paper aims to bridge this gap by proposing a novel +model capable of generating realistic laughter sequences, given a still +portrait and an audio clip containing laughter. We highlight the failure cases +of traditional facial animation methods and leverage recent advances in +diffusion models to produce convincing laughter videos. We train our model on a +diverse set of laughter datasets and introduce an evaluation metric +specifically designed for laughter. When compared with previous speech-driven +approaches, our model achieves state-of-the-art performance across all metrics, +even when these are re-trained for laughter generation. Our code and project +are publicly available + +
+
+
+
+
+ + ♻ ☆ Fault Localization for Buggy Deep Learning Framework Conversions in + Image Recognition + + +
+ When deploying Deep Neural Networks (DNNs), developers often convert models +from one deep learning framework to another (e.g., TensorFlow to PyTorch). +However, this process is error-prone and can impact target model accuracy. To +identify the extent of such impact, we perform and briefly present a +differential analysis against three DNNs widely used for image recognition +(MobileNetV2, ResNet101, and InceptionV3) converted across four well-known deep +learning frameworks (PyTorch, Keras, TensorFlow (TF), and TFLite), which +revealed numerous model crashes and output label discrepancies of up to 72%. To +mitigate such errors, we present a novel approach towards fault localization +and repair of buggy deep learning framework conversions, focusing on +pre-trained image recognition models. Our technique consists of four stages of +analysis: 1) conversion tools, 2) model parameters, 3) model hyperparameters, +and 4) graph representation. In addition, we propose various strategies towards +fault repair of the faults detected. We implement our technique on top of the +Apache TVM deep learning compiler, and we test it by conducting a preliminary +fault localization analysis for the conversion of InceptionV3 from TF to +TFLite. Our approach detected a fault in a common DNN converter tool, which +introduced precision errors in weights, reducing model accuracy. After our +fault localization, we repaired the issue, reducing our conversion error to +zero. + +
+
+ comment: 5 pages, 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ How Good is Google Bard's Visual Understanding? An Empirical Study on + Open Challenges + + +
+ Google's Bard has emerged as a formidable competitor to OpenAI's ChatGPT in +the field of conversational AI. Notably, Bard has recently been updated to +handle visual inputs alongside text prompts during conversations. Given Bard's +impressive track record in handling textual inputs, we explore its capabilities +in understanding and interpreting visual data (images) conditioned by text +questions. This exploration holds the potential to unveil new insights and +challenges for Bard and other forthcoming multi-modal Generative models, +especially in addressing complex computer vision problems that demand accurate +visual and language understanding. Specifically, in this study, we focus on 15 +diverse task scenarios encompassing regular, camouflaged, medical, under-water +and remote sensing data to comprehensively evaluate Bard's performance. Our +primary finding indicates that Bard still struggles in these vision scenarios, +highlighting the significant gap in vision-based understanding that needs to be +bridged in future developments. We expect that this empirical study will prove +valuable in advancing future models, leading to enhanced capabilities in +comprehending and interpreting fine-grained visual data. Our project is +released on https://github.com/htqin/GoogleBard-VisUnderstand + +
+
+
+
+
+ + ♻ ☆ Classifying World War II Era Ciphers with Machine Learning + + +
+ We determine the accuracy with which machine learning and deep learning +techniques can classify selected World War II era ciphers when only ciphertext +is available. The specific ciphers considered are Enigma, M-209, Sigaba, +Purple, and Typex. We experiment with three classic machine learning models, +namely, Support Vector Machines (SVM), $k$-Nearest Neighbors ($k$-NN), and +Random Forest (RF). We also experiment with four deep learning neural +network-based models: Multi-Layer Perceptrons (MLP), Long Short-Term Memory +(LSTM), Extreme Learning Machines (ELM), and Convolutional Neural Networks +(CNN). Each model is trained on features consisting of histograms, digrams, and +raw ciphertext letter sequences. Furthermore, the classification problem is +considered under four distinct scenarios: Fixed plaintext with fixed keys, +random plaintext with fixed keys, fixed plaintext with random keys, and random +plaintext with random keys. Under the most realistic scenario, given 1000 +characters per ciphertext, we are able to distinguish the ciphers with greater +than 97% accuracy. In addition, we consider the accuracy of a subset of the +learning techniques as a function of the length of the ciphertext messages. +Somewhat surprisingly, our classic machine learning models perform at least as +well as our deep learning models. We also find that ciphers that are more +similar in design are somewhat more challenging to distinguish, but not as +difficult as might be expected. + +
+
+
+
+
+ + ♻ ☆ HypLL: The Hyperbolic Learning Library + + +
+ Deep learning in hyperbolic space is quickly gaining traction in the fields +of machine learning, multimedia, and computer vision. Deep networks commonly +operate in Euclidean space, implicitly assuming that data lies on regular +grids. Recent advances have shown that hyperbolic geometry provides a viable +alternative foundation for deep learning, especially when data is hierarchical +in nature and when working with few embedding dimensions. Currently however, no +accessible open-source library exists to build hyperbolic network modules akin +to well-known deep learning libraries. We present HypLL, the Hyperbolic +Learning Library to bring the progress on hyperbolic deep learning together. +HypLL is built on top of PyTorch, with an emphasis in its design for +ease-of-use, in order to attract a broad audience towards this new and +open-ended research direction. The code is available at: +https://github.com/maxvanspengler/hyperbolic_learning_library. + +
+
+ comment: ACM Multimedia Open-Source Software Competition 2023 +
+
+
+
+
+ + ♻ ☆ Context-Aware Composition of Agent Policies by Markov Decision Process + Entity Embeddings and Agent Ensembles + + +
+ Computational agents support humans in many areas of life and are therefore +found in heterogeneous contexts. This means they operate in rapidly changing +environments and can be confronted with huge state and action spaces. In order +to perform services and carry out activities in a goal-oriented manner, agents +require prior knowledge and therefore have to develop and pursue +context-dependent policies. However, prescribing policies in advance is limited +and inflexible, especially in dynamically changing environments. Moreover, the +context of an agent determines its choice of actions. Since the environments +can be stochastic and complex in terms of the number of states and feasible +actions, activities are usually modelled in a simplified way by Markov decision +processes so that, e.g., agents with reinforcement learning are able to learn +policies, that help to capture the context and act accordingly to optimally +perform activities. However, training policies for all possible contexts using +reinforcement learning is time-consuming. A requirement and challenge for +agents is to learn strategies quickly and respond immediately in cross-context +environments and applications, e.g., the Internet, service robotics, +cyber-physical systems. In this work, we propose a novel simulation-based +approach that enables a) the representation of heterogeneous contexts through +knowledge graphs and entity embeddings and b) the context-aware composition of +policies on demand by ensembles of agents running in parallel. The evaluation +we conducted with the "Virtual Home" dataset indicates that agents with a need +to switch seamlessly between different contexts, can request on-demand composed +policies that lead to the successful completion of context-appropriate +activities without having to learn these policies in lengthy training steps and +episodes, in contrast to agents that use reinforcement learning. + +
+
+ comment: 30 pages, 11 figures, 9 tables, 3 listings, Re-submitted to Semantic + Web Journal, Currently, under review +
+
+
+
+
+ + ♻ ☆ Food Classification using Joint Representation of Visual and Textual + Data + + +
+ Food classification is an important task in health care. In this work, we +propose a multimodal classification framework that uses the modified version of +EfficientNet with the Mish activation function for image classification, and +the traditional BERT transformer-based network is used for text classification. +The proposed network and the other state-of-the-art methods are evaluated on a +large open-source dataset, UPMC Food-101. The experimental results show that +the proposed network outperforms the other methods, a significant difference of +11.57% and 6.34% in accuracy is observed for image and text classification, +respectively, when compared with the second-best performing method. We also +compared the performance in terms of accuracy, precision, and recall for text +classification using both machine learning and deep learning-based models. The +comparative analysis from the prediction results of both images and text +demonstrated the efficiency and robustness of the proposed approach. + +
+
+ comment: Updated results and discussions to be posted and some sections needed + to be expanded +
+
+
+
+
+ + ♻ ☆ Solving AC Power Flow with Graph Neural Networks under Realistic + Constraints + + +
+ In this paper, we propose a graph neural network architecture to solve the AC +power flow problem under realistic constraints. To ensure a safe and resilient +operation of distribution grids, AC power flow calculations are the means of +choice to determine grid operating limits or analyze grid asset utilization in +planning procedures. In our approach, we demonstrate the development of a +framework that uses graph neural networks to learn the physical constraints of +the power flow. We present our model architecture on which we perform +unsupervised training to learn a general solution of the AC power flow +formulation independent of the specific topologies and supply tasks used for +training. Finally, we demonstrate, validate and discuss our results on medium +voltage benchmark grids. In our approach, we focus on the physical and +topological properties of distribution grids to provide scalable solutions for +real grid topologies. Therefore, we take a data-driven approach, using large +and diverse data sets consisting of realistic grid topologies, for the +unsupervised training of the AC power flow graph neural network architecture +and compare the results to a prior neural architecture and the Newton-Raphson +method. Our approach shows a high increase in computation time and good +accuracy compared to state-of-the-art solvers. It also out-performs that neural +solver for power flow in terms of accuracy. + +
+
+
+
+
+ + ♻ ☆ Implicit neural representation for change detection + + +
+ Identifying changes in a pair of 3D aerial LiDAR point clouds, obtained +during two distinct time periods over the same geographic region presents a +significant challenge due to the disparities in spatial coverage and the +presence of noise in the acquisition system. The most commonly used approaches +to detecting changes in point clouds are based on supervised methods which +necessitate extensive labelled data often unavailable in real-world +applications. To address these issues, we propose an unsupervised approach that +comprises two components: Implicit Neural Representation (INR) for continuous +shape reconstruction and a Gaussian Mixture Model for categorising changes. INR +offers a grid-agnostic representation for encoding bi-temporal point clouds, +with unmatched spatial support that can be regularised to enhance +high-frequency details and reduce noise. The reconstructions at each timestamp +are compared at arbitrary spatial scales, leading to a significant increase in +detection capabilities. We apply our method to a benchmark dataset comprising +simulated LiDAR point clouds for urban sprawling. This dataset encompasses +diverse challenging scenarios, varying in resolutions, input modalities and +noise levels. This enables a comprehensive multi-scenario evaluation, comparing +our method with the current state-of-the-art approach. We outperform the +previous methods by a margin of 10% in the intersection over union metric. In +addition, we put our techniques to practical use by applying them in a +real-world scenario to identify instances of illicit excavation of +archaeological sites and validate our results by comparing them with findings +from field experts. + +
+
+ comment: Main article is 10 pages + 6 pages of supplementary. Conference style + paper +
+
+
+
+
+ + ♻ ☆ E-MCTS: Deep Exploration in Model-Based Reinforcement Learning by + Planning with Epistemic Uncertainty NeurIPS 2023 + + +
+ One of the most well-studied and highly performing planning approaches used +in Model-Based Reinforcement Learning (MBRL) is Monte-Carlo Tree Search (MCTS). +Key challenges of MCTS-based MBRL methods remain dedicated deep exploration and +reliability in the face of the unknown, and both challenges can be alleviated +through principled epistemic uncertainty estimation in the predictions of MCTS. +We present two main contributions: First, we develop methodology to propagate +epistemic uncertainty in MCTS, enabling agents to estimate the epistemic +uncertainty in their predictions. Second, we utilize the propagated uncertainty +for a novel deep exploration algorithm by explicitly planning to explore. We +incorporate our approach into variations of MCTS-based MBRL approaches with +learned and provided dynamics models, and empirically show deep exploration +through successful epistemic uncertainty estimation achieved by our approach. +We compare to a non-planning-based deep-exploration baseline, and demonstrate +that planning with epistemic MCTS significantly outperforms non-planning based +exploration in the investigated deep exploration benchmark. + +
+
+ comment: Submitted to NeurIPS 2023, accepted to EWRL 2023 +
+
+
+
+
+ + ♻ ☆ What do neural networks learn in image classification? A frequency + shortcut perspective ICCV2023 + + +
+ Frequency analysis is useful for understanding the mechanisms of +representation learning in neural networks (NNs). Most research in this area +focuses on the learning dynamics of NNs for regression tasks, while little for +classification. This study empirically investigates the latter and expands the +understanding of frequency shortcuts. First, we perform experiments on +synthetic datasets, designed to have a bias in different frequency bands. Our +results demonstrate that NNs tend to find simple solutions for classification, +and what they learn first during training depends on the most distinctive +frequency characteristics, which can be either low- or high-frequencies. +Second, we confirm this phenomenon on natural images. We propose a metric to +measure class-wise frequency characteristics and a method to identify frequency +shortcuts. The results show that frequency shortcuts can be texture-based or +shape-based, depending on what best simplifies the objective. Third, we +validate the transferability of frequency shortcuts on out-of-distribution +(OOD) test sets. Our results suggest that frequency shortcuts can be +transferred across datasets and cannot be fully avoided by larger model +capacity and data augmentation. We recommend that future research should focus +on effective training schemes mitigating frequency shortcut learning. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Diffiner: A Versatile Diffusion-based Generative Refiner for Speech + Enhancement + + +
+ Although deep neural network (DNN)-based speech enhancement (SE) methods +outperform the previous non-DNN-based ones, they often degrade the perceptual +quality of generated outputs. To tackle this problem, we introduce a DNN-based +generative refiner, Diffiner, aiming to improve perceptual speech quality +pre-processed by an SE method. We train a diffusion-based generative model by +utilizing a dataset consisting of clean speech only. Then, our refiner +effectively mixes clean parts newly generated via denoising diffusion +restoration into the degraded and distorted parts caused by a preceding SE +method, resulting in refined speech. Once our refiner is trained on a set of +clean speech, it can be applied to various SE methods without additional +training specialized for each SE module. Therefore, our refiner can be a +versatile post-processing module w.r.t. SE methods and has high potential in +terms of modularity. Experimental results show that our method improved +perceptual speech quality regardless of the preceding SE methods used. + +
+
+ comment: Accepted by Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Deep neural networks on diffeomorphism groups for optimal shape + reparameterization + + +
+ One of the fundamental problems in shape analysis is to align curves or +surfaces before computing geodesic distances between their shapes. Finding the +optimal reparametrization realizing this alignment is a computationally +demanding task, typically done by solving an optimization problem on the +diffeomorphism group. In this paper, we propose an algorithm for constructing +approximations of orientation-preserving diffeomorphisms by composition of +elementary diffeomorphisms. The algorithm is implemented using PyTorch, and is +applicable for both unparametrized curves and surfaces. Moreover, we show +universal approximation properties for the constructed architectures, and +obtain bounds for the Lipschitz constants of the resulting diffeomorphisms. + +
+
+ comment: 36 pages, 11 figures. Accepted by BIT Numerical Mathematics, not yet + published +
+
+
+
+
+ + ♻ ☆ SignReLU neural network and its approximation ability + + +
+ Deep neural networks (DNNs) have garnered significant attention in various +fields of science and technology in recent years. Activation functions define +how neurons in DNNs process incoming signals for them. They are essential for +learning non-linear transformations and for performing diverse computations +among successive neuron layers. In the last few years, researchers have +investigated the approximation ability of DNNs to explain their power and +success. In this paper, we explore the approximation ability of DNNs using a +different activation function, called SignReLU. Our theoretical results +demonstrate that SignReLU networks outperform rational and ReLU networks in +terms of approximation performance. Numerical experiments are conducted +comparing SignReLU with the existing activations such as ReLU, Leaky ReLU, and +ELU, which illustrate the competitive practical performance of SignReLU. + +
+
+
+
+
+ + ♻ ☆ G-Signatures: Global Graph Propagation With Randomized Signatures + + +
+ Graph neural networks (GNNs) have evolved into one of the most popular deep +learning architectures. However, GNNs suffer from over-smoothing node +information and, therefore, struggle to solve tasks where global graph +properties are relevant. We introduce G-Signatures, a novel graph learning +method that enables global graph propagation via randomized signatures. +G-Signatures use a new graph conversion concept to embed graph structured +information which can be interpreted as paths in latent space. We further +introduce the idea of latent space path mapping. This allows us to iteratively +traverse latent space paths, and, thus globally process information. +G-Signatures excel at extracting and processing global graph properties, and +effectively scale to large graph problems. Empirically, we confirm the +advantages of G-Signatures at several classification and regression tasks. + +
+
+ comment: 7 pages (+ appendix); 4 figures +
+
+
+
+
+ + ♻ ☆ Alien Coding + + +
+ We introduce a self-learning algorithm for synthesizing programs for OEIS +sequences. The algorithm starts from scratch initially generating programs at +random. Then it runs many iterations of a self-learning loop that interleaves +(i) training neural machine translation to learn the correspondence between +sequences and the programs discovered so far, and (ii) proposing many new +programs for each OEIS sequence by the trained neural machine translator. The +algorithm discovers on its own programs for more than 78000 OEIS sequences, +sometimes developing unusual programming methods. We analyze its behavior and +the invented programs in several experiments. + +
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their 'exposure bias' problem, described as the input mismatch between training +and sampling, lacks in-depth exploration. In this paper, we systematically +investigate the exposure bias problem in diffusion models by first analytically +modelling the sampling distribution, based on which we then attribute the +prediction error at each sampling step as the root cause of the exposure bias +issue. Furthermore, we discuss potential solutions to this issue and propose an +intuitive metric for it. Along with the elucidation of exposure bias, we +propose a simple, yet effective, training-free method called Epsilon Scaling to +alleviate the exposure bias. We show that Epsilon Scaling explicitly moves the +sampling trajectory closer to the vector field learned in the training phase by +scaling down the network output (Epsilon), mitigating the input mismatch +between training and sampling. Experiments on various diffusion frameworks +(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and +deterministic vs. stochastic sampling verify the effectiveness of our method. + +
+
+ comment: 7 pages, code available soon +
+
+
+
+
+ + ♻ ☆ Case-Aware Adversarial Training + + +
+ The neural network (NN) becomes one of the most heated type of models in +various signal processing applications. However, NNs are extremely vulnerable +to adversarial examples (AEs). To defend AEs, adversarial training (AT) is +believed to be the most effective method while due to the intensive +computation, AT is limited to be applied in most applications. In this paper, +to resolve the problem, we design a generic and efficient AT improvement +scheme, namely case-aware adversarial training (CAT). Specifically, the +intuition stems from the fact that a very limited part of informative samples +can contribute to most of model performance. Alternatively, if only the most +informative AEs are used in AT, we can lower the computation complexity of AT +significantly as maintaining the defense effect. To achieve this, CAT achieves +two breakthroughs. First, a method to estimate the information degree of +adversarial examples is proposed for AE filtering. Second, to further enrich +the information that the NN can obtain from AEs, CAT involves a weight +estimation and class-level balancing based sampling strategy to increase the +diversity of AT at each iteration. Extensive experiments show that CAT is +faster than vanilla AT by up to 3x while achieving competitive defense effect. + +
+
+
+
+
+ + ♻ ☆ Is Complexity Required for Neural Network Pruning? A Case Study on + Global Magnitude Pruning + + +
+ Pruning neural networks has become popular in the last decade when it was +shown that a large number of weights can be safely removed from modern neural +networks without compromising accuracy. Numerous pruning methods have been +proposed since then, each claiming to be better than the previous. Many +state-of-the-art (SOTA) techniques today rely on complex pruning methodologies +utilizing importance scores, getting feedback through back-propagation or +having heuristics-based pruning rules amongst others. In this work, we question +whether this pattern of introducing complexity is really necessary to achieve +better pruning results. We benchmark these SOTA techniques against a naive +pruning baseline, namely, Global Magnitude Pruning (Global MP). Global MP ranks +weights in order of their magnitudes and prunes the smallest ones. Hence, in +its vanilla form, it is one of the simplest pruning techniques. Surprisingly, +we find that vanilla Global MP outperforms all the other SOTA techniques and +achieves a new SOTA result. It also achieves promising performance on FLOPs +sparsification, which we find is enhanced, when pruning is conducted in a +gradual fashion. We also find that Global MP is generalizable across tasks, +datasets, and models with superior performance. Moreover, a common issue that +many pruning algorithms run into at high sparsity rates, namely, +layer-collapse, can be easily fixed in Global MP by setting a minimum threshold +of weights to be retained in each layer. Lastly, unlike many other SOTA +techniques, Global MP does not require any additional algorithm specific +hyper-parameters and is very straightforward to tune and implement. We showcase +our findings on various models (WRN-28-8, ResNet-32, ResNet-50, MobileNet-V1 +and FastGRNN) and multiple datasets (CIFAR-10, ImageNet and HAR-2). Code is +available at https://github.com/manasgupta-1/GlobalMP. + +
+
+
+
+
+ + ♻ ☆ Geometric Algebra Transformers + + +
+ Problems involving geometric data arise in physics, chemistry, robotics, +computer vision, and many other fields. Such data can take numerous forms, such +as points, direction vectors, translations, or rotations, but to date there is +no single architecture that can be applied to such a wide variety of geometric +types while respecting their symmetries. In this paper we introduce the +Geometric Algebra Transformer (GATr), a general-purpose architecture for +geometric data. GATr represents inputs, outputs, and hidden states in the +projective geometric (or Clifford) algebra, which offers an efficient +16-dimensional vector-space representation of common geometric objects as well +as operators acting on them. GATr is equivariant with respect to E(3), the +symmetry group of 3D Euclidean space. As a Transformer, GATr is versatile, +efficient, and scalable. We demonstrate GATr in problems from n-body modeling +to wall-shear-stress estimation on large arterial meshes to robotic motion +planning. GATr consistently outperforms both non-geometric and equivariant +baselines in terms of error, data efficiency, and scalability. + +
+
+ comment: v2: more experiments, more baselines +
+
+
+
+
+ + ♻ ☆ Is Bio-Inspired Learning Better than Backprop? Benchmarking Bio Learning + vs. Backprop + + +
+ Bio-inspired learning has been gaining popularity recently given that +Backpropagation (BP) is not considered biologically plausible. Many algorithms +have been proposed in the literature which are all more biologically plausible +than BP. However, apart from overcoming the biological implausibility of BP, a +strong motivation for using Bio-inspired algorithms remains lacking. In this +study, we undertake a holistic comparison of BP vs. multiple Bio-inspired +algorithms to answer the question of whether Bio-learning offers additional +benefits over BP. We test Bio-algorithms under different design choices such as +access to only partial training data, resource constraints in terms of the +number of training epochs, sparsification of the neural network parameters and +addition of noise to input samples. Through these experiments, we notably find +two key advantages of Bio-algorithms over BP. Firstly, Bio-algorithms perform +much better than BP when the entire training dataset is not supplied. Four of +the five Bio-algorithms tested outperform BP by upto 5% accuracy when only 20% +of the training dataset is available. Secondly, even when the full dataset is +available, Bio-algorithms learn much quicker and converge to a stable accuracy +in far lesser training epochs than BP. Hebbian learning, specifically, is able +to learn in just 5 epochs compared to around 100 epochs required by BP. These +insights present practical reasons for utilising Bio-learning beyond just their +biological plausibility and also point towards interesting new directions for +future work on Bio-learning. + +
+
+
+
+
+ + ♻ ☆ Control Theoretic Analysis of Temporal Difference Learning + + +
+ The goal of this manuscript is to conduct a controltheoretic analysis of +Temporal Difference (TD) learning algorithms. TD-learning serves as a +cornerstone in the realm of reinforcement learning, offering a methodology for +approximating the value function associated with a given policy in a Markov +Decision Process. Despite several existing works that have contributed to the +theoretical understanding of TD-learning, it is only in recent years that +researchers have been able to establish concrete guarantees on its statistical +efficiency. In this paper, we introduce a finite-time, control-theoretic +framework for analyzing TD-learning, leveraging established concepts from the +field of linear systems control. Consequently, this paper provides additional +insights into the mechanics of TD learning and the broader landscape of +reinforcement learning, all while employing straightforward analytical tools +derived from control theory. + +
+
+
+
+
+ + ♻ ☆ BCGGAN: Ballistocardiogram artifact removal in simultaneous EEG-fMRI + using generative adversarial network + + +
+ Due to its advantages of high temporal and spatial resolution, the technology +of simultaneous electroencephalogram-functional magnetic resonance imaging +(EEG-fMRI) acquisition and analysis has attracted much attention, and has been +widely used in various research fields of brain science. However, during the +fMRI of the brain, ballistocardiogram (BCG) artifacts can seriously contaminate +the EEG. As an unpaired problem, BCG artifact removal now remains a +considerable challenge. Aiming to provide a solution, this paper proposed a +novel modular generative adversarial network (GAN) and corresponding training +strategy to improve the network performance by optimizing the parameters of +each module. In this manner, we hope to improve the local representation +ability of the network model, thereby improving its overall performance and +obtaining a reliable generator for BCG artifact removal. Moreover, the proposed +method does not rely on additional reference signal or complex hardware +equipment. Experimental results show that, compared with multiple methods, the +technique presented in this paper can remove the BCG artifact more effectively +while retaining essential EEG information. + +
+
+
+
+
+ + ♻ ☆ Estimating 3D Dental Structures using Simulated Panoramic Radiographs + and Neural Ray Tracing + + +
+ Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality +for dental examination. However, PX only provides a flattened 2D image, lacking +in a 3D view of the oral structure. In this paper, we propose a framework to +estimate 3D oral structures from real-world PX. Our framework tackles full 3D +reconstruction for varying subjects (patients) where each reconstruction is +based only on a single panoramic image. We create an intermediate +representation called simulated PX (SimPX) from 3D Cone-beam computed +tomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and +rotational principles of PX imaging. SimPX aims at not only truthfully +simulating PX, but also facilitates the reverting process back to 3D data. We +propose a novel neural model based on ray tracing which exploits both global +and local input features to convert SimPX to 3D output. At inference, a real PX +image is translated to a SimPX-style image with semantic regularization, and +the translated image is processed by generation module to produce high-quality +outputs. Experiments show that our method outperforms prior state-of-the-art in +reconstruction tasks both quantitatively and qualitatively. Unlike prior +methods, Our method does not require any prior information such as the shape of +dental arches, nor the matched PX-CBCT dataset for training, which is difficult +to obtain in clinical practice. + +
+
+ comment: 20 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Pre-trained transformer for adversarial purification + + +
+ With more and more deep neural networks being deployed as various daily +services, their reliability is essential. It's frightening that deep neural +networks are vulnerable and sensitive to adversarial attacks, the most common +one of which for the services is evasion-based. Recent works usually strengthen +the robustness by adversarial training or leveraging the knowledge of an amount +of clean data. However, in practical terms, retraining and redeploying the +model need a large computational budget, leading to heavy losses to the online +service. In addition, when adversarial examples of a certain attack are +detected, only limited adversarial examples are available for the service +provider, while much clean data may not be accessible. Given the mentioned +problems, we propose a new scenario, RaPiD (Rapid Plug-in Defender), which is +to rapidly defend against a certain attack for the frozen original service +model with limitations of few clean and adversarial examples. Motivated by the +generalization and the universal computation ability of pre-trained transformer +models, we come up with a new defender method, CeTaD, which stands for +Considering Pre-trained Transformers as Defenders. In particular, we evaluate +the effectiveness and the transferability of CeTaD in the case of one-shot +adversarial examples and explore the impact of different parts of CeTaD as well +as training data conditions. CeTaD is flexible, able to be embedded into an +arbitrary differentiable model, and suitable for various types of attacks. + +
+
+
+
+
+ + ♻ ☆ Assessing Hidden Risks of LLMs: An Empirical Study on Robustness, + Consistency, and Credibility + + +
+ The recent popularity of large language models (LLMs) has brought a +significant impact to boundless fields, particularly through their open-ended +ecosystem such as the APIs, open-sourced models, and plugins. However, with +their widespread deployment, there is a general lack of research that +thoroughly discusses and analyzes the potential risks concealed. In that case, +we intend to conduct a preliminary but pioneering study covering the +robustness, consistency, and credibility of LLMs systems. With most of the +related literature in the era of LLM uncharted, we propose an automated +workflow that copes with an upscaled number of queries/responses. Overall, we +conduct over a million queries to the mainstream LLMs including ChatGPT, LLaMA, +and OPT. Core to our workflow consists of a data primitive, followed by an +automated interpreter that evaluates these LLMs under different adversarial +metrical systems. As a result, we draw several, and perhaps unfortunate, +conclusions that are quite uncommon from this trendy community. Briefly, they +are: (i)-the minor but inevitable error occurrence in the user-generated query +input may, by chance, cause the LLM to respond unexpectedly; (ii)-LLMs possess +poor consistency when processing semantically similar query input. In addition, +as a side finding, we find that ChatGPT is still capable to yield the correct +answer even when the input is polluted at an extreme level. While this +phenomenon demonstrates the powerful memorization of the LLMs, it raises +serious concerns about using such data for LLM-involved evaluation in academic +development. To deal with it, we propose a novel index associated with a +dataset that roughly decides the feasibility of using such data for +LLM-involved evaluation. Extensive empirical studies are tagged to support the +aforementioned claims. + +
+
+
+
+
+ + ♻ ☆ Automatically Correcting Large Language Models: Surveying the landscape + of diverse self-correction strategies + + +
+ Large language models (LLMs) have demonstrated remarkable performance across +a wide array of NLP tasks. However, their efficacy is undermined by undesired +and inconsistent behaviors, including hallucination, unfaithful reasoning, and +toxic content. A promising approach to rectify these flaws is self-correction, +where the LLM itself is prompted or guided to fix problems in its own output. +Techniques leveraging automated feedback -- either produced by the LLM itself +or some external system -- are of particular interest as they are a promising +way to make LLM-based solutions more practical and deployable with minimal +human feedback. This paper presents a comprehensive review of this emerging +class of techniques. We analyze and taxonomize a wide array of recent work +utilizing these strategies, including training-time, generation-time, and +post-hoc correction. We also summarize the major applications of this strategy +and conclude by discussing future directions and challenges. + +
+
+ comment: Work in Progress. Version 2 +
+
+
+
+
+ + ♻ ☆ WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminant + Analysis ICCV 2023 + + +
+ Deep neural networks are susceptible to generating overconfident yet +erroneous predictions when presented with data beyond known concepts. This +challenge underscores the importance of detecting out-of-distribution (OOD) +samples in the open world. In this work, we propose a novel feature-space OOD +detection score based on class-specific and class-agnostic information. +Specifically, the approach utilizes Whitened Linear Discriminant Analysis to +project features into two subspaces - the discriminative and residual subspaces +- for which the in-distribution (ID) classes are maximally separated and +closely clustered, respectively. The OOD score is then determined by combining +the deviation from the input data to the ID pattern in both subspaces. The +efficacy of our method, named WDiscOOD, is verified on the large-scale +ImageNet-1k benchmark, with six OOD datasets that cover a variety of +distribution shifts. WDiscOOD demonstrates superior performance on deep +classifiers with diverse backbone architectures, including CNN and vision +transformer. Furthermore, we also show that WDiscOOD more effectively detects +novel concepts in representation spaces trained with contrastive objectives, +including supervised contrastive loss and multi-modality contrastive loss. + +
+
+ comment: Accepted by ICCV 2023. Code is available at: + https://github.com/ivalab/WDiscOOD.git +
+
+
+
+
+ + ♻ ☆ Dataflow Analysis-Inspired Deep Learning for Efficient Vulnerability + Detection ICSE 2024 + + +
+ Deep learning-based vulnerability detection has shown great performance and, +in some studies, outperformed static analysis tools. However, the +highest-performing approaches use token-based transformer models, which are not +the most efficient to capture code semantics required for vulnerability +detection. Classical program analysis techniques such as dataflow analysis can +detect many types of bugs based on their root causes. In this paper, we propose +to combine such causal-based vulnerability detection algorithms with deep +learning, aiming to achieve more efficient and effective vulnerability +detection. Specifically, we designed DeepDFA, a dataflow analysis-inspired +graph learning framework and an embedding technique that enables graph learning +to simulate dataflow computation. We show that DeepDFA is both performant and +efficient. DeepDFA outperformed all non-transformer baselines. It was trained +in 9 minutes, 75x faster than the highest-performing baseline model. When using +only 50+ vulnerable and several hundreds of total examples as training data, +the model retained the same performance as 100% of the dataset. DeepDFA also +generalized to real-world vulnerabilities in DBGBench; it detected 8.7 out of +17 vulnerabilities on average across folds and was able to distinguish between +patched and buggy versions, while the highest-performing baseline models did +not detect any vulnerabilities. By combining DeepDFA with a large language +model, we surpassed the state-of-the-art vulnerability detection performance on +the Big-Vul dataset with 96.46 F1 score, 97.82 precision, and 95.14 recall. Our +replication package is located at https://figshare.com/s/e7953b4d345b00990d17. + +
+
+ comment: 11 pages, 9 figures. Accepted as a conference paper at ICSE 2024 +
+
+
+
+
+ + ♻ ☆ Toward Generalizable Machine Learning Models in Speech, Language, and + Hearing Sciences: Sample Size Estimation and Reducing Overfitting + + +
+ This study's first purpose is to provide quantitative evidence that would +incentivize researchers to instead use the more robust method of nested +cross-validation. The second purpose is to present methods and MATLAB codes for +doing power analysis for ML-based analysis during the design of a study. Monte +Carlo simulations were used to quantify the interactions between the employed +cross-validation method, the discriminative power of features, the +dimensionality of the feature space, and the dimensionality of the model. Four +different cross-validations (single holdout, 10-fold, train-validation-test, +and nested 10-fold) were compared based on the statistical power and +statistical confidence of the ML models. Distributions of the null and +alternative hypotheses were used to determine the minimum required sample size +for obtaining a statistically significant outcome ({\alpha}=0.05, +1-\b{eta}=0.8). Statistical confidence of the model was defined as the +probability of correct features being selected and hence being included in the +final model. Our analysis showed that the model generated based on the single +holdout method had very low statistical power and statistical confidence and +that it significantly overestimated the accuracy. Conversely, the nested +10-fold cross-validation resulted in the highest statistical confidence and the +highest statistical power, while providing an unbiased estimate of the +accuracy. The required sample size with a single holdout could be 50% higher +than what would be needed if nested cross-validation were used. Confidence in +the model based on nested cross-validation was as much as four times higher +than the confidence in the single holdout-based model. A computational model, +MATLAB codes, and lookup tables are provided to assist researchers with +estimating the sample size during the design of their future studies. + +
+
+ comment: Under review at JSLHR +
+
+
+
+
+ + ♻ ☆ Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals + + +
+ We consider the problem of sampling from a distribution governed by a +potential function. This work proposes an explicit score-based MCMC method that +is deterministic, resulting in a deterministic evolution for particles rather +than a stochastic differential equation evolution. The score term is given in +closed form by a regularized Wasserstein proximal, using a kernel convolution +that is approximated by sampling. We demonstrate fast convergence on various +problems and show improved dimensional dependence of mixing time bounds for the +case of Gaussian distributions compared to the unadjusted Langevin algorithm +(ULA) and the Metropolis-adjusted Langevin algorithm (MALA). We additionally +derive closed form expressions for the distributions at each iterate for +quadratic potential functions, characterizing the variance reduction. Empirical +results demonstrate that the particles behave in an organized manner, lying on +level set contours of the potential. Moreover, the posterior mean estimator of +the proposed method is shown to be closer to the maximum a-posteriori estimator +compared to ULA and MALA, in the context of Bayesian logistic regression. + +
+
+
+
+
+ + ♻ ☆ Evolutionary Reinforcement Learning: A Survey + + +
+ Reinforcement learning (RL) is a machine learning approach that trains agents +to maximize cumulative rewards through interactions with environments. The +integration of RL with deep learning has recently resulted in impressive +achievements in a wide range of challenging tasks, including board games, +arcade games, and robot control. Despite these successes, there remain several +crucial challenges, including brittle convergence properties caused by +sensitive hyperparameters, difficulties in temporal credit assignment with long +time horizons and sparse rewards, a lack of diverse exploration, especially in +continuous search space scenarios, difficulties in credit assignment in +multi-agent reinforcement learning, and conflicting objectives for rewards. +Evolutionary computation (EC), which maintains a population of learning agents, +has demonstrated promising performance in addressing these limitations. This +article presents a comprehensive survey of state-of-the-art methods for +integrating EC into RL, referred to as evolutionary reinforcement learning +(EvoRL). We categorize EvoRL methods according to key research fields in RL, +including hyperparameter optimization, policy search, exploration, reward +shaping, meta-RL, and multi-objective RL. We then discuss future research +directions in terms of efficient methods, benchmarks, and scalable platforms. +This survey serves as a resource for researchers and practitioners interested +in the field of EvoRL, highlighting the important challenges and opportunities +for future research. With the help of this survey, researchers and +practitioners can develop more efficient methods and tailored benchmarks for +EvoRL, further advancing this promising cross-disciplinary research field. + +
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can +arise from extensive unsupervised training data. Such biases can produce +suboptimal samples, skewed outcomes, and unfairness, with potentially serious +consequences. Consequently, aligning these models with human ethics and +preferences is an essential step toward ensuring their responsible and +effective deployment in real-world applications. Prior research has primarily +employed Reinforcement Learning from Human Feedback (RLHF) to address this +problem, where generative models are fine-tuned with RL algorithms guided by a +human-feedback-informed reward model. However, the inefficiencies and +instabilities associated with RL algorithms frequently present substantial +obstacles to the successful alignment, necessitating the development of a more +robust and streamlined approach. To this end, we introduce a new framework, +Reward rAnked FineTuning (RAFT), designed to align generative models +effectively. Utilizing a reward model and a sufficient number of samples, our +approach selects the high-quality samples, discarding those that exhibit +undesired behavior, and subsequently enhancing the model by fine-tuning on +these filtered samples. Our studies show that RAFT can effectively improve the +model performance in both reward learning and other automated metrics in both +large language models and diffusion models. + +
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Coagent Networks Revisited + + +
+ Coagent networks formalize the concept of arbitrary networks of stochastic +agents that collaborate to take actions in a reinforcement learning +environment. Prominent examples of coagent networks in action include +approaches to hierarchical reinforcement learning (HRL), such as those using +options, which attempt to address the exploration exploitation trade-off by +introducing abstract actions at different levels by sequencing multiple +stochastic networks within the HRL agents. We first provide a unifying +perspective on the many diverse examples that fall under coagent networks. We +do so by formalizing the rules of execution in a coagent network, enabled by +the novel and intuitive idea of execution paths in a coagent network. Motivated +by parameter sharing in the hierarchical option-critic architecture, we revisit +the coagent network theory and achieve a much shorter proof of the policy +gradient theorem using our idea of execution paths, without any assumption on +how parameters are shared among coagents. We then generalize our setting and +proof to include the scenario where coagents act asynchronously. This new +perspective and theorem also lead to more mathematically accurate and +performant algorithms than those in the existing literature. Lastly, by running +nonstationary RL experiments, we survey the performance and properties of +different generalizations of option-critic models. + +
+
+ comment: Reformatted paper significantly and clarified results on the + asynchronous case +
+
+
+
+
+ + ♻ ☆ Towards Generalist Robots: A Promising Paradigm via Generative + Simulation + + +
+ This document serves as a position paper that outlines the authors' vision +for a potential pathway towards generalist robots. The purpose of this document +is to share the excitement of the authors with the community and highlight a +promising research direction in robotics and AI. The authors believe the +proposed paradigm is a feasible path towards accomplishing the long-standing +goal of robotics research: deploying robots, or embodied AI agents more +broadly, in various non-factory real-world settings to perform diverse tasks. +This document presents a specific idea for mining knowledge in the latest +large-scale foundation models for robotics research. Instead of directly using +or adapting these models to produce low-level policies and actions, it +advocates for a fully automated generative pipeline (termed as generative +simulation), which uses these models to generate diversified tasks, scenes and +training supervisions at scale, thereby scaling up low-level skill learning and +ultimately leading to a foundation model for robotics that empowers generalist +robots. The authors are actively pursuing this direction, but in the meantime, +they recognize that the ambitious goal of building generalist robots with +large-scale policy training demands significant resources such as computing +power and hardware, and research groups in academia alone may face severe +resource constraints in implementing the entire vision. Therefore, the authors +believe sharing their thoughts at this early stage could foster discussions, +attract interest towards the proposed pathway and related topics from industry +groups, and potentially spur significant technical advancements in the field. + +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Prompting Vision Language Model with Knowledge from Large Language Model + for Knowledge-Based VQA + + +
+ Knowledge-based visual question answering is a very challenging and widely +concerned task. Previous methods adopts the implicit knowledge in large +language models (LLM) to achieve excellent results, but we argue that existing +methods may suffer from biasing understanding of the image and insufficient +knowledge to solve the problem. In this paper, we propose PROOFREAD -PROmpting +vision language model with knOwledge From laRgE lAnguage moDel, a novel, +lightweight and efficient kowledge-based VQA framework, which make the vision +language model and the large language model cooperate to give full play to +their respective strengths and bootstrap each other. In detail, our proposed +method uses LLM to obtain knowledge explicitly, uses the vision language model +which can see the image to get the knowledge answer, and introduces knowledge +perceiver to filter out knowledge that is harmful for getting the correct final +answer. Experimental results on two datasets prove the effectiveness of our +approach. Our method outperforms all state-of-the-art methods on the A-OKVQA +dataset in two settings and also achieves relatively good performance on the +OKVQA dataset. + +
+
+
+
+
+ + ♻ ☆ Priority-Centric Human Motion Generation in Discrete Latent Space ICCV2023 + + +
+ Text-to-motion generation is a formidable task, aiming to produce human +motions that align with the input text while also adhering to human +capabilities and physical laws. While there have been advancements in diffusion +models, their application in discrete spaces remains underexplored. Current +methods often overlook the varying significance of different motions, treating +them uniformly. It is essential to recognize that not all motions hold the same +relevance to a particular textual description. Some motions, being more salient +and informative, should be given precedence during generation. In response, we +introduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which +utilizes a Transformer-based VQ-VAE to derive a concise, discrete motion +representation, incorporating a global self-attention mechanism and a +regularization term to counteract code collapse. We also present a motion +discrete diffusion model that employs an innovative noise schedule, determined +by the significance of each motion token within the entire motion sequence. +This approach retains the most salient motions during the reverse diffusion +process, leading to more semantically rich and varied motions. Additionally, we +formulate two strategies to gauge the importance of motion tokens, drawing from +both textual and visual indicators. Comprehensive experiments on the HumanML3D +and KIT-ML datasets confirm that our model surpasses existing techniques in +fidelity and diversity, particularly for intricate textual descriptions. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 56 + +
+
+
+ + ☆ ParaGuide: Guided Diffusion Paraphrasers for Plug-and-Play Textual Style + Transfer + + +
+ Textual style transfer is the task of transforming stylistic properties of +text while preserving meaning. Target "styles" can be defined in numerous ways, +ranging from single attributes (e.g, formality) to authorship (e.g, +Shakespeare). Previous unsupervised style-transfer approaches generally rely on +significant amounts of labeled data for only a fixed set of styles or require +large language models. In contrast, we introduce a novel diffusion-based +framework for general-purpose style transfer that can be flexibly adapted to +arbitrary target styles at inference time. Our parameter-efficient approach, +ParaGuide, leverages paraphrase-conditioned diffusion models alongside +gradient-based guidance from both off-the-shelf classifiers and strong existing +style embedders to transform the style of text while preserving semantic +information. We validate the method on the Enron Email Corpus, with both human +and automatic evaluations, and find that it outperforms strong baselines on +formality, sentiment, and even authorship style transfer. + +
+
+
+
+
+ + ☆ When Do Program-of-Thoughts Work for Reasoning? + + +
+ The reasoning capabilities of Large Language Models (LLMs) play a pivotal +role in the realm of embodied artificial intelligence. Although there are +effective methods like program-of-thought prompting for LLMs which uses +programming language to tackle complex reasoning tasks, the specific impact of +code data on the improvement of reasoning capabilities remains under-explored. +To address this gap, we propose complexity-impacted reasoning score (CIRS), +which combines structural and logical attributes, to measure the correlation +between code and reasoning abilities. Specifically, we use the abstract syntax +tree to encode the structural information and calculate logical complexity by +considering the difficulty and the cyclomatic complexity. Through an empirical +analysis, we find not all code data of complexity can be learned or understood +by LLMs. Optimal level of complexity is critical to the improvement of +reasoning abilities by program-aided prompting. Then we design an +auto-synthesizing and stratifying algorithm, and apply it to instruction +generation for mathematical reasoning and code data filtering for code +generation tasks. Extensive results demonstrates the effectiveness of our +proposed approach. Code will be integrated into the EasyInstruct framework at +https://github.com/zjunlp/EasyInstruct. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Vulgar Remarks Detection in Chittagonian Dialect of Bangla + + +
+ The negative effects of online bullying and harassment are increasing with +Internet popularity, especially in social media. One solution is using natural +language processing (NLP) and machine learning (ML) methods for the automatic +detection of harmful remarks, but these methods are limited in low-resource +languages like the Chittagonian dialect of Bangla.This study focuses on +detecting vulgar remarks in social media using supervised ML and deep learning +algorithms.Logistic Regression achieved promising accuracy (0.91) while simple +RNN with Word2vec and fastTex had lower accuracy (0.84-0.90), highlighting the +issue that NN algorithms require more data. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Characterizing Learning Curves During Language Model Pre-Training: + Learning, Forgetting, and Stability + + +
+ How do language models learn to make predictions during pre-training? To +study this question, we extract learning curves from five autoregressive +English language model pre-training runs, for 1M tokens in context. We observe +that the language models generate short repetitive phrases before learning to +generate longer and more coherent text. We quantify the final surprisal, +within-run variability, age of acquisition, forgettability, and cross-run +variability of learning curves for individual tokens in context. More frequent +tokens reach lower final surprisals, exhibit less variability within and across +pre-training runs, are learned earlier, and are less likely to be "forgotten" +during pre-training. Higher n-gram probabilities further accentuate these +effects. Independent of the target token, shorter and more frequent contexts +correlate with marginally more stable and quickly acquired predictions. Effects +of part-of-speech are also small, although nouns tend to be acquired later and +less stably than verbs, adverbs, and adjectives. Our work contributes to a +better understanding of language model pre-training dynamics and informs the +deployment of stable language models in practice. + +
+
+
+
+
+ + ☆ Rethinking Machine Ethics -- Can LLMs Perform Moral Reasoning through + the Lens of Moral Theories? + + +
+ Making moral judgments is an essential step toward developing ethical AI +systems. Prevalent approaches are mostly implemented in a bottom-up manner, +which uses a large set of annotated data to train models based on crowd-sourced +opinions about morality. These approaches have been criticized for potentially +overgeneralizing a limited group of annotators' moral stances and lacking +explainability. In contrast, top-down approaches make moral judgments grounded +in a set of principles. However, it remains conceptual due to the incapability +of previous language models and the unsolved debate among moral principles. In +this study, we propose a flexible framework to steer Large Language Models +(LLMs) to perform moral reasoning with well-established moral theories from +interdisciplinary research. The theory-guided top-down framework can +incorporate various moral theories. Our experiments demonstrate the +effectiveness of the proposed framework on datasets derived from moral +theories. Furthermore, we show the alignment between different moral theories +and existing morality datasets. Our analysis exhibits the potentials and flaws +in existing resources (models and datasets) in developing explainable moral +judgment-making systems. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation + + +
+ Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL +task. However, the absence of a systematical benchmark inhibits the development +of designing effective, efficient and economic LLM-based Text-to-SQL solutions. +To address this challenge, in this paper, we first conduct a systematical and +extensive comparison over existing prompt engineering methods, including +question representation, example selection and example organization, and with +these experimental results, we elaborates their pros and cons. Based on these +findings, we propose a new integrated solution, named DAIL-SQL, which refreshes +the Spider leaderboard with 86.6% execution accuracy and sets a new bar. +Towards an efficient and economic LLM-based Text-to-SQL solution, we emphasize +the token efficiency in prompt engineering and compare the prior studies under +this metric. Additionally, we investigate open-source LLMs in in-context +learning, and further enhance their performance with task-specific supervised +fine-tuning. Our explorations highlight open-source LLMs' potential in +Text-to-SQL, as well as the advantages and disadvantages of the task-specific +supervised fine-tuning. We hope that our work provides a deeper understanding +of Text-to-SQL with LLMs, and inspire further investigations and broad +applications. + +
+
+ comment: We have released code on https://github.com/BeachWang/DAIL-SQL +
+
+
+
+
+ + ☆ Historical patterns of rice farming explain modern-day language use in + China and Japan more than modernization and urbanization + + +
+ We used natural language processing to analyze a billion words to study +cultural differences on Weibo, one of China's largest social media platforms. +We compared predictions from two common explanations about cultural differences +in China (economic development and urban-rural differences) against the +less-obvious legacy of rice versus wheat farming. Rice farmers had to +coordinate shared irrigation networks and exchange labor to cope with higher +labor requirements. In contrast, wheat relied on rainfall and required half as +much labor. We test whether this legacy made southern China more +interdependent. Across all word categories, rice explained twice as much +variance as economic development and urbanization. Rice areas used more words +reflecting tight social ties, holistic thought, and a cautious, prevention +orientation. We then used Twitter data comparing prefectures in Japan, which +largely replicated the results from China. This provides crucial evidence of +the rice theory in a different nation, language, and platform. + +
+
+ comment: Includes Supplemental Materials +
+
+
+
+
+ + ☆ A Framework for Responsible Development of Automated Student Feedback + with Generative AI + + +
+ Providing rich feedback to students is essential for supporting student +learning. Recent advances in generative AI, particularly within large language +modelling (LLM), provide the opportunity to deliver repeatable, scalable and +instant automatically generated feedback to students, making abundant a +previously scarce and expensive learning resource. Such an approach is feasible +from a technical perspective due to these recent advances in Artificial +Intelligence (AI) and Natural Language Processing (NLP); while the potential +upside is a strong motivator, doing so introduces a range of potential ethical +issues that must be considered as we apply these technologies. The +attractiveness of AI systems is that they can effectively automate the most +mundane tasks; but this risks introducing a "tyranny of the majority", where +the needs of minorities in the long tail are overlooked because they are +difficult to automate. + Developing machine learning models that can generate valuable and authentic +feedback requires the input of human domain experts. The choices we make in +capturing this expertise -- whose, which, when, and how -- will have +significant consequences for the nature of the resulting feedback. How we +maintain our models will affect how that feedback remains relevant given +temporal changes in context, theory, and prior learning profiles of student +cohorts. These questions are important from an ethical perspective; but they +are also important from an operational perspective. Unless they can be +answered, our AI generated systems will lack the trust necessary for them to be +useful features in the contemporary learning environment. + This article will outline the frontiers of automated feedback, identify the +ethical issues involved in the provision of automated feedback and present a +framework to assist academics to develop such systems responsibly. + +
+
+ comment: 10 pages, under review at IEEE TLT +
+
+
+
+
+ + ☆ TaskLAMA: Probing the Complex Task Understanding of Language Models + + +
+ Structured Complex Task Decomposition (SCTD) is the problem of breaking down +a complex real-world task (such as planning a wedding) into a directed acyclic +graph over individual steps that contribute to achieving the task, with edges +specifying temporal dependencies between them. SCTD is an important component +of assistive planning tools, and a challenge for commonsense reasoning systems. +We probe how accurately SCTD can be done with the knowledge extracted from +Large Language Models (LLMs). We introduce a high-quality human-annotated +dataset for this problem and novel metrics to fairly assess performance of LLMs +against several baselines. Our experiments reveal that LLMs are able to +decompose complex tasks into individual steps effectively, with a relative +improvement of 15% to 280% over the best baseline. We also propose a number of +approaches to further improve their performance, with a relative improvement of +7% to 37% over the base model. However, we find that LLMs still struggle to +predict pairwise temporal dependencies, which reveals a gap in their +understanding of complex tasks. + +
+
+
+
+
+ + ☆ KGConv, a Conversational Corpus grounded in Wikidata + + +
+ We present KGConv, a large, conversational corpus of 71k conversations where +each question-answer pair is grounded in a Wikidata fact. Conversations contain +on average 8.6 questions and for each Wikidata fact, we provide multiple +variants (12 on average) of the corresponding question using templates, human +annotations, hand-crafted rules and a question rewriting neural model. We +provide baselines for the task of Knowledge-Based, Conversational Question +Generation. KGConv can further be used for other generation and analysis tasks +such as single-turn question generation from Wikidata triples, question +rewriting, question answering from conversation or from knowledge graphs and +quiz generation. + +
+
+
+
+
+ + ☆ Enhancing OCR Performance through Post-OCR Models: Adopting Glyph + Embedding for Improved Correction + + +
+ The study investigates the potential of post-OCR models to overcome +limitations in OCR models and explores the impact of incorporating glyph +embedding on post-OCR correction performance. In this study, we have developed +our own post-OCR correction model. The novelty of our approach lies in +embedding the OCR output using CharBERT and our unique embedding technique, +capturing the visual characteristics of characters. Our findings show that +post-OCR correction effectively addresses deficiencies in inferior OCR models, +and glyph embedding enables the model to achieve superior results, including +the ability to correct individual words. + +
+
+
+
+
+ + ☆ A Classification-Guided Approach for Adversarial Attacks against Neural + Machine Translation + + +
+ Neural Machine Translation (NMT) models have been shown to be vulnerable to +adversarial attacks, wherein carefully crafted perturbations of the input can +mislead the target model. In this paper, we introduce ACT, a novel adversarial +attack framework against NMT systems guided by a classifier. In our attack, the +adversary aims to craft meaning-preserving adversarial examples whose +translations by the NMT model belong to a different class than the original +translations in the target language. Unlike previous attacks, our new approach +has a more substantial effect on the translation by altering the overall +meaning, which leads to a different class determined by a classifier. To +evaluate the robustness of NMT models to this attack, we propose enhancements +to existing black-box word-replacement-based attacks by incorporating output +translations of the target NMT model and the output logits of a classifier +within the attack process. Extensive experiments in various settings, including +a comparison with existing untargeted attacks, demonstrate that the proposed +attack is considerably more successful in altering the class of the output +translation and has more effect on the translation. This new paradigm can show +the vulnerabilities of NMT systems by focusing on the class of translation +rather than the mere translation quality as studied traditionally. + +
+
+
+
+
+ + ☆ PronounFlow: A Hybrid Approach for Calibrating Pronouns in Sentences + + +
+ Flip through any book or listen to any song lyrics, and you will come across +pronouns that, in certain cases, can hinder meaning comprehension, especially +for machines. As the role of having cognitive machines becomes pervasive in our +lives, numerous systems have been developed to resolve pronouns under various +challenges. Commensurate with this, it is believed that having systems able to +disambiguate pronouns in sentences will help towards the endowment of machines +with commonsense and reasoning abilities like those found in humans. However, +one problem these systems face with modern English is the lack of gender +pronouns, where people try to alternate by using masculine, feminine, or plural +to avoid the whole issue. Since humanity aims to the building of systems in the +full-bodied sense we usually reserve for people, what happens when pronouns in +written text, like plural or epicene ones, refer to unspecified entities whose +gender is not necessarily known? Wouldn't that put extra barriers to existing +coreference resolution systems? Towards answering those questions, through the +implementation of a neural-symbolic system that utilizes the best of both +worlds, we are employing PronounFlow, a system that reads any English sentence +with pronouns and entities, identifies which of them are not tied to each +other, and makes suggestions on which to use to avoid biases. Undertaken +experiments show that PronounFlow not only alternates pronouns in sentences +based on the collective human knowledge around us but also considerably helps +coreference resolution systems with the pronoun disambiguation process. + +
+
+ comment: 13 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Classification-Aware Neural Topic Model Combined With Interpretable + Analysis -- For Conflict Classification + + +
+ A large number of conflict events are affecting the world all the time. In +order to analyse such conflict events effectively, this paper presents a +Classification-Aware Neural Topic Model (CANTM-IA) for Conflict Information +Classification and Topic Discovery. The model provides a reliable +interpretation of classification results and discovered topics by introducing +interpretability analysis. At the same time, interpretation is introduced into +the model architecture to improve the classification performance of the model +and to allow interpretation to focus further on the details of the data. +Finally, the model architecture is optimised to reduce the complexity of the +model. + +
+
+ comment: Accepted by RANLP 2023 +
+
+
+
+
+ + ☆ Multi-party Goal Tracking with LLMs: Comparing Pre-training, + Fine-tuning, and Prompt Engineering + + +
+ This paper evaluates the extent to which current Large Language Models (LLMs) +can capture task-oriented multi-party conversations (MPCs). We have recorded +and transcribed 29 MPCs between patients, their companions, and a social robot +in a hospital. We then annotated this corpus for multi-party goal-tracking and +intent-slot recognition. People share goals, answer each other's goals, and +provide other people's goals in MPCs - none of which occur in dyadic +interactions. To understand user goals in MPCs, we compared three methods in +zero-shot and few-shot settings: we fine-tuned T5, created pre-training tasks +to train DialogLM using LED, and employed prompt engineering techniques with +GPT-3.5-turbo, to determine which approach can complete this novel task with +limited data. GPT-3.5-turbo significantly outperformed the others in a few-shot +setting. The `reasoning' style prompt, when given 7% of the corpus as example +annotated conversations, was the best performing method. It correctly annotated +62.32% of the goal tracking MPCs, and 69.57% of the intent-slot recognition +MPCs. A `story' style prompt increased model hallucination, which could be +detrimental if deployed in safety-critical settings. We conclude that +multi-party conversations still challenge state-of-the-art LLMs. + +
+
+ comment: Accepted and will appear in the Proceedings of SIGdial 2023 +
+
+
+
+
+ + ☆ CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for + Multimodal Machine Translation ICCV + + +
+ There has been a growing interest in developing multimodal machine +translation (MMT) systems that enhance neural machine translation (NMT) with +visual knowledge. This problem setup involves using images as auxiliary +information during training, and more recently, eliminating their use during +inference. Towards this end, previous works face a challenge in training +powerful MMT models from scratch due to the scarcity of annotated multilingual +vision-language data, especially for low-resource languages. Simultaneously, +there has been an influx of multilingual pre-trained models for NMT and +multimodal pre-trained models for vision-language tasks, primarily in English, +which have shown exceptional generalisation ability. However, these are not +directly applicable to MMT since they do not provide aligned multimodal +multilingual features for generative tasks. To alleviate this issue, instead of +designing complex modules for MMT, we propose CLIPTrans, which simply adapts +the independently pre-trained multimodal M-CLIP and the multilingual mBART. In +order to align their embedding spaces, mBART is conditioned on the M-CLIP +features by a prefix sequence generated through a lightweight mapping network. +We train this in a two-stage pipeline which warms up the model with image +captioning before the actual translation task. Through experiments, we +demonstrate the merits of this framework and consequently push forward the +state-of-the-art across standard benchmarks by an average of +2.67 BLEU. The +code can be found at www.github.com/devaansh100/CLIPTrans. + +
+
+ comment: 15 pages, 9 figures, to be published In Proceedings of International + Conference of Computer Vision(ICCV), 2023 +
+
+
+
+
+ + ☆ FurChat: An Embodied Conversational Agent using LLMs, Combining Open and + Closed-Domain Dialogue with Facial Expressions SIGDIAL 2023 + + +
+ We demonstrate an embodied conversational agent that can function as a +receptionist and generate a mixture of open and closed-domain dialogue along +with facial expressions, by using a large language model (LLM) to develop an +engaging conversation. We deployed the system onto a Furhat robot, which is +highly expressive and capable of using both verbal and nonverbal cues during +interaction. The system was designed specifically for the National Robotarium +to interact with visitors through natural conversations, providing them with +information about the facilities, research, news, upcoming events, etc. The +system utilises the state-of-the-art GPT-3.5 model to generate such information +along with domain-general conversations and facial expressions based on prompt +engineering. + +
+
+ comment: 5 pages, 2 figures, Accepted at SIGDIAL 2023 (24th Meeting of the + Special Interest Group on Discourse and Dialogue), for the demo video, see + https://youtu.be/fwtUl1kl22s +
+
+
+
+
+ + ☆ Shared Lexical Items as Triggers of Code Switching ACL + + +
+ Why do bilingual speakers code-switch (mix their two languages)? Among the +several theories that attempt to explain this natural and ubiquitous +phenomenon, the Triggering Hypothesis relates code-switching to the presence of +lexical triggers, specifically cognates and proper names, adjacent to the +switch point. We provide a fuller, more nuanced and refined exploration of the +triggering hypothesis, based on five large datasets in three language pairs, +reflecting both spoken and written bilingual interactions. Our results show +that words that are assumed to reside in a mental lexicon shared by both +languages indeed trigger code-switching; that the tendency to switch depends on +the distance of the trigger from the switch point; and on whether the trigger +precedes or succeeds the switch; but not on the etymology of the trigger words. +We thus provide strong, robust, evidence-based confirmation to several +hypotheses on the relationships between lexical triggers and code-switching. + +
+
+ comment: This is the author's final version; the article has been accepted for + publication in the Transactions of the Association for Computational + Linguistics (TACL) +
+
+
+
+
+ + ☆ Benchmarking the Generation of Fact Checking Explanations ACL + + +
+ Fighting misinformation is a challenging, yet crucial, task. Despite the +growing number of experts being involved in manual fact-checking, this activity +is time-consuming and cannot keep up with the ever-increasing amount of Fake +News produced daily. Hence, automating this process is necessary to help curb +misinformation. Thus far, researchers have mainly focused on claim veracity +classification. In this paper, instead, we address the generation of +justifications (textual explanation of why a claim is classified as either true +or false) and benchmark it with novel datasets and advanced baselines. In +particular, we focus on summarization approaches over unstructured knowledge +(i.e. news articles) and we experiment with several extractive and abstractive +strategies. We employed two datasets with different styles and structures, in +order to assess the generalizability of our findings. Results show that in +justification production summarization benefits from the claim information, +and, in particular, that a claim-driven extractive step improves abstractive +summarization performances. Finally, we show that although cross-dataset +experiments suffer from performance degradation, a unique model trained on a +combination of the two datasets is able to retain style information in an +efficient manner. + +
+
+ comment: Accepted to TACL. This arXiv version is a pre-MIT Press publication + version +
+
+
+
+
+ + ☆ Enhancing Psychological Counseling with Large Language Model: A + Multifaceted Decision-Support System for Non-Professionals + + +
+ In the contemporary landscape of social media, an alarming number of users +express negative emotions, some of which manifest as strong suicidal +intentions. This situation underscores a profound need for trained +psychological counselors who can enact effective mental interventions. However, +the development of these professionals is often an imperative but +time-consuming task. Consequently, the mobilization of non-professionals or +volunteers in this capacity emerges as a pressing concern. Leveraging the +capabilities of artificial intelligence, and in particular, the recent advances +in large language models, offers a viable solution to this challenge. This +paper introduces a novel model constructed on the foundation of large language +models to fully assist non-professionals in providing psychological +interventions on online user discourses. This framework makes it plausible to +harness the power of non-professional counselors in a meaningful way. A +comprehensive study was conducted involving ten professional psychological +counselors of varying expertise, evaluating the system across five critical +dimensions. The findings affirm that our system is capable of analyzing +patients' issues with relative accuracy and proffering professional-level +strategies recommendations, thereby enhancing support for non-professionals. +This research serves as a compelling validation of the application of large +language models in the field of psychology and lays the groundwork for a new +paradigm of community-based mental health support. + +
+
+
+
+
+ + ☆ The Anatomy of Conspirators: Unveiling Traits using a Comprehensive + Twitter Dataset + + +
+ The discourse around conspiracy theories is currently thriving amidst the +rampant misinformation prevalent in online environments. Research in this field +has been focused on detecting conspiracy theories on social media, often +relying on limited datasets. In this study, we present a novel methodology for +constructing a Twitter dataset that encompasses accounts engaged in +conspiracy-related activities throughout the year 2022. Our approach centers on +data collection that is independent of specific conspiracy theories and +information operations. Additionally, our dataset includes a control group +comprising randomly selected users who can be fairly compared to the +individuals involved in conspiracy activities. This comprehensive collection +effort yielded a total of 15K accounts and 37M tweets extracted from their +timelines. We conduct a comparative analysis of the two groups across three +dimensions: topics, profiles, and behavioral characteristics. The results +indicate that conspiracy and control users exhibit similarity in terms of their +profile metadata characteristics. However, they diverge significantly in terms +of behavior and activity, particularly regarding the discussed topics, the +terminology used, and their stance on trending subjects. Interestingly, there +is no significant disparity in the presence of bot users between the two +groups, suggesting that conspiracy and automation are orthogonal concepts. +Finally, we develop a classifier to identify conspiracy users using 93 +features, some of which are commonly employed in literature for troll +identification. The results demonstrate a high accuracy level (with an average +F1 score of 0.98%), enabling us to uncover the most discriminative features +associated with conspiracy-related accounts. + +
+
+
+
+
+ + ☆ Evaluation and Analysis of Hallucination in Large Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) have recently achieved remarkable +success. However, LVLMs are still plagued by the hallucination problem, which +limits the practicality in many scenarios. Hallucination refers to the +information of LVLMs' responses that does not exist in the visual input, which +poses potential risks of substantial consequences. There has been limited work +studying hallucination evaluation in LVLMs. In this paper, we propose +Hallucination Evaluation based on Large Language Models (HaELM), an LLM-based +hallucination evaluation framework. HaELM achieves an approximate 95% +performance comparable to ChatGPT and has additional advantages including low +cost, reproducibility, privacy preservation and local deployment. Leveraging +the HaELM, we evaluate the hallucination in current LVLMs. Furthermore, we +analyze the factors contributing to hallucination in LVLMs and offer helpful +suggestions to mitigate the hallucination problem. Our training data and human +annotation hallucination data will be made public soon. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ SpikeBERT: A Language Spikformer Trained with Two-Stage Knowledge + Distillation from BERT + + +
+ Spiking neural networks (SNNs) offer a promising avenue to implement deep +neural networks in a more energy-efficient way. However, the network +architectures of existing SNNs for language tasks are too simplistic, and deep +architectures have not been fully explored, resulting in a significant +performance gap compared to mainstream transformer-based networks such as BERT. +To this end, we improve a recently-proposed spiking transformer (i.e., +Spikformer) to make it possible to process language tasks and propose a +two-stage knowledge distillation method for training it, which combines +pre-training by distilling knowledge from BERT with a large collection of +unlabelled texts and fine-tuning with task-specific instances via knowledge +distillation again from the BERT fine-tuned on the same training examples. +Through extensive experimentation, we show that the models trained with our +method, named SpikeBERT, outperform state-of-the-art SNNs and even achieve +comparable results to BERTs on text classification tasks for both English and +Chinese with much less energy consumption. + +
+
+
+
+
+ + ☆ Large Language Models on the Chessboard: A Study on ChatGPT's Formal + Language Comprehension and Complex Reasoning Skills + + +
+ While large language models have made strides in natural language processing, +their proficiency in complex reasoning tasks requiring formal language +comprehension, such as chess, remains less investigated. This paper probes the +performance of ChatGPT, a sophisticated language model by OpenAI in tackling +such complex reasoning tasks, using chess as a case study. Through robust +metrics examining both the legality and quality of moves, we assess ChatGPT's +understanding of the chessboard, adherence to chess rules, and strategic +decision-making abilities. Our evaluation identifies limitations within +ChatGPT's attention mechanism that affect its formal language comprehension and +uncovers the model's underdeveloped self-regulation abilities. Our study also +reveals ChatGPT's propensity for a coherent strategy in its gameplay and a +noticeable uptick in decision-making assertiveness when the model is presented +with a greater volume of natural language or possesses a more lucid +understanding of the state of the chessboard. These findings contribute to the +growing exploration of language models' abilities beyond natural language +processing, providing valuable information for future research towards models +demonstrating human-like cognitive abilities. + +
+
+
+
+
+ + ☆ Sequential annotations for naturally-occurring HRI: first insights + + +
+ We explain the methodology we developed for improving the interactions +accomplished by an embedded conversational agent, drawing from Conversation +Analytic sequential and multimodal analysis. The use case is a Pepper robot +that is expected to inform and orient users in a library. In order to propose +and learn better interactive schema, we are creating a corpus of +naturally-occurring interactions that will be made available to the community. +To do so, we propose an annotation practice based on some theoretical +underpinnings about the use of language and multimodal resources in human-robot +interaction. CCS CONCEPTS $\bullet$ Computing methodologies $\rightarrow$ +Discourse, dialogue and pragmatics; $\bullet$ Human-centered computing +$\rightarrow$ Text input; HCI theory, concepts and models; Field studies. + +
+
+ comment: Peer-reviewed workshop paper accepted for the ''Human-Robot + Conversational Interaction'' workshop that took place at the ''ACM/IEEE + International Conference on Human-Robot Interaction'' 2023 Conference in + Stockholm, Sweden +
+
+
+
+
+ + ☆ Killing two birds with one stone: Can an audio captioning system also be + used for audio-text retrieval? + + +
+ Automated Audio Captioning (AAC) aims to develop systems capable of +describing an audio recording using a textual sentence. In contrast, Audio-Text +Retrieval (ATR) systems seek to find the best matching audio recording(s) for a +given textual query (Text-to-Audio) or vice versa (Audio-to-Text). These tasks +require different types of systems: AAC employs a sequence-to-sequence model, +while ATR utilizes a ranking model that compares audio and text representations +within a shared projection subspace. However, this work investigates the +relationship between AAC and ATR by exploring the ATR capabilities of an +unmodified AAC system, without fine-tuning for the new task. Our AAC system +consists of an audio encoder (ConvNeXt-Tiny) trained on AudioSet for audio +tagging, and a transformer decoder responsible for generating sentences. For +AAC, it achieves a high SPIDEr-FL score of 0.298 on Clotho and 0.472 on +AudioCaps on average. For ATR, we propose using the standard Cross-Entropy loss +values obtained for any audio/caption pair. Experimental results on the Clotho +and AudioCaps datasets demonstrate decent recall values using this simple +approach. For instance, we obtained a Text-to-Audio R@1 value of 0.382 for +Au-dioCaps, which is above the current state-of-the-art method without external +data. Interestingly, we observe that normalizing the loss values was necessary +for Audio-to-Text retrieval. + +
+
+ comment: cam ready version (14/08/23) +
+
+
+
+
+ + ☆ Taxonomic Loss for Morphological Glossing of Low-Resource Languages + + +
+ Morpheme glossing is a critical task in automated language documentation and +can benefit other downstream applications greatly. While state-of-the-art +glossing systems perform very well for languages with large amounts of existing +data, it is more difficult to create useful models for low-resource languages. +In this paper, we propose the use of a taxonomic loss function that exploits +morphological information to make morphological glossing more performant when +data is scarce. We find that while the use of this loss function does not +outperform a standard loss function with regards to single-label prediction +accuracy, it produces better predictions when considering the top-n predicted +labels. We suggest this property makes the taxonomic loss function useful in a +human-in-the-loop annotation setting. + +
+
+
+
+
+ + ☆ Adapting text-based dialogue state tracker for spoken dialogues SIGDIAL 2023 + + +
+ Although there have been remarkable advances in dialogue systems through the +dialogue systems technology competition (DSTC), it remains one of the key +challenges to building a robust task-oriented dialogue system with a speech +interface. Most of the progress has been made for text-based dialogue systems +since there are abundant datasets with written corpora while those with spoken +dialogues are very scarce. However, as can be seen from voice assistant systems +such as Siri and Alexa, it is of practical importance to transfer the success +to spoken dialogues. In this paper, we describe our engineering effort in +building a highly successful model that participated in the speech-aware +dialogue systems technology challenge track in DSTC11. Our model consists of +three major modules: (1) automatic speech recognition error correction to +bridge the gap between the spoken and the text utterances, (2) text-based +dialogue system (D3ST) for estimating the slots and values using slot +descriptions, and (3) post-processing for recovering the error of the estimated +slot value. Our experiments show that it is important to use an explicit +automatic speech recognition error correction module, post-processing, and data +augmentation to adapt a text-based dialogue state tracker for spoken dialogue +corpora. + +
+
+ comment: 8 pages, 5 figures, Accepted at the DSTC 11 Workshop to be located at + SIGDIAL 2023 +
+
+
+
+
+ + ☆ Large language models converge toward human-like concept organization + + +
+ Large language models show human-like performance in knowledge extraction, +reasoning and dialogue, but it remains controversial whether this performance +is best explained by memorization and pattern matching, or whether it reflects +human-like inferential semantics and world knowledge. Knowledge bases such as +WikiData provide large-scale, high-quality representations of inferential +semantics and world knowledge. We show that large language models learn to +organize concepts in ways that are strikingly similar to how concepts are +organized in such knowledge bases. Knowledge bases model collective, +institutional knowledge, and large language models seem to induce such +knowledge from raw text. We show that bigger and better models exhibit more +human-like concept organization, across four families of language models and +three knowledge graph embeddings. + +
+
+
+
+
+ + ☆ Improving Neural Ranking Models with Traditional IR Methods + + +
+ Neural ranking methods based on large transformer models have recently gained +significant attention in the information retrieval community, and have been +adopted by major commercial solutions. Nevertheless, they are computationally +expensive to create, and require a great deal of labeled data for specialized +corpora. In this paper, we explore a low resource alternative which is a +bag-of-embedding model for document retrieval and find that it is competitive +with large transformer models fine tuned on information retrieval tasks. Our +results show that a simple combination of TF-IDF, a traditional keyword +matching method, with a shallow embedding model provides a low cost path to +compete well with the performance of complex neural ranking models on 3 +datasets. Furthermore, adding TF-IDF measures improves the performance of +large-scale fine tuned models on these tasks. + +
+
+ comment: Short paper, 4 pages +
+
+
+
+
+ + ☆ Recursively Summarizing Enables Long-Term Dialogue Memory in Large + Language Models + + +
+ Most open-domain dialogue systems suffer from forgetting important +information, especially in a long-term conversation. Existing works usually +train the specific retriever or summarizer to obtain key information from the +past, which is time-consuming and highly depends on the quality of labeled +data. To alleviate this problem, we propose to recursively generate summaries/ +memory using large language models (LLMs) to enhance long-term memory ability. +Specifically, our method first stimulates LLMs to memorize small dialogue +contexts and then recursively produce new memory using previous memory and +following contexts. Finally, the LLM can easily generate a highly consistent +response with the help of the latest memory. We evaluate our method using +ChatGPT and text-davinci-003, and the experiments on the widely-used public +dataset show that our method can generate more consistent responses in a +long-context conversation. Notably, our method is a potential solution to +enable the LLM to model the extremely long context. Code and scripts will be +released later. + +
+
+
+
+
+ + ☆ TransPrompt v2: A Transferable Prompting Framework for Cross-task Text + Classification + + +
+ Text classification is one of the most imperative tasks in natural language +processing (NLP). Recent advances with pre-trained language models (PLMs) have +shown remarkable success on this task. However, the satisfying results obtained +by PLMs heavily depend on the large amounts of task-specific labeled data, +which may not be feasible in many application scenarios due to data access and +privacy constraints. The recently-proposed prompt-based fine-tuning paradigm +improves the performance of PLMs for few-shot text classification with +task-specific templates. Yet, it is unclear how the prompting knowledge can be +transferred across tasks, for the purpose of mutual reinforcement. We propose +TransPrompt v2, a novel transferable prompting framework for few-shot learning +across similar or distant text classification tasks. For learning across +similar tasks, we employ a multi-task meta-knowledge acquisition (MMA) +procedure to train a meta-learner that captures the cross-task transferable +knowledge. For learning across distant tasks, we further inject the task type +descriptions into the prompt, and capture the intra-type and inter-type prompt +embeddings among multiple distant tasks. Additionally, two de-biasing +techniques are further designed to make the trained meta-learner more +task-agnostic and unbiased towards any tasks. After that, the meta-learner can +be adapted to each specific task with better parameters initialization. +Extensive experiments show that TransPrompt v2 outperforms single-task and +cross-task strong baselines over multiple NLP tasks and datasets. We further +show that the meta-learner can effectively improve the performance of PLMs on +previously unseen tasks. In addition, TransPrompt v2 also outperforms strong +fine-tuning baselines when learning with full training sets. + +
+
+
+
+
+ + ☆ Robust Open-Set Spoken Language Identification and the CU MultiLang + Dataset + + +
+ Most state-of-the-art spoken language identification models are closed-set; +in other words, they can only output a language label from the set of classes +they were trained on. Open-set spoken language identification systems, however, +gain the ability to detect when an input exhibits none of the original +languages. In this paper, we implement a novel approach to open-set spoken +language identification that uses MFCC and pitch features, a TDNN model to +extract meaningful feature embeddings, confidence thresholding on softmax +outputs, and LDA and pLDA for learning to classify new unknown languages. We +present a spoken language identification system that achieves 91.76% accuracy +on trained languages and has the capability to adapt to unknown languages on +the fly. To that end, we also built the CU MultiLang Dataset, a large and +diverse multilingual speech corpus which was used to train and evaluate our +system. + +
+
+ comment: 6pages, 1 table, 6 figures +
+
+
+
+
+ + ☆ Document AI: A Comparative Study of Transformer-Based, Graph-Based + Models, and Convolutional Neural Networks For Document Layout Analysis + + +
+ Document AI aims to automatically analyze documents by leveraging natural +language processing and computer vision techniques. One of the major tasks of +Document AI is document layout analysis, which structures document pages by +interpreting the content and spatial relationships of layout, image, and text. +This task can be image-centric, wherein the aim is to identify and label +various regions such as authors and paragraphs, or text-centric, where the +focus is on classifying individual words in a document. Although there are +increasingly sophisticated methods for improving layout analysis, doubts remain +about the extent to which their findings can be generalized to a broader +context. Specifically, prior work developed systems based on very different +architectures, such as transformer-based, graph-based, and CNNs. However, no +work has mentioned the effectiveness of these models in a comparative analysis. +Moreover, while language-independent Document AI models capable of knowledge +transfer have been developed, it remains to be investigated to what degree they +can effectively transfer knowledge. In this study, we aim to fill these gaps by +conducting a comparative evaluation of state-of-the-art models in document +layout analysis and investigating the potential of cross-lingual layout +analysis by utilizing machine translation techniques. + +
+
+
+
+
+ + ♻ ☆ A Deep Convolutional Neural Networks Based Multi-Task Ensemble Model for + Aspect and Polarity Classification in Persian Reviews + + +
+ Aspect-based sentiment analysis is of great importance and application +because of its ability to identify all aspects discussed in the text. However, +aspect-based sentiment analysis will be most effective when, in addition to +identifying all the aspects discussed in the text, it can also identify their +polarity. Most previous methods use the pipeline approach, that is, they first +identify the aspects and then identify the polarities. Such methods are +unsuitable for practical applications since they can lead to model errors. +Therefore, in this study, we propose a multi-task learning model based on +Convolutional Neural Networks (CNNs), which can simultaneously detect aspect +category and detect aspect category polarity. creating a model alone may not +provide the best predictions and lead to errors such as bias and high variance. +To reduce these errors and improve the efficiency of model predictions, +combining several models known as ensemble learning may provide better results. +Therefore, the main purpose of this article is to create a model based on an +ensemble of multi-task deep convolutional neural networks to enhance sentiment +analysis in Persian reviews. We evaluated the proposed method using a Persian +language dataset in the movie domain. Jacquard index and Hamming loss measures +were used to evaluate the performance of the developed models. The results +indicate that this new approach increases the efficiency of the sentiment +analysis model in the Persian language. + +
+
+
+
+
+ + ♻ ☆ Empowering Clinicians and Democratizing Data Science: Large Language + Models Automate Machine Learning for Clinical Studies + + +
+ A knowledge gap persists between Machine Learning (ML) developers (e.g., data +scientists) and practitioners (e.g., clinicians), hampering the full +utilization of ML for clinical data analysis. We investigated the potential of +the chatGPT Advanced Data Analysis (ADA), an extension of GPT-4, to bridge this +gap and perform ML analyses efficiently. Real-world clinical datasets and study +details from large trials across various medical specialties were presented to +chatGPT ADA without specific guidance. ChatGPT ADA autonomously developed +state-of-the-art ML models based on the original study's training data to +predict clinical outcomes such as cancer development, cancer progression, +disease complications, or biomarkers such as pathogenic gene sequences. +Strikingly, these ML models matched or outperformed their published +counterparts. We conclude that chatGPT ADA offers a promising avenue to +democratize ML in medicine, making advanced analytics accessible to non-ML +experts and promoting broader applications in medical research and practice. + +
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a meticulously curated, comprehensive +instruction dataset expressly designed for the biomolecular realm. +Mol-Instructions is composed of three pivotal components: molecule-oriented +instructions, protein-oriented instructions, and biomolecular text +instructions, each curated to enhance the understanding and prediction +capabilities of LLMs concerning biomolecular features and behaviors. Through +extensive instruction tuning experiments on the representative LLM, we +underscore the potency of Mol-Instructions to enhance the adaptability and +cognitive acuity of large models within the complex sphere of biomolecular +studies, thereby promoting advancements in the biomolecular research community. +Mol-Instructions is made publicly accessible for future research endeavors and +will be subjected to continual updates for enhanced applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions. Add + quantitative evaluations +
+
+
+
+
+ + ♻ ☆ An Empirical Investigation of the Role of Pre-training in Lifelong + Learning + + +
+ The lifelong learning paradigm in machine learning is an attractive +alternative to the more prominent isolated learning scheme not only due to its +resemblance to biological learning but also its potential to reduce energy +waste by obviating excessive model re-training. A key challenge to this +paradigm is the phenomenon of catastrophic forgetting. With the increasing +popularity and success of pre-trained models in machine learning, we pose the +question: What role does pre-training play in lifelong learning, specifically +with respect to catastrophic forgetting? We investigate existing methods in the +context of large, pre-trained models and evaluate their performance on a +variety of text and image classification tasks, including a large-scale study +using a novel data set of 15 diverse NLP tasks. Across all settings, we observe +that generic pre-training implicitly alleviates the effects of catastrophic +forgetting when learning multiple tasks sequentially compared to randomly +initialized models. We then further investigate why pre-training alleviates +forgetting in this setting. We study this phenomenon by analyzing the loss +landscape, finding that pre-trained weights appear to ease forgetting by +leading to wider minima. Based on this insight, we propose jointly optimizing +for current task loss and loss basin sharpness to explicitly encourage wider +basins during sequential fine-tuning. We show that this optimization approach +outperforms several state-of-the-art task-sequential continual learning +algorithms across multiple settings, occasionally even without retaining a +memory that scales in size with the number of tasks. + +
+
+
+
+
+ + ♻ ☆ Political Sentiment Analysis of Persian Tweets Using CNN-LSTM Model + + +
+ Sentiment analysis is the process of identifying and categorizing people's +emotions or opinions regarding various topics. The analysis of Twitter +sentiment has become an increasingly popular topic in recent years. In this +paper, we present several machine learning and a deep learning model to +analysis sentiment of Persian political tweets. Our analysis was conducted +using Bag of Words and ParsBERT for word representation. We applied Gaussian +Naive Bayes, Gradient Boosting, Logistic Regression, Decision Trees, Random +Forests, as well as a combination of CNN and LSTM to classify the polarities of +tweets. The results of this study indicate that deep learning with ParsBERT +embedding performs better than machine learning. The CNN-LSTM model had the +highest classification accuracy with 89 percent on the first dataset and 71 +percent on the second dataset. Due to the complexity of Persian, it was a +difficult task to achieve this level of efficiency. The main objective of our +research was to reduce the training time while maintaining the model's +performance. As a result, several adjustments were made to the model +architecture and parameters. In addition to achieving the objective, the +performance was slightly improved as well. + +
+
+
+
+
+ + ♻ ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking +closer to real-world applications, both in the nature of data tested ($X$: +multi-channel, multi-paged, multi-industry; $Y$: class distributions and label +set variety) and in classification tasks considered ($f$: multi-page document, +page stream, and document bundle classification, ...). We identify the lack of +public multi-page document classification datasets, formalize different +classification tasks arising in application scenarios, and motivate the value +of targeting efficient multi-page document representations. An experimental +study on proposed multi-page document classification datasets demonstrates that +current benchmarks have become irrelevant and need to be updated to evaluate +complete documents, as they naturally occur in practice. This reality check +also calls for more mature evaluation methodologies, covering calibration +evaluation, inference complexity (time-memory), and a range of realistic +distribution shifts (e.g., born-digital vs. scanning noise, shifting page +order). Our study ends on a hopeful note by recommending concrete avenues for +future improvements.} + +
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ♻ ☆ Challenges of GPT-3-based Conversational Agents for Healthcare + + +
+ The potential to provide patients with faster information access while +allowing medical specialists to concentrate on critical tasks makes medical +domain dialog agents appealing. However, the integration of large-language +models (LLMs) into these agents presents certain limitations that may result in +serious consequences. This paper investigates the challenges and risks of using +GPT-3-based models for medical question-answering (MedQA). We perform several +evaluations contextualized in terms of standard medical principles. We provide +a procedure for manually designing patient queries to stress-test high-risk +limitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to +respond adequately to these queries, generating erroneous medical information, +unsafe recommendations, and content that may be considered offensive. + +
+
+ comment: 12 pages, 9 Tables, accepted to RANLP 2023 +
+
+
+
+
+ + ♻ ☆ OLISIA: a Cascade System for Spoken Dialogue State Tracking + + +
+ Though Dialogue State Tracking (DST) is a core component of spoken dialogue +systems, recent work on this task mostly deals with chat corpora, disregarding +the discrepancies between spoken and written language.In this paper, we propose +OLISIA, a cascade system which integrates an Automatic Speech Recognition (ASR) +model and a DST model. We introduce several adaptations in the ASR and DST +modules to improve integration and robustness to spoken conversations.With +these adaptations, our system ranked first in DSTC11 Track 3, a benchmark to +evaluate spoken DST. We conduct an in-depth analysis of the results and find +that normalizing the ASR outputs and adapting the DST inputs through data +augmentation, along with increasing the pre-trained models size all play an +important role in reducing the performance discrepancy between written and +spoken conversations. + +
+
+
+
+
+ + ♻ ☆ Theory of Mind Might Have Spontaneously Emerged in Large Language Models + + +
+ We explore the intriguing possibility that theory of mind (ToM), or the +uniquely human ability to impute unobservable mental states to others, might +have spontaneously emerged in large language models (LLMs). We designed 40 +false-belief tasks, considered a gold standard in testing ToM in humans, and +administered them to several LLMs. Each task included a false-belief scenario, +three closely matched true-belief controls, and the reversed versions of all +four. Smaller and older models solved no tasks; GPT-3-davinci-001 (from May +2020) and GPT-3-davinci-002 (from January 2022) solved 10%; and +GPT-3-davinci-003 (from November 2022) and ChatGPT-3.5-turbo (from March 2023) +solved 35% of the tasks, mirroring the performance of three-year-old children. +ChatGPT-4 (from June 2023) solved 90% of the tasks, matching the performance of +seven-year-old children. These findings suggest the intriguing possibility that +ToM, previously considered exclusive to humans, may have spontaneously emerged +as a byproduct of LLMs' improving language skills. + +
+
+ comment: TRY RUNNING ToM EXPERIMENTS ON YOUR OWN: The code and tasks used in + this study are available at Colab + (https://colab.research.google.com/drive/1ZRtmw87CdA4xp24DNS_Ik_uA2ypaRnoU). + Don't worry if you are not an expert coder, you should be able to run this + code with no-to-minimum Python skills. Or copy-paste the tasks to ChatGPT's + web interface +
+
+
+
+
+ + ♻ ☆ Cross-Lingual Constituency Parsing for Middle High German: A + Delexicalized Approach + + +
+ Constituency parsing plays a fundamental role in advancing natural language +processing (NLP) tasks. However, training an automatic syntactic analysis +system for ancient languages solely relying on annotated parse data is a +formidable task due to the inherent challenges in building treebanks for such +languages. It demands extensive linguistic expertise, leading to a scarcity of +available resources. To overcome this hurdle, cross-lingual transfer techniques +which require minimal or even no annotated data for low-resource target +languages offer a promising solution. In this study, we focus on building a +constituency parser for $\mathbf{M}$iddle $\mathbf{H}$igh $\mathbf{G}$erman +($\mathbf{MHG}$) under realistic conditions, where no annotated MHG treebank is +available for training. In our approach, we leverage the linguistic continuity +and structural similarity between MHG and $\mathbf{M}$odern $\mathbf{G}$erman +($\mathbf{MG}$), along with the abundance of MG treebank resources. +Specifically, by employing the $\mathit{delexicalization}$ method, we train a +constituency parser on MG parse datasets and perform cross-lingual transfer to +MHG parsing. Our delexicalized constituency parser demonstrates remarkable +performance on the MHG test set, achieving an F1-score of 67.3%. It outperforms +the best zero-shot cross-lingual baseline by a margin of 28.6% points. These +encouraging results underscore the practicality and potential for automatic +syntactic analysis in other ancient languages that face similar challenges as +MHG. + +
+
+ comment: Accepted to ALP 2023 +
+
+
+
+
+ + ♻ ☆ A Trip Towards Fairness: Bias and De-Biasing in Large Language Models + + +
+ Cheap-to-Build Very Large-Language Models (CtB-LLMs) with affordable training +are emerging as the next big revolution in natural language processing and +understanding. These CtB-LLMs are democratizing access to trainable Very +Large-Language Models (VLLMs) and, thus, may represent the building blocks of +many NLP systems solving downstream tasks. Hence, a little or a large bias in +CtB-LLMs may cause huge harm. In this paper, we performed a large investigation +of the bias of three families of CtB-LLMs, and we showed that debiasing +techniques are effective and usable. Indeed, according to current tests, the +LLaMA and the OPT families have an important bias in gender, race, religion, +and profession. In contrast to the analysis for other LLMs, we discovered that +bias depends not on the number of parameters but on the perplexity. Finally, +the debiasing of OPT using LoRA reduces bias up to 4.12 points in the +normalized stereotype score. + +
+
+
+
+
+ + ♻ ☆ Large Language Models are Fixated by Red Herrings: Exploring Creative + Problem Solving and Einstellung Effect using the Only Connect Wall Dataset + + +
+ The quest for human imitative AI has been an enduring topic in AI research +since its inception. The technical evolution and emerging capabilities of the +latest cohort of large language models (LLMs) have reinvigorated the subject +beyond academia to the cultural zeitgeist. While recent NLP evaluation +benchmark tasks test some aspects of human-imitative behaviour (e.g., +BIG-bench's 'human-like behavior' tasks), few, if not none, examine creative +problem solving abilities. Creative problem solving in humans is a well-studied +topic in cognitive neuroscience with standardized tests that predominantly use +the ability to associate (heterogeneous) connections among clue words as a +metric for creativity. Exposure to misleading stimuli - distractors dubbed red +herrings - impede human performance in such tasks via the fixation effect and +Einstellung paradigm. In cognitive neuroscience studies, such fixations are +experimentally induced by pre-exposing participants to orthographically similar +incorrect words to subsequent word-fragments or clues. The popular British quiz +show Only Connect's Connecting Wall segment essentially mimics Mednick's Remote +Associates Test (RAT) formulation with built-in, deliberate red herrings, which +makes it an ideal proxy dataset to explore and study fixation effect and +Einstellung paradigm from cognitive neuroscience in LLMs. In this paper we +present the novel Only Connect Wall (OCW) dataset and report results from our +evaluation of selected pre-trained language models and LLMs on creative problem +solving tasks like grouping clue words by heterogeneous connections, and +identifying correct open knowledge domain connections in respective groups. We +synthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to +further analyze our red-herrings hypothesis in language models. The code and +link to the dataset are available at https://github.com/TaatiTeam/OCW. + +
+
+ comment: V3: Minor cosmetic adjustment from V2. Fixed Fig. 2 caption + overlapping with text in S2.2. V2: with added OCW-Randomized and OCW-WordNet + results in Section 4.3 (added). 22 pages with Appendix +
+
+
+
+
+ + ♻ ☆ Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and + Addressing Sociological Implications + + +
+ Gender bias in artificial intelligence (AI) and natural language processing +has garnered significant attention due to its potential impact on societal +perceptions and biases. This research paper aims to analyze gender bias in +Large Language Models (LLMs) with a focus on multiple comparisons between GPT-2 +and GPT-3.5, some prominent language models, to better understand its +implications. Through a comprehensive literature review, the study examines +existing research on gender bias in AI language models and identifies gaps in +the current knowledge. The methodology involves collecting and preprocessing +data from GPT-2 and GPT-3.5, and employing in-depth quantitative analysis +techniques to evaluate gender bias in the generated text. The findings shed +light on gendered word associations, language usage, and biased narratives +present in the outputs of these Large Language Models. The discussion explores +the ethical implications of gender bias and its potential consequences on +social perceptions and marginalized communities. Additionally, the paper +presents strategies for reducing gender bias in LLMs, including algorithmic +approaches and data augmentation techniques. The research highlights the +importance of interdisciplinary collaborations and the role of sociological +studies in mitigating gender bias in AI models. By addressing these issues, we +can pave the way for more inclusive and unbiased AI systems that have a +positive impact on society. + +
+
+
+
+
+ + ♻ ☆ NBIAS: A Natural Language Processing Framework for Bias Identification + in Text + + +
+ Bias in textual data can lead to skewed interpretations and outcomes when the +data is used. These biases could perpetuate stereotypes, discrimination, or +other forms of unfair treatment. An algorithm trained on biased data may end up +making decisions that disproportionately impact a certain group of people. +Therefore, it is crucial to detect and remove these biases to ensure the fair +and ethical use of data. To this end, we develop a comprehensive and robust +framework NBIAS that consists of four main layers: data, corpus construction, +model development and an evaluation layer. The dataset is constructed by +collecting diverse data from various domains, including social media, +healthcare, and job hiring portals. As such, we applied a transformer-based +token classification model that is able to identify bias words/ phrases through +a unique named entity BIAS. In the evaluation procedure, we incorporate a blend +of quantitative and qualitative measures to gauge the effectiveness of our +models. We achieve accuracy improvements ranging from 1% to 8% compared to +baselines. We are also able to generate a robust understanding of the model +functioning. The proposed approach is applicable to a variety of biases and +contributes to the fair and ethical use of textual data. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ a unified front-end framework for english text-to-speech synthesis + + +
+ The front-end is a critical component of English text-to-speech (TTS) +systems, responsible for extracting linguistic features that are essential for +a text-to-speech model to synthesize speech, such as prosodies and phonemes. +The English TTS front-end typically consists of a text normalization (TN) +module, a prosody word prosody phrase (PWPP) module, and a grapheme-to-phoneme +(G2P) module. However, current research on the English TTS front-end focuses +solely on individual modules, neglecting the interdependence between them and +resulting in sub-optimal performance for each module. Therefore, this paper +proposes a unified front-end framework that captures the dependencies among the +English TTS front-end modules. Extensive experiments have demonstrated that the +proposed method achieves state-of-the-art (SOTA) performance in all modules. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ On the Robustness of ChatGPT: An Adversarial and Out-of-distribution + Perspective ICLR 2023 + + +
+ ChatGPT is a recent chatbot service released by OpenAI and is receiving +increasing attention over the past few months. While evaluations of various +aspects of ChatGPT have been done, its robustness, i.e., the performance to +unexpected inputs, is still unclear to the public. Robustness is of particular +concern in responsible AI, especially for safety-critical applications. In this +paper, we conduct a thorough evaluation of the robustness of ChatGPT from the +adversarial and out-of-distribution (OOD) perspective. To do so, we employ the +AdvGLUE and ANLI benchmarks to assess adversarial robustness and the Flipkart +review and DDXPlus medical diagnosis datasets for OOD evaluation. We select +several popular foundation models as baselines. Results show that ChatGPT shows +consistent advantages on most adversarial and OOD classification and +translation tasks. However, the absolute performance is far from perfection, +which suggests that adversarial and OOD robustness remains a significant threat +to foundation models. Moreover, ChatGPT shows astounding performance in +understanding dialogue-related texts and we find that it tends to provide +informal suggestions for medical tasks instead of definitive answers. Finally, +we present in-depth discussions of possible research directions. + +
+
+ comment: Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable + Large-Scale Machine Learning Models; code is at: + https://github.com/microsoft/robustlearn; more works: + https://llm-eval.github.io/ +
+
+
+
+
+ + ♻ ☆ Block-State Transformer + + +
+ State space models (SSMs) have shown impressive results on tasks that require +modeling long-range dependencies and efficiently scale to long sequences owing +to their subquadratic runtime complexity. Originally designed for continuous +signals, SSMs have shown superior performance on a plethora of tasks, in vision +and audio; however, SSMs still lag Transformer performance in Language Modeling +tasks. In this work, we propose a hybrid layer named Block-State Transformer +(BST), that internally combines an SSM sublayer for long-range +contextualization, and a Block Transformer sublayer for short-term +representation of sequences. We study three different, and completely +parallelizable, variants that integrate SSMs and block-wise attention. We show +that our model outperforms similar Transformer-based architectures on language +modeling perplexity and generalizes to longer sequences. In addition, the +Block-State Transformer demonstrates more than tenfold increase in speed at the +layer level compared to the Block-Recurrent Transformer when model +parallelization is employed. + +
+
+
+
+
+ + ♻ ☆ Asymmetric feature interaction for interpreting model predictions ACL 2023 + + +
+ In natural language processing (NLP), deep neural networks (DNNs) could model +complex interactions between context and have achieved impressive results on a +range of NLP tasks. Prior works on feature interaction attribution mainly focus +on studying symmetric interaction that only explains the additional influence +of a set of words in combination, which fails to capture asymmetric influence +that contributes to model prediction. In this work, we propose an asymmetric +feature interaction attribution explanation model that aims to explore +asymmetric higher-order feature interactions in the inference of deep neural +NLP models. By representing our explanation with an directed interaction graph, +we experimentally demonstrate interpretability of the graph to discover +asymmetric feature interactions. Experimental results on two sentiment +classification datasets show the superiority of our model against the +state-of-the-art feature interaction attribution methods in identifying +influential features for model predictions. Our code is available at +https://github.com/StillLu/ASIV. + +
+
+ comment: Accepted by Findings of the Association for Computational + Linguistics: ACL 2023 (long paper) +
+
+
+
+
+ + ♻ ☆ Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language + Models + + +
+ Dense retrieval (DR) converts queries and documents into dense embeddings and +measures the similarity between queries and documents in vector space. One of +the challenges in DR is the lack of domain-specific training data. While DR +models can learn from large-scale public datasets like MS MARCO through +transfer learning, evidence shows that not all DR models and domains can +benefit from transfer learning equally. Recently, some researchers have +resorted to large language models (LLMs) to improve the zero-shot and few-shot +DR models. However, the hard prompts or human-written prompts utilized in these +works cannot guarantee the good quality of generated weak queries. To tackle +this, we propose soft prompt tuning for augmenting DR (SPTAR): For each task, +we leverage soft prompt-tuning to optimize a task-specific soft prompt on +limited ground truth data and then prompt the LLMs to tag unlabeled documents +with weak queries, yielding enough weak document-query pairs to train +task-specific dense retrievers. We design a filter to select high-quality +example document-query pairs in the prompt to further improve the quality of +weak tagged queries. To the best of our knowledge, there is no prior work +utilizing soft prompt tuning to augment DR models. The experiments demonstrate +that SPTAR outperforms the unsupervised baselines BM25 and the recently +proposed LLMs-based augmentation method for DR. + +
+
+ comment: fix typos +
+
+
+
+
+ + ♻ ☆ Evaluating the Robustness to Instructions of Large Language Models + + +
+ Recently, Instruction fine-tuning has risen to prominence as a potential +method for enhancing the zero-shot capabilities of Large Language Models (LLMs) +on novel tasks. This technique has shown an exceptional ability to boost the +performance of moderately sized LLMs, sometimes even reaching performance +levels comparable to those of much larger model variants. The focus is on the +robustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an +exploration of six models including Alpaca, Vicuna, WizardLM, and Traditional +Task-oriented Models(Flan-T5-XL/XXL, T0++) using real-world relation extraction +datasets as case studies. We carried out a comprehensive evaluation of these +instruction-following LLMs which have been tuned based on open-domain +instructions and task-oriented instructions. The main discussion is their +performance and robustness towards instructions. We have observed that in most +cases, the model's performance in dealing with unfamiliar instructions tends to +worsen significantly, and the robustness of the model for RE instructions +deteriorates compared to QA. Further, we discovered that up until a certain +parameter size threshold (3B), the performance of the FLAN-T5 model improves as +the parameter count increases. The robustness of different scales of FLAN-T5 +models to RE instruction is worse than the robustness to QA instruction. + +
+
+ comment: In our study, erroneous data analysis inadvertently led to misleading + outcomes. Incorrect variables were included, distorting results. This + emphasizes the significance of robust data processing and analysis techniques + in research +
+
+
+
+
+ + ♻ ☆ (QA)$^2$: Question Answering with Questionable Assumptions ACL 2023 + + +
+ Naturally occurring information-seeking questions often contain questionable +assumptions -- assumptions that are false or unverifiable. Questions containing +questionable assumptions are challenging because they require a distinct answer +strategy that deviates from typical answers for information-seeking questions. +For instance, the question "When did Marie Curie discover Uranium?" cannot be +answered as a typical "when" question without addressing the false assumption +"Marie Curie discovered Uranium". In this work, we propose (QA)$^2$ (Question +Answering with Questionable Assumptions), an open-domain evaluation dataset +consisting of naturally occurring search engine queries that may or may not +contain questionable assumptions. To be successful on (QA)$^2$, systems must be +able to detect questionable assumptions and also be able to produce adequate +responses for both typical information-seeking questions and ones with +questionable assumptions. Through human rater acceptability on end-to-end QA +with (QA)$^2$, we find that current models do struggle with handling +questionable assumptions, leaving substantial headroom for progress. + +
+
+ comment: ACL 2023 camera-ready +
+
+
+
+
+ + ♻ ☆ EntropyRank: Unsupervised Keyphrase Extraction via Side-Information + Optimization for Language Model-based Text Compression + + +
+ We propose an unsupervised method to extract keywords and keyphrases from +texts based on a pre-trained language model (LM) and Shannon's information +maximization. Specifically, our method extracts phrases having the highest +conditional entropy under the LM. The resulting set of keyphrases turns out to +solve a relevant information-theoretic problem: if provided as side +information, it leads to the expected minimal binary code length in compressing +the text using the LM and an entropy encoder. Alternately, the resulting set is +an approximation via a causal LM to the set of phrases that minimize the +entropy of the text when conditioned upon it. Empirically, the method provides +results comparable to the most commonly used methods in various keyphrase +extraction benchmark challenges. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 130 + +
+
+
+ + ☆ 3D Adversarial Augmentations for Robust Out-of-Domain Predictions + + +
+ Since real-world training datasets cannot properly sample the long tail of +the underlying data distribution, corner cases and rare out-of-domain samples +can severely hinder the performance of state-of-the-art models. This problem +becomes even more severe for dense tasks, such as 3D semantic segmentation, +where points of non-standard objects can be confidently associated to the wrong +class. In this work, we focus on improving the generalization to out-of-domain +data. We achieve this by augmenting the training set with adversarial examples. +First, we learn a set of vectors that deform the objects in an adversarial +fashion. To prevent the adversarial examples from being too far from the +existing data distribution, we preserve their plausibility through a series of +constraints, ensuring sensor-awareness and shapes smoothness. Then, we perform +adversarial augmentation by applying the learned sample-independent vectors to +the available objects when training a model. We conduct extensive experiments +across a variety of scenarios on data from KITTI, Waymo, and CrashD for 3D +object detection, and on data from SemanticKITTI, Waymo, and nuScenes for 3D +semantic segmentation. Despite training on a standard single dataset, our +approach substantially improves the robustness and generalization of both 3D +object detection and 3D semantic segmentation methods to out-of-domain data. + +
+
+ comment: 37 pages, 12 figures +
+
+
+
+
+ + ☆ An Adaptive Tangent Feature Perspective of Neural Networks + + +
+ In order to better understand feature learning in neural networks, we propose +a framework for understanding linear models in tangent feature space where the +features are allowed to be transformed during training. We consider linear +transformations of features, resulting in a joint optimization over parameters +and transformations with a bilinear interpolation constraint. We show that this +optimization problem has an equivalent linearly constrained optimization with +structured regularization that encourages approximately low rank solutions. +Specializing to neural network structure, we gain insights into how the +features and thus the kernel function change, providing additional nuance to +the phenomenon of kernel alignment when the target function is poorly +represented using tangent features. In addition to verifying our theoretical +observations in real neural networks on a simple regression problem, we +empirically show that an adaptive feature implementation of tangent feature +classification has an order of magnitude lower sample complexity than the fixed +tangent feature model on MNIST and CIFAR-10. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ A General-Purpose Self-Supervised Model for Computational Pathology + + +
+ Tissue phenotyping is a fundamental computational pathology (CPath) task in +learning objective characterizations of histopathologic biomarkers in anatomic +pathology. However, whole-slide imaging (WSI) poses a complex computer vision +problem in which the large-scale image resolutions of WSIs and the enormous +diversity of morphological phenotypes preclude large-scale data annotation. +Current efforts have proposed using pretrained image encoders with either +transfer learning from natural image datasets or self-supervised pretraining on +publicly-available histopathology datasets, but have not been extensively +developed and evaluated across diverse tissue types at scale. We introduce UNI, +a general-purpose self-supervised model for pathology, pretrained using over +100 million tissue patches from over 100,000 diagnostic haematoxylin and +eosin-stained WSIs across 20 major tissue types, and evaluated on 33 +representative CPath clinical tasks in CPath of varying diagnostic +difficulties. In addition to outperforming previous state-of-the-art models, we +demonstrate new modeling capabilities in CPath such as resolution-agnostic +tissue classification, slide classification using few-shot class prototypes, +and disease subtyping generalization in classifying up to 108 cancer types in +the OncoTree code classification system. UNI advances unsupervised +representation learning at scale in CPath in terms of both pretraining data and +downstream evaluation, enabling data-efficient AI models that can generalize +and transfer to a gamut of diagnostically-challenging tasks and clinical +workflows in anatomic pathology. + +
+
+
+
+
+ + ☆ Learning Modulated Transformation in GANs + + +
+ The success of style-based generators largely benefits from style modulation, +which helps take care of the cross-instance variation within data. However, the +instance-wise stochasticity is typically introduced via regular convolution, +where kernels interact with features at some fixed locations, limiting its +capacity for modeling geometric variation. To alleviate this problem, we equip +the generator in generative adversarial networks (GANs) with a plug-and-play +module, termed as modulated transformation module (MTM). This module predicts +spatial offsets under the control of latent codes, based on which the +convolution operation can be applied at variable locations for different +instances, and hence offers the model an additional degree of freedom to handle +geometry deformation. Extensive experiments suggest that our approach can be +faithfully generalized to various generative tasks, including image generation, +3D-aware image synthesis, and video generation, and get compatible with +state-of-the-art frameworks without any hyper-parameter tuning. It is +noteworthy that, towards human generation on the challenging TaiChi dataset, we +improve the FID of StyleGAN3 from 21.36 to 13.60, demonstrating the efficacy of +learning modulated geometry transformation. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Multimodal Contrastive Learning and Tabular Attention for Automated + Alzheimer's Disease Prediction + + +
+ Alongside neuroimaging such as MRI scans and PET, Alzheimer's disease (AD) +datasets contain valuable tabular data including AD biomarkers and clinical +assessments. Existing computer vision approaches struggle to utilize this +additional information. To address these needs, we propose a generalizable +framework for multimodal contrastive learning of image data and tabular data, a +novel tabular attention module for amplifying and ranking salient features in +tables, and the application of these techniques onto Alzheimer's disease +prediction. Experimental evaulations demonstrate the strength of our framework +by detecting Alzheimer's disease (AD) from over 882 MR image slices from the +ADNI database. We take advantage of the high interpretability of tabular data +and our novel tabular attention approach and through attribution of the +attention scores for each row of the table, we note and rank the most +predominant features. Results show that the model is capable of an accuracy of +over 83.8%, almost a 10% increase from previous state of the art. + +
+
+
+
+
+ + ☆ Input margins can predict generalization too + + +
+ Understanding generalization in deep neural networks is an active area of +research. A promising avenue of exploration has been that of margin +measurements: the shortest distance to the decision boundary for a given sample +or its representation internal to the network. While margins have been shown to +be correlated with the generalization ability of a model when measured at its +hidden representations (hidden margins), no such link between large margins and +generalization has been established for input margins. We show that while input +margins are not generally predictive of generalization, they can be if the +search space is appropriately constrained. We develop such a measure based on +input margins, which we refer to as `constrained margins'. The predictive power +of this new measure is demonstrated on the 'Predicting Generalization in Deep +Learning' (PGDL) dataset and contrasted with hidden representation margins. We +find that constrained margins achieve highly competitive scores and outperform +other margin measurements in general. This provides a novel insight on the +relationship between generalization and classification margins, and highlights +the importance of considering the data manifold for investigations of +generalization in DNNs. + +
+
+
+
+
+ + ☆ Online Overexposed Pixels Hallucination in Videos with Adaptive + Reference Frame Selection + + +
+ Low dynamic range (LDR) cameras cannot deal with wide dynamic range inputs, +frequently leading to local overexposure issues. We present a learning-based +system to reduce these artifacts without resorting to complex acquisition +mechanisms like alternating exposures or costly processing that are typical of +high dynamic range (HDR) imaging. We propose a transformer-based deep neural +network (DNN) to infer the missing HDR details. In an ablation study, we show +the importance of using a multiscale DNN and train it with the proper cost +function to achieve state-of-the-art quality. To aid the reconstruction of the +overexposed areas, our DNN takes a reference frame from the past as an +additional input. This leverages the commonly occurring temporal instabilities +of autoexposure to our advantage: since well-exposed details in the current +frame may be overexposed in the future, we use reinforcement learning to train +a reference frame selection DNN that decides whether to adopt the current frame +as a future reference. Without resorting to alternating exposures, we obtain +therefore a causal, HDR hallucination algorithm with potential application in +common video acquisition settings. Our demo video can be found at +https://drive.google.com/file/d/1-r12BKImLOYCLUoPzdebnMyNjJ4Rk360/view + +
+
+ comment: The demo video can be found at + https://drive.google.com/file/d/1-r12BKImLOYCLUoPzdebnMyNjJ4Rk360/view +
+
+
+
+
+ + ☆ Canonical Factors for Hybrid Neural Fields ICCV 2023 + + +
+ Factored feature volumes offer a simple way to build more compact, efficient, +and intepretable neural fields, but also introduce biases that are not +necessarily beneficial for real-world data. In this work, we (1) characterize +the undesirable biases that these architectures have for axis-aligned signals +-- they can lead to radiance field reconstruction differences of as high as 2 +PSNR -- and (2) explore how learning a set of canonicalizing transformations +can improve representations by removing these biases. We prove in a +two-dimensional model problem that simultaneously learning these +transformations together with scene appearance succeeds with drastically +improved efficiency. We validate the resulting architectures, which we call +TILTED, using image, signed distance, and radiance field reconstruction tasks, +where we observe improvements across quality, robustness, compactness, and +runtime. Results demonstrate that TILTED can enable capabilities comparable to +baselines that are 2x larger, while highlighting weaknesses of neural field +evaluation procedures. + +
+
+ comment: ICCV 2023. Project webpage: https://brentyi.github.io/tilted/ +
+
+
+
+
+ + ☆ Pseudo-Boolean Polynomials Approach To Edge Detection And Image + Segmentation + + +
+ We introduce a deterministic approach to edge detection and image +segmentation by formulating pseudo-Boolean polynomials on image patches. The +approach works by applying a binary classification of blob and edge regions in +an image based on the degrees of pseudo-Boolean polynomials calculated on +patches extracted from the provided image. We test our method on simple images +containing primitive shapes of constant and contrasting colour and establish +the feasibility before applying it to complex instances like aerial landscape +images. The proposed method is based on the exploitation of the reduction, +polynomial degree, and equivalence properties of penalty-based pseudo-Boolean +polynomials. + +
+
+ comment: 14 pages, 8 figures, submitted to the International Conference Data + Analysis, Optimization and Their Applications on the Occasion of Boris + Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region, + Moscow Institute of Physics and Technology + https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php +
+
+
+
+
+ + ☆ Complementing Onboard Sensors with Satellite Map: A New Perspective for + HD Map Construction + + +
+ High-Definition (HD) maps play a crucial role in autonomous driving systems. +Recent methods have attempted to construct HD maps in real-time based on +information obtained from vehicle onboard sensors. However, the performance of +these methods is significantly susceptible to the environment surrounding the +vehicle due to the inherent limitation of onboard sensors, such as weak +capacity for long-range detection. In this study, we demonstrate that +supplementing onboard sensors with satellite maps can enhance the performance +of HD map construction methods, leveraging the broad coverage capability of +satellite maps. For the purpose of further research, we release the satellite +map tiles as a complementary dataset of nuScenes dataset. Meanwhile, we propose +a hierarchical fusion module that enables better fusion of satellite maps +information with existing methods. Specifically, we design an attention mask +based on segmentation and distance, applying the cross-attention mechanism to +fuse onboard Bird's Eye View (BEV) features and satellite features in +feature-level fusion. An alignment module is introduced before concatenation in +BEV-level fusion to mitigate the impact of misalignment between the two +features. The experimental results on the augmented nuScenes dataset showcase +the seamless integration of our module into three existing HD map construction +methods. It notably enhances their performance in both HD map semantic +segmentation and instance detection tasks. + +
+
+
+
+
+ + ☆ WrappingNet: Mesh Autoencoder via Deep Sphere Deformation + + +
+ There have been recent efforts to learn more meaningful representations via +fixed length codewords from mesh data, since a mesh serves as a complete model +of underlying 3D shape compared to a point cloud. However, the mesh +connectivity presents new difficulties when constructing a deep learning +pipeline for meshes. Previous mesh unsupervised learning approaches typically +assume category-specific templates, e.g., human face/body templates. It +restricts the learned latent codes to only be meaningful for objects in a +specific category, so the learned latent spaces are unable to be used across +different types of objects. In this work, we present WrappingNet, the first +mesh autoencoder enabling general mesh unsupervised learning over heterogeneous +objects. It introduces a novel base graph in the bottleneck dedicated to +representing mesh connectivity, which is shown to facilitate learning a shared +latent space representing object shape. The superiority of WrappingNet mesh +learning is further demonstrated via improved reconstruction quality and +competitive classification compared to point cloud learning, as well as latent +interpolation between meshes of different categories. + +
+
+
+
+
+ + ☆ Robust Long-Tailed Learning via Label-Aware Bounded CVaR + + +
+ Data in the real-world classification problems are always imbalanced or +long-tailed, wherein the majority classes have the most of the samples that +dominate the model training. In such setting, the naive model tends to have +poor performance on the minority classes. Previously, a variety of loss +modifications have been proposed to address the long-tailed leaning problem, +while these methods either treat the samples in the same class +indiscriminatingly or lack a theoretical guarantee. In this paper, we propose +two novel approaches based on CVaR (Conditional Value at Risk) to improve the +performance of long-tailed learning with a solid theoretical ground. +Specifically, we firstly introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss +to overcome the pessimistic result of the original CVaR, and further design the +optimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we +additionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to +stabilize the optimization process, where we also offer the theoretical +support. Extensive experiments on real-world datasets with long-tailed label +distributions verify the superiority of our proposed methods. + +
+
+
+
+
+ + ☆ Color Aesthetics: Fuzzy based User-driven Method for Harmony and + Preference Prediction SC + + +
+ Color is the most important intrinsic sensory feature that has a powerful +impact on product sales. Color is even responsible for raising the aesthetic +senses in our brains. Account for individual differences is crucial in color +aesthetics. It requires user-driven mechanisms for various e-commerce +applications. We propose a method for quantitative evaluation of all types of +perceptual responses to color(s): distinct color preference, color harmony, and +color combination preference. Preference for color schemes can be predicted by +combining preferences for the basic colors and ratings of color harmony. +Harmonious pallets are extracted from big data set using comparison algorithms +based on fuzzy similarity and grouping. The proposed model results in useful +predictions of harmony and preference of multicolored images. For example, in +the context of apparel coordination, it allows predicting a preference for a +look based on clothing colors. Our approach differs from standard aesthetic +models, since in accounts for a personal variation. In addition, it can process +not only lower-order color pairs, but also groups of several colors. + +
+
+ comment: It was accepted as a short paper. IFSA-SCIS 2017 Conference held in + Otsu, Japan +
+
+
+
+
+ + ☆ Shape-Margin Knowledge Augmented Network for Thyroid Nodule Segmentation + and Diagnosis + + +
+ Thyroid nodule segmentation is a crucial step in the diagnostic procedure of +physicians and computer-aided diagnosis systems. Mostly, current studies treat +segmentation and diagnosis as independent tasks without considering the +correlation between these tasks. The sequence steps of these independent tasks +in computer-aided diagnosis systems may lead to the accumulation of errors. +Therefore, it is worth combining them as a whole through exploring the +relationship between thyroid nodule segmentation and diagnosis. According to +the thyroid imaging reporting and data system (TI-RADS), the assessment of +shape and margin characteristics is the prerequisite for the discrimination of +benign and malignant thyroid nodules. These characteristics can be observed in +the thyroid nodule segmentation masks. Inspired by the diagnostic procedure of +TI-RADS, this paper proposes a shape-margin knowledge augmented network +(SkaNet) for simultaneously thyroid nodule segmentation and diagnosis. Due to +the similarity in visual features between segmentation and diagnosis, SkaNet +shares visual features in the feature extraction stage and then utilizes a +dual-branch architecture to perform thyroid nodule segmentation and diagnosis +tasks simultaneously. To enhance effective discriminative features, an +exponential mixture module is devised, which incorporates convolutional feature +maps and self-attention maps by exponential weighting. Then, SkaNet is jointly +optimized by a knowledge augmented multi-task loss function with a constraint +penalty term. It embeds shape and margin characteristics through numerical +computation and models the relationship between the thyroid nodule diagnosis +results and segmentation masks. + +
+
+
+
+
+ + ☆ On the Robustness of Object Detection Models in Aerial Images + + +
+ The robustness of object detection models is a major concern when applied to +real-world scenarios. However, the performance of most object detection models +degrades when applied to images subjected to corruptions, since they are +usually trained and evaluated on clean datasets. Enhancing the robustness of +object detection models is of utmost importance, especially for those designed +for aerial images, which feature complex backgrounds, substantial variations in +scales and orientations of objects. This paper addresses the challenge of +assessing the robustness of object detection models in aerial images, with a +specific emphasis on scenarios where images are affected by clouds. In this +study, we introduce two novel benchmarks based on DOTA-v1.0. The first +benchmark encompasses 19 prevalent corruptions, while the second focuses on +cloud-corrupted images-a phenomenon uncommon in natural pictures yet frequent +in aerial photography. We systematically evaluate the robustness of mainstream +object detection models and perform numerous ablation experiments. Through our +investigations, we find that enhanced model architectures, larger networks, +well-crafted modules, and judicious data augmentation strategies collectively +enhance the robustness of aerial object detection models. The benchmarks we +propose and our comprehensive experimental analyses can facilitate research on +robust object detection in aerial images. Codes and datasets are available at: +(https://github.com/hehaodong530/DOTA-C) + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Efficient Model Personalization in Federated Learning via + Client-Specific Prompt Generation ICCV 2023 + + +
+ Federated learning (FL) emerges as a decentralized learning framework which +trains models from multiple distributed clients without sharing their data to +preserve privacy. Recently, large-scale pre-trained models (e.g., Vision +Transformer) have shown a strong capability of deriving robust representations. +However, the data heterogeneity among clients, the limited computation +resources, and the communication bandwidth restrict the deployment of +large-scale models in FL frameworks. To leverage robust representations from +large-scale models while enabling efficient model personalization for +heterogeneous clients, we propose a novel personalized FL framework of +client-specific Prompt Generation (pFedPG), which learns to deploy a +personalized prompt generator at the server for producing client-specific +visual prompts that efficiently adapts frozen backbones to local data +distributions. Our proposed framework jointly optimizes the stages of +personalized prompt adaptation locally and personalized prompt generation +globally. The former aims to train visual prompts that adapt foundation models +to each client, while the latter observes local optimization directions to +generate personalized prompts for all clients. Through extensive experiments on +benchmark datasets, we show that our pFedPG is favorable against +state-of-the-art personalized FL methods under various types of data +heterogeneity, allowing computation and communication efficient model +personalization. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ AnomalyGPT: Detecting Industrial Anomalies using Large Vision-Language + Models + + +
+ Large Vision-Language Models (LVLMs) such as MiniGPT-4 and LLaVA have +demonstrated the capability of understanding images and achieved remarkable +performance in various visual tasks. Despite their strong abilities in +recognizing common objects due to extensive training datasets, they lack +specific domain knowledge and have a weaker understanding of localized details +within objects, which hinders their effectiveness in the Industrial Anomaly +Detection (IAD) task. On the other hand, most existing IAD methods only provide +anomaly scores and necessitate the manual setting of thresholds to distinguish +between normal and abnormal samples, which restricts their practical +implementation. In this paper, we explore the utilization of LVLM to address +the IAD problem and propose AnomalyGPT, a novel IAD approach based on LVLM. We +generate training data by simulating anomalous images and producing +corresponding textual descriptions for each image. We also employ an image +decoder to provide fine-grained semantic and design a prompt learner to +fine-tune the LVLM using prompt embeddings. Our AnomalyGPT eliminates the need +for manual threshold adjustments, thus directly assesses the presence and +locations of anomalies. Additionally, AnomalyGPT supports multi-turn dialogues +and exhibits impressive few-shot in-context learning capabilities. With only +one normal shot, AnomalyGPT achieves the state-of-the-art performance with an +accuracy of 86.1%, an image-level AUC of 94.1%, and a pixel-level AUC of 95.3% +on the MVTec-AD dataset. Code is available at +https://github.com/CASIA-IVA-Lab/AnomalyGPT. + +
+
+
+
+
+ + ☆ Ego-Motion Estimation and Dynamic Motion Separation from 3D Point Clouds + for Accumulating Data and Improving 3D Object Detection + + +
+ New 3+1D high-resolution radar sensors are gaining importance for 3D object +detection in the automotive domain due to their relative affordability and +improved detection compared to classic low-resolution radar sensors. One +limitation of high-resolution radar sensors, compared to lidar sensors, is the +sparsity of the generated point cloud. This sparsity could be partially +overcome by accumulating radar point clouds of subsequent time steps. This +contribution analyzes limitations of accumulating radar point clouds on the +View-of-Delft dataset. By employing different ego-motion estimation approaches, +the dataset's inherent constraints, and possible solutions are analyzed. +Additionally, a learning-based instance motion estimation approach is deployed +to investigate the influence of dynamic motion on the accumulated point cloud +for object detection. Experiments document an improved object detection +performance by applying an ego-motion estimation and dynamic motion correction +approach. + +
+
+ comment: Published at: AmE 2023 - Automotive meets Electronics; 14. GMM + Symposium (https://ieeexplore.ieee.org/document/10227711) +
+
+
+
+
+ + ☆ Detect, Augment, Compose, and Adapt: Four Steps for Unsupervised Domain + Adaptation in Object Detection + + +
+ Unsupervised domain adaptation (UDA) plays a crucial role in object detection +when adapting a source-trained detector to a target domain without annotated +data. In this paper, we propose a novel and effective four-step UDA approach +that leverages self-supervision and trains source and target data concurrently. +We harness self-supervised learning to mitigate the lack of ground truth in the +target domain. Our method consists of the following steps: (1) identify the +region with the highest-confidence set of detections in each target image, +which serve as our pseudo-labels; (2) crop the identified region and generate a +collection of its augmented versions; (3) combine these latter into a composite +image; (4) adapt the network to the target domain using the composed image. +Through extensive experiments under cross-camera, cross-weather, and +synthetic-to-real scenarios, our approach achieves state-of-the-art +performance, improving upon the nearest competitor by more than 2% in terms of +mean Average Precision (mAP). The code is available at +https://github.com/MohamedTEV/DACA. + +
+
+
+
+
+ + ☆ Enhancing Mobile Face Anti-Spoofing: A Robust Framework for Diverse + Attack Types under Screen Flash + + +
+ Face anti-spoofing (FAS) is crucial for securing face recognition systems. +However, existing FAS methods with handcrafted binary or pixel-wise labels have +limitations due to diverse presentation attacks (PAs). In this paper, we +propose an attack type robust face anti-spoofing framework under light flash, +called ATR-FAS. Due to imaging differences caused by various attack types, +traditional FAS methods based on single binary classification network may +result in excessive intra-class distance of spoof faces, leading to a challenge +of decision boundary learning. Therefore, we employed multiple networks to +reconstruct multi-frame depth maps as auxiliary supervision, and each network +experts in one type of attack. A dual gate module (DGM) consisting of a type +gate and a frame-attention gate is introduced, which perform attack type +recognition and multi-frame attention generation, respectively. The outputs of +DGM are utilized as weight to mix the result of multiple expert networks. The +multi-experts mixture enables ATR-FAS to generate spoof-differentiated depth +maps, and stably detects spoof faces without being affected by different types +of PAs. Moreover, we design a differential normalization procedure to convert +original flash frames into differential frames. This simple but effective +processing enhances the details in flash frames, aiding in the generation of +depth maps. To verify the effectiveness of our framework, we collected a +large-scale dataset containing 12,660 live and spoof videos with diverse PAs +under dynamic flash from the smartphone screen. Extensive experiments +illustrate that the proposed ATR-FAS significantly outperforms existing +state-of-the-art methods. The code and dataset will be available at +https://github.com/Chaochao-Lin/ATR-FAS. + +
+
+
+
+
+ + ☆ IndGIC: Supervised Action Recognition under Low Illumination + + +
+ Technologies of human action recognition in the dark are gaining more and +more attention as huge demand in surveillance, motion control and +human-computer interaction. However, because of limitation in image enhancement +method and low-lighting video datasets, e.g. labeling cost, existing methods +meet some problems. Some video-based approached are effect and efficient in +specific datasets but cannot generalize to most cases while others methods +using multiple sensors rely heavily to prior knowledge to deal with noisy +nature from video stream. In this paper, we proposes action recognition method +using deep multi-input network. Furthermore, we proposed a Independent Gamma +Intensity Corretion (Ind-GIC) to enhance poor-illumination video, generating +one gamma for one frame to increase enhancement performance. To prove our +method is effective, there is some evaluation and comparison between our method +and existing methods. Experimental results show that our model achieves high +accuracy in on ARID dataset. + +
+
+
+
+
+ + ☆ Imperceptible Adversarial Attack on Deep Neural Networks from Image + Boundary + + +
+ Although Deep Neural Networks (DNNs), such as the convolutional neural +networks (CNN) and Vision Transformers (ViTs), have been successfully applied +in the field of computer vision, they are demonstrated to be vulnerable to +well-sought Adversarial Examples (AEs) that can easily fool the DNNs. The +research in AEs has been active, and many adversarial attacks and explanations +have been proposed since they were discovered in 2014. The mystery of the AE's +existence is still an open question, and many studies suggest that DNN training +algorithms have blind spots. The salient objects usually do not overlap with +boundaries; hence, the boundaries are not the DNN model's attention. +Nevertheless, recent studies show that the boundaries can dominate the behavior +of the DNN models. Hence, this study aims to look at the AEs from a different +perspective and proposes an imperceptible adversarial attack that systemically +attacks the input image boundary for finding the AEs. The experimental results +have shown that the proposed boundary attacking method effectively attacks six +CNN models and the ViT using only 32% of the input image content (from the +boundaries) with an average success rate (SR) of 95.2% and an average peak +signal-to-noise ratio of 41.37 dB. Correlation analyses are conducted, +including the relation between the adversarial boundary's width and the SR and +how the adversarial boundary changes the DNN model's attention. This paper's +discoveries can potentially advance the understanding of AEs and provide a +different perspective on how AEs can be constructed. + +
+
+
+
+
+ + ☆ Enhancing Robot Learning through Learned Human-Attention Feature Maps ICRA 2023 + + +
+ Robust and efficient learning remains a challenging problem in robotics, in +particular with complex visual inputs. Inspired by human attention mechanism, +with which we quickly process complex visual scenes and react to changes in the +environment, we think that embedding auxiliary information about focus point +into robot learning would enhance efficiency and robustness of the learning +process. In this paper, we propose a novel approach to model and emulate the +human attention with an approximate prediction model. We then leverage this +output and feed it as a structured auxiliary feature map into downstream +learning tasks. We validate this idea by learning a prediction model from +human-gaze recordings of manual driving in the real world. We test our approach +on two learning tasks - object detection and imitation learning. Our +experiments demonstrate that the inclusion of predicted human attention leads +to improved robustness of the trained models to out-of-distribution samples and +faster learning in low-data regime settings. Our work highlights the potential +of incorporating structured auxiliary information in representation learning +for robotics and opens up new avenues for research in this direction. All code +and data are available online. + +
+
+ comment: This work has been accepted for the RAP4Robots workshop at ICRA 2023 + in London +
+
+
+
+
+ + ☆ Occlusion-Aware Deep Convolutional Neural Network via Homogeneous + Tanh-transforms for Face Parsing + + +
+ Face parsing infers a pixel-wise label map for each semantic facial +component. Previous methods generally work well for uncovered faces, however +overlook the facial occlusion and ignore some contextual area outside a single +face, especially when facial occlusion has become a common situation during the +COVID-19 epidemic. Inspired by the illumination theory of image, we propose a +novel homogeneous tanh-transforms for image preprocessing, which made up of +four tanh-transforms, that fuse the central vision and the peripheral vision +together. Our proposed method addresses the dilemma of face parsing under +occlusion and compresses more information of surrounding context. Based on +homogeneous tanh-transforms, we propose an occlusion-aware convolutional neural +network for occluded face parsing. It combines the information both in +Tanh-polar space and Tanh-Cartesian space, capable of enhancing receptive +fields. Furthermore, we introduce an occlusion-aware loss to focus on the +boundaries of occluded regions. The network is simple and flexible, and can be +trained end-to-end. To facilitate future research of occluded face parsing, we +also contribute a new cleaned face parsing dataset, which is manually purified +from several academic or industrial datasets, including CelebAMask-HQ, +Short-video Face Parsing as well as Helen dataset and will make it public. +Experiments demonstrate that our method surpasses state-of-art methods of face +parsing under occlusion. + +
+
+
+
+
+ + ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their 'exposure bias' problem, described as the input mismatch between training +and sampling, lacks in-depth exploration. In this paper, we systematically +investigate the exposure bias problem in diffusion models by first analytically +modelling the sampling distribution, based on which we then attribute the +prediction error at each sampling step as the root cause of the exposure bias +issue. Furthermore, we discuss potential solutions to this issue and propose an +intuitive metric for it. Along with the elucidation of exposure bias, we +propose a simple, yet effective, training-free method called Epsilon Scaling to +alleviate the exposure bias. We show that Epsilon Scaling explicitly moves the +sampling trajectory closer to the vector field learned in the training phase by +scaling down the network output (Epsilon), mitigating the input mismatch +between training and sampling. Experiments on various diffusion frameworks +(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and +deterministic vs. stochastic sampling verify the effectiveness of our method. + +
+
+ comment: 7 pages, code available soon +
+
+
+
+
+ + ☆ 3D-MuPPET: 3D Multi-Pigeon Pose Estimation and Tracking + + +
+ Markerless methods for animal posture tracking have been developing recently, +but frameworks and benchmarks for tracking large animal groups in 3D are still +lacking. To overcome this gap in the literature, we present 3D-MuPPET, a +framework to estimate and track 3D poses of up to 10 pigeons at interactive +speed using multiple-views. We train a pose estimator to infer 2D keypoints and +bounding boxes of multiple pigeons, then triangulate the keypoints to 3D. For +correspondence matching, we first dynamically match 2D detections to global +identities in the first frame, then use a 2D tracker to maintain +correspondences accross views in subsequent frames. We achieve comparable +accuracy to a state of the art 3D pose estimator for Root Mean Square Error +(RMSE) and Percentage of Correct Keypoints (PCK). We also showcase a novel use +case where our model trained with data of single pigeons provides comparable +results on data containing multiple pigeons. This can simplify the domain shift +to new species because annotating single animal data is less labour intensive +than multi-animal data. Additionally, we benchmark the inference speed of +3D-MuPPET, with up to 10 fps in 2D and 1.5 fps in 3D, and perform quantitative +tracking evaluation, which yields encouraging results. Finally, we show that +3D-MuPPET also works in natural environments without model fine-tuning on +additional annotations. To the best of our knowledge we are the first to +present a framework for 2D/3D posture and trajectory tracking that works in +both indoor and outdoor environments. + +
+
+
+
+
+ + ☆ Spatio-temporal MLP-graph network for 3D human pose estimation + + +
+ Graph convolutional networks and their variants have shown significant +promise in 3D human pose estimation. Despite their success, most of these +methods only consider spatial correlations between body joints and do not take +into account temporal correlations, thereby limiting their ability to capture +relationships in the presence of occlusions and inherent ambiguity. To address +this potential weakness, we propose a spatio-temporal network architecture +composed of a joint-mixing multi-layer perceptron block that facilitates +communication among different joints and a graph weighted Jacobi network block +that enables communication among various feature channels. The major novelty of +our approach lies in a new weighted Jacobi feature propagation rule obtained +through graph filtering with implicit fairing. We leverage temporal information +from the 2D pose sequences, and integrate weight modulation into the model to +enable untangling of the feature transformations of distinct nodes. We also +employ adjacency modulation with the aim of learning meaningful correlations +beyond defined linkages between body joints by altering the graph topology +through a learnable modulation matrix. Extensive experiments on two benchmark +datasets demonstrate the effectiveness of our model, outperforming recent +state-of-the-art methods for 3D human pose estimation. + +
+
+
+
+
+ + ☆ MSFlow: Multi-Scale Flow-based Framework for Unsupervised Anomaly + Detection + + +
+ Unsupervised anomaly detection (UAD) attracts a lot of research interest and +drives widespread applications, where only anomaly-free samples are available +for training. Some UAD applications intend to further locate the anomalous +regions without any anomaly information. + Although the absence of anomalous samples and annotations deteriorates the +UAD performance, an inconspicuous yet powerful statistics model, the +normalizing flows, is appropriate for anomaly detection and localization in an +unsupervised fashion. The flow-based probabilistic models, only trained on +anomaly-free data, can efficiently distinguish unpredictable anomalies by +assigning them much lower likelihoods than normal data. + Nevertheless, the size variation of unpredictable anomalies introduces +another inconvenience to the flow-based methods for high-precision anomaly +detection and localization. To generalize the anomaly size variation, we +propose a novel Multi-Scale Flow-based framework dubbed MSFlow composed of +asymmetrical parallel flows followed by a fusion flow to exchange multi-scale +perceptions. Moreover, different multi-scale aggregation strategies are adopted +for image-wise anomaly detection and pixel-wise anomaly localization according +to the discrepancy between them. The proposed MSFlow is evaluated on three +anomaly detection datasets, significantly outperforming existing methods. +Notably, on the challenging MVTec AD benchmark, our MSFlow achieves a new +state-of-the-art with a detection AUORC score of up to 99.7%, localization +AUCROC score of 98.8%, and PRO score of 97.1%. The reproducible code is +available at https://github.com/cool-xuan/msflow. + +
+
+
+
+
+ + ☆ ARTxAI: Explainable Artificial Intelligence Curates Deep Representation + Learning for Artistic Images using Fuzzy Techniques + + +
+ Automatic art analysis employs different image processing techniques to +classify and categorize works of art. When working with artistic images, we +need to take into account further considerations compared to classical image +processing. This is because such artistic paintings change drastically +depending on the author, the scene depicted, and their artistic style. This can +result in features that perform very well in a given task but do not grasp the +whole of the visual and symbolic information contained in a painting. In this +paper, we show how the features obtained from different tasks in artistic image +classification are suitable to solve other ones of similar nature. We present +different methods to improve the generalization capabilities and performance of +artistic classification systems. Furthermore, we propose an explainable +artificial intelligence method to map known visual traits of an image with the +features used by the deep learning model considering fuzzy rules. These rules +show the patterns and variables that are relevant to solve each task and how +effective is each of the patterns found. Our results show that our proposed +context-aware features can achieve up to $6\%$ and $26\%$ more accurate results +than other context- and non-context-aware solutions, respectively, depending on +the specific task. We also show that some of the features used by these models +can be more clearly correlated to visual traits in the original image than +others. + +
+
+
+
+
+ + ☆ ADFA: Attention-augmented Differentiable top-k Feature Adaptation for + Unsupervised Medical Anomaly Detection + + +
+ The scarcity of annotated data, particularly for rare diseases, limits the +variability of training data and the range of detectable lesions, presenting a +significant challenge for supervised anomaly detection in medical imaging. To +solve this problem, we propose a novel unsupervised method for medical image +anomaly detection: Attention-Augmented Differentiable top-k Feature Adaptation +(ADFA). The method utilizes Wide-ResNet50-2 (WR50) network pre-trained on +ImageNet to extract initial feature representations. To reduce the channel +dimensionality while preserving relevant channel information, we employ an +attention-augmented patch descriptor on the extracted features. We then apply +differentiable top-k feature adaptation to train the patch descriptor, mapping +the extracted feature representations to a new vector space, enabling effective +detection of anomalies. Experiments show that ADFA outperforms state-of-the-art +(SOTA) methods on multiple challenging medical image datasets, confirming its +effectiveness in medical anomaly detection. + +
+
+
+
+
+ + ☆ Cross-Modal Retrieval Meets Inference:Improving Zero-Shot Classification + with Cross-Modal Retrieval + + +
+ Contrastive language-image pre-training (CLIP) has demonstrated remarkable +zero-shot classification ability, namely image classification using novel text +labels. Existing works have attempted to enhance CLIP by fine-tuning on +downstream tasks, but these have inadvertently led to performance degradation +on unseen classes, thus harming zero-shot generalization. This paper aims to +address this challenge by leveraging readily available image-text pairs from an +external dataset for cross-modal guidance during inference. To this end, we +propose X-MoRe, a novel inference method comprising two key steps: (1) +cross-modal retrieval and (2) modal-confidence-based ensemble. Given a query +image, we harness the power of CLIP's cross-modal representations to retrieve +relevant textual information from an external image-text pair dataset. Then, we +assign higher weights to the more reliable modality between the original query +image and retrieved text, contributing to the final prediction. X-MoRe +demonstrates robust performance across a diverse set of tasks without the need +for additional training, showcasing the effectiveness of utilizing cross-modal +features to maximize CLIP's zero-shot ability. + +
+
+
+
+
+ + ☆ NOVIS: A Case for End-to-End Near-Online Video Instance Segmentation + + +
+ Until recently, the Video Instance Segmentation (VIS) community operated +under the common belief that offline methods are generally superior to a frame +by frame online processing. However, the recent success of online methods +questions this belief, in particular, for challenging and long video sequences. +We understand this work as a rebuttal of those recent observations and an +appeal to the community to focus on dedicated near-online VIS approaches. To +support our argument, we present a detailed analysis on different processing +paradigms and the new end-to-end trainable NOVIS (Near-Online Video Instance +Segmentation) method. Our transformer-based model directly predicts +spatio-temporal mask volumes for clips of frames and performs instance tracking +between clips via overlap embeddings. NOVIS represents the first near-online +VIS approach which avoids any handcrafted tracking heuristics. We outperform +all existing VIS methods by large margins and provide new state-of-the-art +results on both YouTube-VIS (2019/2021) and the OVIS benchmarks. + +
+
+
+
+
+ + ☆ Enhancing OCR Performance through Post-OCR Models: Adopting Glyph + Embedding for Improved Correction + + +
+ The study investigates the potential of post-OCR models to overcome +limitations in OCR models and explores the impact of incorporating glyph +embedding on post-OCR correction performance. In this study, we have developed +our own post-OCR correction model. The novelty of our approach lies in +embedding the OCR output using CharBERT and our unique embedding technique, +capturing the visual characteristics of characters. Our findings show that +post-OCR correction effectively addresses deficiencies in inferior OCR models, +and glyph embedding enables the model to achieve superior results, including +the ability to correct individual words. + +
+
+
+
+
+ + ☆ Rotation Augmented Distillation for Exemplar-Free Class Incremental + Learning with Detailed Analysis + + +
+ Class incremental learning (CIL) aims to recognize both the old and new +classes along the increment tasks. Deep neural networks in CIL suffer from +catastrophic forgetting and some approaches rely on saving exemplars from +previous tasks, known as the exemplar-based setting, to alleviate this problem. +On the contrary, this paper focuses on the Exemplar-Free setting with no old +class sample preserved. Balancing the plasticity and stability in deep feature +learning with only supervision from new classes is more challenging. Most +existing Exemplar-Free CIL methods report the overall performance only and lack +further analysis. In this work, different methods are examined with +complementary metrics in greater detail. Moreover, we propose a simple CIL +method, Rotation Augmented Distillation (RAD), which achieves one of the +top-tier performances under the Exemplar-Free setting. Detailed analysis shows +our RAD benefits from the superior balance between plasticity and stability. +Finally, more challenging exemplar-free settings with fewer initial classes are +undertaken for further demonstrations and comparisons among the +state-of-the-art methods. + +
+
+
+
+
+ + ☆ CLIPTrans: Transferring Visual Knowledge with Pre-trained Models for + Multimodal Machine Translation ICCV + + +
+ There has been a growing interest in developing multimodal machine +translation (MMT) systems that enhance neural machine translation (NMT) with +visual knowledge. This problem setup involves using images as auxiliary +information during training, and more recently, eliminating their use during +inference. Towards this end, previous works face a challenge in training +powerful MMT models from scratch due to the scarcity of annotated multilingual +vision-language data, especially for low-resource languages. Simultaneously, +there has been an influx of multilingual pre-trained models for NMT and +multimodal pre-trained models for vision-language tasks, primarily in English, +which have shown exceptional generalisation ability. However, these are not +directly applicable to MMT since they do not provide aligned multimodal +multilingual features for generative tasks. To alleviate this issue, instead of +designing complex modules for MMT, we propose CLIPTrans, which simply adapts +the independently pre-trained multimodal M-CLIP and the multilingual mBART. In +order to align their embedding spaces, mBART is conditioned on the M-CLIP +features by a prefix sequence generated through a lightweight mapping network. +We train this in a two-stage pipeline which warms up the model with image +captioning before the actual translation task. Through experiments, we +demonstrate the merits of this framework and consequently push forward the +state-of-the-art across standard benchmarks by an average of +2.67 BLEU. The +code can be found at www.github.com/devaansh100/CLIPTrans. + +
+
+ comment: 15 pages, 9 figures, to be published In Proceedings of International + Conference of Computer Vision(ICCV), 2023 +
+
+
+
+
+ + ☆ Optron: Better Medical Image Registration via Training in the Loop + + +
+ Previously, in the field of medical image registration, there are primarily +two paradigms, the traditional optimization-based methods, and the +deep-learning-based methods. Each of these paradigms has its advantages, and in +this work, we aim to take the best of both worlds. Instead of developing a new +deep learning model, we designed a robust training architecture that is simple +and generalizable. We present Optron, a general training architecture +incorporating the idea of training-in-the-loop. By iteratively optimizing the +prediction result of a deep learning model through a plug-and-play optimizer +module in the training loop, Optron introduces pseudo ground truth to an +unsupervised training process. And by bringing the training process closer to +that of supervised training, Optron can consistently improve the models' +performance and convergence speed. We evaluated our method on various +combinations of models and datasets, and we have achieved state-of-the-art +performance on the IXI dataset, improving the previous state-of-the-art method +TransMorph by a significant margin of +1.6% DSC. Moreover, Optron also +consistently achieved positive results with other models and datasets. It +increases the validation DSC for VoxelMorph and ViT-V-Net by +2.3% and +2.2% +respectively on IXI, demonstrating our method's generalizability. Our +implementation is publicly available at +https://github.com/miraclefactory/optron + +
+
+ comment: 10 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Is visual explanation with Grad-CAM more reliable for deeper neural + networks? a case study with automatic pneumothorax diagnosis + + +
+ While deep learning techniques have provided the state-of-the-art performance +in various clinical tasks, explainability regarding their decision-making +process can greatly enhance the credence of these methods for safer and quicker +clinical adoption. With high flexibility, Gradient-weighted Class Activation +Mapping (Grad-CAM) has been widely adopted to offer intuitive visual +interpretation of various deep learning models' reasoning processes in +computer-assisted diagnosis. However, despite the popularity of the technique, +there is still a lack of systematic study on Grad-CAM's performance on +different deep learning architectures. In this study, we investigate its +robustness and effectiveness across different popular deep learning models, +with a focus on the impact of the networks' depths and architecture types, by +using a case study of automatic pneumothorax diagnosis in X-ray scans. Our +results show that deeper neural networks do not necessarily contribute to a +strong improvement of pneumothorax diagnosis accuracy, and the effectiveness of +GradCAM also varies among different network architectures. + +
+
+
+
+
+ + ☆ A lightweight 3D dense facial landmark estimation model from position + map data + + +
+ The incorporation of 3D data in facial analysis tasks has gained popularity +in recent years. Though it provides a more accurate and detailed representation +of the human face, accruing 3D face data is more complex and expensive than 2D +face images. Either one has to rely on expensive 3D scanners or depth sensors +which are prone to noise. An alternative option is the reconstruction of 3D +faces from uncalibrated 2D images in an unsupervised way without any ground +truth 3D data. However, such approaches are computationally expensive and the +learned model size is not suitable for mobile or other edge device +applications. Predicting dense 3D landmarks over the whole face can overcome +this issue. As there is no public dataset available containing dense landmarks, +we propose a pipeline to create a dense keypoint training dataset containing +520 key points across the whole face from an existing facial position map data. +We train a lightweight MobileNet-based regressor model with the generated data. +As we do not have access to any evaluation dataset with dense landmarks in it +we evaluate our model against the 68 keypoint detection task. Experimental +results show that our trained model outperforms many of the existing methods in +spite of its lower model size and minimal computational cost. Also, the +qualitative evaluation shows the efficiency of our trained models in extreme +head pose angles as well as other facial variations and occlusions. + +
+
+ comment: 8 pages, The Irish Machine Vision and Image Processing + Conference(IMVIP) +
+
+
+
+
+ + ☆ Uncovering the Unseen: Discover Hidden Intentions by Micro-Behavior + Graph Reasoning + + +
+ This paper introduces a new and challenging Hidden Intention Discovery (HID) +task. Unlike existing intention recognition tasks, which are based on obvious +visual representations to identify common intentions for normal behavior, HID +focuses on discovering hidden intentions when humans try to hide their +intentions for abnormal behavior. HID presents a unique challenge in that +hidden intentions lack the obvious visual representations to distinguish them +from normal intentions. Fortunately, from a sociological and psychological +perspective, we find that the difference between hidden and normal intentions +can be reasoned from multiple micro-behaviors, such as gaze, attention, and +facial expressions. Therefore, we first discover the relationship between +micro-behavior and hidden intentions and use graph structure to reason about +hidden intentions. To facilitate research in the field of HID, we also +constructed a seminal dataset containing a hidden intention annotation of a +typical theft scenario for HID. Extensive experiments show that the proposed +network improves performance on the HID task by 9.9\% over the state-of-the-art +method SBP. + +
+
+
+
+
+ + ☆ A Multimodal Visual Encoding Model Aided by Introducing Verbal Semantic + Information + + +
+ Biological research has revealed that the verbal semantic information in the +brain cortex, as an additional source, participates in nonverbal semantic +tasks, such as visual encoding. However, previous visual encoding models did +not incorporate verbal semantic information, contradicting this biological +finding. This paper proposes a multimodal visual information encoding network +model based on stimulus images and associated textual information in response +to this issue. Our visual information encoding network model takes stimulus +images as input and leverages textual information generated by a text-image +generation model as verbal semantic information. This approach injects new +information into the visual encoding model. Subsequently, a Transformer network +aligns image and text feature information, creating a multimodal feature space. +A convolutional network then maps from this multimodal feature space to voxel +space, constructing the multimodal visual information encoding network model. +Experimental results demonstrate that the proposed multimodal visual +information encoding network model outperforms previous models under the exact +training cost. In voxel prediction of the left hemisphere of subject 1's brain, +the performance improves by approximately 15.87%, while in the right +hemisphere, the performance improves by about 4.6%. The multimodal visual +encoding network model exhibits superior encoding performance. Additionally, +ablation experiments indicate that our proposed model better simulates the +brain's visual information processing. + +
+
+
+
+
+ + ☆ Uncertainty Aware Training to Improve Deep Learning Model Calibration + for Classification of Cardiac MR Images + + +
+ Quantifying uncertainty of predictions has been identified as one way to +develop more trustworthy artificial intelligence (AI) models beyond +conventional reporting of performance metrics. When considering their role in a +clinical decision support setting, AI classification models should ideally +avoid confident wrong predictions and maximise the confidence of correct +predictions. Models that do this are said to be well-calibrated with regard to +confidence. However, relatively little attention has been paid to how to +improve calibration when training these models, i.e., to make the training +strategy uncertainty-aware. In this work we evaluate three novel +uncertainty-aware training strategies comparing against two state-of-the-art +approaches. We analyse performance on two different clinical applications: +cardiac resynchronisation therapy (CRT) response prediction and coronary artery +disease (CAD) diagnosis from cardiac magnetic resonance (CMR) images. The +best-performing model in terms of both classification accuracy and the most +common calibration measure, expected calibration error (ECE) was the Confidence +Weight method, a novel approach that weights the loss of samples to explicitly +penalise confident incorrect predictions. The method reduced the ECE by 17% for +CRT response prediction and by 22% for CAD diagnosis when compared to a +baseline classifier in which no uncertainty-aware strategy was included. In +both applications, as well as reducing the ECE there was a slight increase in +accuracy from 69% to 70% and 70% to 72% for CRT response prediction and CAD +diagnosis respectively. However, our analysis showed a lack of consistency in +terms of optimal models when using different calibration measures. This +indicates the need for careful consideration of performance metrics when +training and selecting models for complex high-risk applications in healthcare. + +
+
+
+
+
+ + ☆ Abdominal Multi-Organ Segmentation Based on Feature Pyramid Network and + Spatial Recurrent Neural Network + + +
+ As recent advances in AI are causing the decline of conventional diagnostic +methods, the realization of end-to-end diagnosis is fast approaching. +Ultrasound image segmentation is an important step in the diagnostic process. +An accurate and robust segmentation model accelerates the process and reduces +the burden of sonographers. In contrast to previous research, we take two +inherent features of ultrasound images into consideration: (1) different organs +and tissues vary in spatial sizes, (2) the anatomical structures inside human +body form a relatively constant spatial relationship. Based on those two ideas, +we propose a new image segmentation model combining Feature Pyramid Network +(FPN) and Spatial Recurrent Neural Network (SRNN). We discuss why we use FPN to +extract anatomical structures of different scales and how SRNN is implemented +to extract the spatial context features in abdominal ultrasound images. + +
+
+ comment: IFAC World Congress 2023 paper +
+
+
+
+
+ + ☆ CAGRA: Highly Parallel Graph Construction and Approximate Nearest + Neighbor Search for GPUs + + +
+ Approximate Nearest Neighbor Search (ANNS) plays a critical role in various +disciplines spanning data mining and artificial intelligence, from information +retrieval and computer vision to natural language processing and recommender +systems. Data volumes have soared in recent years and the computational cost of +an exhaustive exact nearest neighbor search is often prohibitive, necessitating +the adoption of approximate techniques. The balanced performance and recall of +graph-based approaches have more recently garnered significant attention in +ANNS algorithms, however, only a few studies have explored harnessing the power +of GPUs and multi-core processors despite the widespread use of massively +parallel and general-purpose computing. To bridge this gap, we introduce a +novel parallel computing hardware-based proximity graph and search algorithm. +By leveraging the high-performance capabilities of modern hardware, our +approach achieves remarkable efficiency gains. In particular, our method +surpasses existing CPU and GPU-based methods in constructing the proximity +graph, demonstrating higher throughput in both large- and small-batch searches +while maintaining compatible accuracy. In graph construction time, our method, +CAGRA, is 2.2~27x faster than HNSW, which is one of the CPU SOTA +implementations. In large-batch query throughput in the 90% to 95% recall +range, our method is 33~77x faster than HNSW, and is 3.8~8.8x faster than the +SOTA implementations for GPU. For a single query, our method is 3.4~53x faster +than HNSW at 95% recall. + +
+
+
+
+
+ + ☆ Evaluation and Analysis of Hallucination in Large Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) have recently achieved remarkable +success. However, LVLMs are still plagued by the hallucination problem, which +limits the practicality in many scenarios. Hallucination refers to the +information of LVLMs' responses that does not exist in the visual input, which +poses potential risks of substantial consequences. There has been limited work +studying hallucination evaluation in LVLMs. In this paper, we propose +Hallucination Evaluation based on Large Language Models (HaELM), an LLM-based +hallucination evaluation framework. HaELM achieves an approximate 95% +performance comparable to ChatGPT and has additional advantages including low +cost, reproducibility, privacy preservation and local deployment. Leveraging +the HaELM, we evaluate the hallucination in current LVLMs. Furthermore, we +analyze the factors contributing to hallucination in LVLMs and offer helpful +suggestions to mitigate the hallucination problem. Our training data and human +annotation hallucination data will be made public soon. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ AI-Based Facial Emotion Recognition Solutions for Education: A Study of + Teacher-User and Other Categories + + +
+ Existing information on AI-based facial emotion recognition (FER) is not +easily comprehensible by those outside the field of computer science, requiring +cross-disciplinary effort to determine a categorisation framework that promotes +the understanding of this technology, and its impact on users. Most proponents +classify FER in terms of methodology, implementation and analysis; relatively +few by its application in education; and none by its users. This paper is +concerned primarily with (potential) teacher-users of FER tools for education. +It proposes a three-part classification of these teachers, by orientation, +condition and preference, based on a classical taxonomy of affective +educational objectives, and related theories. It also compiles and organises +the types of FER solutions found in or inferred from the literature into +"technology" and "applications" categories, as a prerequisite for structuring +the proposed "teacher-user" category. This work has implications for +proponents', critics', and users' understanding of the relationship between +teachers and FER. + +
+
+
+
+
+ + ☆ DiffusionVMR: Diffusion Model for Video Moment Retrieval + + +
+ Video moment retrieval is a fundamental visual-language task that aims to +retrieve target moments from an untrimmed video based on a language query. +Existing methods typically generate numerous proposals manually or via +generative networks in advance as the support set for retrieval, which is not +only inflexible but also time-consuming. Inspired by the success of diffusion +models on object detection, this work aims at reformulating video moment +retrieval as a denoising generation process to get rid of the inflexible and +time-consuming proposal generation. To this end, we propose a novel +proposal-free framework, namely DiffusionVMR, which directly samples random +spans from noise as candidates and introduces denoising learning to ground +target moments. During training, Gaussian noise is added to the real moments, +and the model is trained to learn how to reverse this process. In inference, a +set of time spans is progressively refined from the initial noise to the final +output. Notably, the training and inference of DiffusionVMR are decoupled, and +an arbitrary number of random spans can be used in inference without being +consistent with the training phase. Extensive experiments conducted on three +widely-used benchmarks (i.e., QVHighlight, Charades-STA, and TACoS) demonstrate +the effectiveness of the proposed DiffusionVMR by comparing it with +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Group-Conditional Conformal Prediction via Quantile Regression + Calibration for Crop and Weed Classification + + +
+ As deep learning predictive models become an integral part of a large +spectrum of precision agricultural systems, a barrier to the adoption of such +automated solutions is the lack of user trust in these highly complex, opaque +and uncertain models. Indeed, deep neural networks are not equipped with any +explicit guarantees that can be used to certify the system's performance, +especially in highly varying uncontrolled environments such as the ones +typically faced in computer vision for agriculture.Fortunately, certain methods +developed in other communities can prove to be important for agricultural +applications. This article presents the conformal prediction framework that +provides valid statistical guarantees on the predictive performance of any +black box prediction machine, with almost no assumptions, applied to the +problem of deep visual classification of weeds and crops in real-world +conditions. The framework is exposed with a focus on its practical aspects and +special attention accorded to the Adaptive Prediction Sets (APS) approach that +delivers marginal guarantees on the model's coverage. Marginal results are then +shown to be insufficient to guarantee performance on all groups of individuals +in the population as characterized by their environmental and pedo-climatic +auxiliary data gathered during image acquisition.To tackle this shortcoming, +group-conditional conformal approaches are presented: the ''classical'' method +that consists of iteratively applying the APS procedure on all groups, and a +proposed elegant reformulation and implementation of the procedure using +quantile regression on group membership indicators. Empirical results showing +the validity of the proposed approach are presented and compared to the +marginal APS then discussed. + +
+
+
+
+
+ + ☆ Using deep learning for an automatic detection and classification of the + vascular bifurcations along the Circle of Willis + + +
+ Most of the intracranial aneurysms (ICA) occur on a specific portion of the +cerebral vascular tree named the Circle of Willis (CoW). More particularly, +they mainly arise onto fifteen of the major arterial bifurcations constituting +this circular structure. Hence, for an efficient and timely diagnosis it is +critical to develop some methods being able to accurately recognize each +Bifurcation of Interest (BoI). Indeed, an automatic extraction of the +bifurcations presenting the higher risk of developing an ICA would offer the +neuroradiologists a quick glance at the most alarming areas. Due to the recent +efforts on Artificial Intelligence, Deep Learning turned out to be the best +performing technology for many pattern recognition tasks. Moreover, various +methods have been particularly designed for medical image analysis purposes. +This study intends to assist the neuroradiologists to promptly locate any +bifurcation presenting a high risk of ICA occurrence. It can be seen as a +Computer Aided Diagnosis scheme, where the Artificial Intelligence facilitates +the access to the regions of interest within the MRI. In this work, we propose +a method for a fully automatic detection and recognition of the bifurcations of +interest forming the Circle of Willis. Several neural networks architectures +have been tested, and we thoroughly evaluate the bifurcation recognition rate. + +
+
+
+
+
+ + ☆ Learning to Upsample by Learning to Sample ICCV 2023 + + +
+ We present DySample, an ultra-lightweight and effective dynamic upsampler. +While impressive performance gains have been witnessed from recent kernel-based +dynamic upsamplers such as CARAFE, FADE, and SAPA, they introduce much +workload, mostly due to the time-consuming dynamic convolution and the +additional sub-network used to generate dynamic kernels. Further, the need for +high-res feature guidance of FADE and SAPA somehow limits their application +scenarios. To address these concerns, we bypass dynamic convolution and +formulate upsampling from the perspective of point sampling, which is more +resource-efficient and can be easily implemented with the standard built-in +function in PyTorch. We first showcase a naive design, and then demonstrate how +to strengthen its upsampling behavior step by step towards our new upsampler, +DySample. Compared with former kernel-based dynamic upsamplers, DySample +requires no customized CUDA package and has much fewer parameters, FLOPs, GPU +memory, and latency. Besides the light-weight characteristics, DySample +outperforms other upsamplers across five dense prediction tasks, including +semantic segmentation, object detection, instance segmentation, panoptic +segmentation, and monocular depth estimation. Code is available at +https://github.com/tiny-smart/dysample. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Class Prior-Free Positive-Unlabeled Learning with Taylor Variational + Loss for Hyperspectral Remote Sensing Imagery ICCV 2023 + + +
+ Positive-unlabeled learning (PU learning) in hyperspectral remote sensing +imagery (HSI) is aimed at learning a binary classifier from positive and +unlabeled data, which has broad prospects in various earth vision applications. +However, when PU learning meets limited labeled HSI, the unlabeled data may +dominate the optimization process, which makes the neural networks overfit the +unlabeled data. In this paper, a Taylor variational loss is proposed for HSI PU +learning, which reduces the weight of the gradient of the unlabeled data by +Taylor series expansion to enable the network to find a balance between +overfitting and underfitting. In addition, the self-calibrated optimization +strategy is designed to stabilize the training process. Experiments on 7 +benchmark datasets (21 tasks in total) validate the effectiveness of the +proposed method. Code is at: https://github.com/Hengwei-Zhao96/T-HOneCls. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Exploring Model Transferability through the Lens of Potential Energy ICCV 2023 + + +
+ Transfer learning has become crucial in computer vision tasks due to the vast +availability of pre-trained deep learning models. However, selecting the +optimal pre-trained model from a diverse pool for a specific downstream task +remains a challenge. Existing methods for measuring the transferability of +pre-trained models rely on statistical correlations between encoded static +features and task labels, but they overlook the impact of underlying +representation dynamics during fine-tuning, leading to unreliable results, +especially for self-supervised models. In this paper, we present an insightful +physics-inspired approach named PED to address these challenges. We reframe the +challenge of model selection through the lens of potential energy and directly +model the interaction forces that influence fine-tuning dynamics. By capturing +the motion of dynamic representations to decline the potential energy within a +force-driven physical model, we can acquire an enhanced and more stable +observation for estimating transferability. The experimental results on 10 +downstream tasks and 12 self-supervised models demonstrate that our approach +can seamlessly integrate into existing ranking techniques and enhance their +performances, revealing its effectiveness for the model selection task and its +potential for understanding the mechanism in transfer learning. Code will be +available at https://github.com/lixiaotong97/PED. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ DiffBIR: Towards Blind Image Restoration with Generative Diffusion Prior + + +
+ We present DiffBIR, which leverages pretrained text-to-image diffusion models +for blind image restoration problem. Our framework adopts a two-stage pipeline. +In the first stage, we pretrain a restoration module across diversified +degradations to improve generalization capability in real-world scenarios. The +second stage leverages the generative ability of latent diffusion models, to +achieve realistic image restoration. Specifically, we introduce an injective +modulation sub-network -- LAControlNet for finetuning, while the pre-trained +Stable Diffusion is to maintain its generative ability. Finally, we introduce a +controllable module that allows users to balance quality and fidelity by +introducing the latent image guidance in the denoising process during +inference. Extensive experiments have demonstrated its superiority over +state-of-the-art approaches for both blind image super-resolution and blind +face restoration tasks on synthetic and real-world datasets. The code is +available at https://github.com/XPixelGroup/DiffBIR. + +
+
+
+
+
+ + ☆ A Comprehensive Augmentation Framework for Anomaly Detection + + +
+ Data augmentation methods are commonly integrated into the training of +anomaly detection models. Previous approaches have primarily focused on +replicating real-world anomalies or enhancing diversity, without considering +that the standard of anomaly varies across different classes, potentially +leading to a biased training distribution.This paper analyzes crucial traits of +simulated anomalies that contribute to the training of reconstructive networks +and condenses them into several methods, thus creating a comprehensive +framework by selectively utilizing appropriate combinations.Furthermore, we +integrate this framework with a reconstruction-based approach and concurrently +propose a split training strategy that alleviates the issue of overfitting +while avoiding introducing interference to the reconstruction process. The +evaluations conducted on the MVTec anomaly detection dataset demonstrate that +our method outperforms the previous state-of-the-art approach, particularly in +terms of object classes.To evaluate generalizability, we generate a simulated +dataset comprising anomalies with diverse characteristics since the original +test samples only include specific types of anomalies and may lead to biased +evaluations. Experimental results demonstrate that our approach exhibits +promising potential for generalizing effectively to various unforeseen +anomalies encountered in real-world scenarios. + +
+
+
+
+
+ + ☆ Learning Cross-modality Information Bottleneck Representation for + Heterogeneous Person Re-Identification + + +
+ Visible-Infrared person re-identification (VI-ReID) is an important and +challenging task in intelligent video surveillance. Existing methods mainly +focus on learning a shared feature space to reduce the modality discrepancy +between visible and infrared modalities, which still leave two problems +underexplored: information redundancy and modality complementarity. To this +end, properly eliminating the identity-irrelevant information as well as making +up for the modality-specific information are critical and remains a challenging +endeavor. To tackle the above problems, we present a novel mutual information +and modality consensus network, namely CMInfoNet, to extract modality-invariant +identity features with the most representative information and reduce the +redundancies. The key insight of our method is to find an optimal +representation to capture more identity-relevant information and compress the +irrelevant parts by optimizing a mutual information bottleneck trade-off. +Besides, we propose an automatically search strategy to find the most prominent +parts that identify the pedestrians. To eliminate the cross- and intra-modality +variations, we also devise a modality consensus module to align the visible and +infrared modalities for task-specific guidance. Moreover, the global-local +feature representations can also be acquired for key parts discrimination. +Experimental results on four benchmarks, i.e., SYSU-MM01, RegDB, +Occluded-DukeMTMC, Occluded-REID, Partial-REID and Partial\_iLIDS dataset, have +demonstrated the effectiveness of CMInfoNet. + +
+
+
+
+
+ + ☆ AIoT-Based Drum Transcription Robot using Convolutional Neural Networks + + +
+ With the development of information technology, robot technology has made +great progress in various fields. These new technologies enable robots to be +used in industry, agriculture, education and other aspects. In this paper, we +propose a drum robot that can automatically complete music transcription in +real-time, which is based on AIoT and fog computing technology. Specifically, +this drum robot system consists of a cloud node for data storage, edge nodes +for real-time computing, and data-oriented execution application nodes. In +order to analyze drumming music and realize drum transcription, we further +propose a light-weight convolutional neural network model to classify drums, +which can be more effectively deployed in terminal devices for fast edge +calculations. The experimental results show that the proposed system can +achieve more competitive performance and enjoy a variety of smart applications +and services. + +
+
+
+
+
+ + ☆ A Consumer-tier based Visual-Brain Machine Interface for Augmented + Reality Glasses Interactions + + +
+ Objective.Visual-Brain Machine Interface(V-BMI) has provide a novel +interaction technique for Augmented Reality (AR) industries. Several +state-of-arts work has demonstates its high accuracy and real-time interaction +capbilities. However, most of the studies employ EEGs devices that are rigid +and difficult to apply in real-life AR glasseses application sceniraros. Here +we develop a consumer-tier Visual-Brain Machine Inteface(V-BMI) system +specialized for Augmented Reality(AR) glasses interactions. Approach. The +developed system consists of a wearable hardware which takes advantages of fast +set-up, reliable recording and comfortable wearable experience that +specificized for AR glasses applications. Complementing this hardware, we have +devised a software framework that facilitates real-time interactions within the +system while accommodating a modular configuration to enhance scalability. Main +results. The developed hardware is only 110g and 120x85x23 mm, which with 1 +Tohm and peak to peak voltage is less than 1.5 uV, and a V-BMI based angry bird +game and an Internet of Thing (IoT) AR applications are deisgned, we +demonstrated such technology merits of intuitive experience and efficiency +interaction. The real-time interaction accuracy is between 85 and 96 +percentages in a commercial AR glasses (DTI is 2.24s and ITR 65 bits-min ). +Significance. Our study indicates the developed system can provide an essential +hardware-software framework for consumer based V-BMI AR glasses. Also, we +derive several pivotal design factors for a consumer-grade V-BMI-based AR +system: 1) Dynamic adaptation of stimulation patterns-classification methods +via computer vision algorithms is necessary for AR glasses applications; and 2) +Algorithmic localization to foster system stability and latency reduction. + +
+
+ comment: 15 pages,10 figures +
+
+
+
+
+ + ☆ iBARLE: imBalance-Aware Room Layout Estimation + + +
+ Room layout estimation predicts layouts from a single panorama. It requires +datasets with large-scale and diverse room shapes to train the models. However, +there are significant imbalances in real-world datasets including the +dimensions of layout complexity, camera locations, and variation in scene +appearance. These issues considerably influence the model training performance. +In this work, we propose the imBalance-Aware Room Layout Estimation (iBARLE) +framework to address these issues. iBARLE consists of (1) Appearance Variation +Generation (AVG) module, which promotes visual appearance domain +generalization, (2) Complex Structure Mix-up (CSMix) module, which enhances +generalizability w.r.t. room structure, and (3) a gradient-based layout +objective function, which allows more effective accounting for occlusions in +complex layouts. All modules are jointly trained and help each other to achieve +the best performance. Experiments and ablation studies based on +ZInD~\cite{cruz2021zillow} dataset illustrate that iBARLE has state-of-the-art +performance compared with other layout estimation baselines. + +
+
+
+
+
+ + ☆ Pose-Free Neural Radiance Fields via Implicit Pose Regularization ICCV2023 + + +
+ Pose-free neural radiance fields (NeRF) aim to train NeRF with unposed +multi-view images and it has achieved very impressive success in recent years. +Most existing works share the pipeline of training a coarse pose estimator with +rendered images at first, followed by a joint optimization of estimated poses +and neural radiance field. However, as the pose estimator is trained with only +rendered images, the pose estimation is usually biased or inaccurate for real +images due to the domain gap between real images and rendered images, leading +to poor robustness for the pose estimation of real images and further local +minima in joint optimization. We design IR-NeRF, an innovative pose-free NeRF +that introduces implicit pose regularization to refine pose estimator with +unposed real images and improve the robustness of the pose estimation for real +images. With a collection of 2D images of a specific scene, IR-NeRF constructs +a scene codebook that stores scene features and captures the scene-specific +pose distribution implicitly as priors. Thus, the robustness of pose estimation +can be promoted with the scene priors according to the rationale that a 2D real +image can be well reconstructed from the scene codebook only when its estimated +pose lies within the pose distribution. Extensive experiments show that IR-NeRF +achieves superior novel view synthesis and outperforms the state-of-the-art +consistently across multiple synthetic and real datasets. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Is it an i or an l: Test-time Adaptation of Text Line Recognition Models + + +
+ Recognizing text lines from images is a challenging problem, especially for +handwritten documents due to large variations in writing styles. While text +line recognition models are generally trained on large corpora of real and +synthetic data, such models can still make frequent mistakes if the handwriting +is inscrutable or the image acquisition process adds corruptions, such as +noise, blur, compression, etc. Writing style is generally quite consistent for +an individual, which can be leveraged to correct mistakes made by such models. +Motivated by this, we introduce the problem of adapting text line recognition +models during test time. We focus on a challenging and realistic setting where, +given only a single test image consisting of multiple text lines, the task is +to adapt the model such that it performs better on the image, without any +labels. We propose an iterative self-training approach that uses feedback from +the language model to update the optical model, with confident self-labels in +each iteration. The confidence measure is based on an augmentation mechanism +that evaluates the divergence of the prediction of the model in a local region. +We perform rigorous evaluation of our method on several benchmark datasets as +well as their corrupted versions. Experimental results on multiple datasets +spanning multiple scripts show that the proposed adaptation method offers an +absolute improvement of up to 8% in character error rate with just a few +iterations of self-training at test time. + +
+
+
+
+
+ + ☆ Pyramid diffractive optical networks for unidirectional magnification + and demagnification + + +
+ Diffractive deep neural networks (D2NNs) are composed of successive +transmissive layers optimized using supervised deep learning to all-optically +implement various computational tasks between an input and output field-of-view +(FOV). Here, we present a pyramid-structured diffractive optical network design +(which we term P-D2NN), optimized specifically for unidirectional image +magnification and demagnification. In this P-D2NN design, the diffractive +layers are pyramidally scaled in alignment with the direction of the image +magnification or demagnification. Our analyses revealed the efficacy of this +P-D2NN design in unidirectional image magnification and demagnification tasks, +producing high-fidelity magnified or demagnified images in only one direction, +while inhibiting the image formation in the opposite direction - confirming the +desired unidirectional imaging operation. Compared to the conventional D2NN +designs with uniform-sized successive diffractive layers, P-D2NN design +achieves similar performance in unidirectional magnification tasks using only +half of the diffractive degrees of freedom within the optical processor volume. +Furthermore, it maintains its unidirectional image +magnification/demagnification functionality across a large band of illumination +wavelengths despite being trained with a single illumination wavelength. With +this pyramidal architecture, we also designed a wavelength-multiplexed +diffractive network, where a unidirectional magnifier and a unidirectional +demagnifier operate simultaneously in opposite directions, at two distinct +illumination wavelengths. The efficacy of the P-D2NN architecture was also +validated experimentally using monochromatic terahertz illumination, +successfully matching our numerical simulations. P-D2NN offers a +physics-inspired strategy for designing task-specific visual processors. + +
+
+ comment: 26 Pages, 7 Figures +
+
+
+
+
+ + ☆ C2G2: Controllable Co-speech Gesture Generation with Latent Diffusion + Model + + +
+ Co-speech gesture generation is crucial for automatic digital avatar +animation. However, existing methods suffer from issues such as unstable +training and temporal inconsistency, particularly in generating high-fidelity +and comprehensive gestures. Additionally, these methods lack effective control +over speaker identity and temporal editing of the generated gestures. Focusing +on capturing temporal latent information and applying practical controlling, we +propose a Controllable Co-speech Gesture Generation framework, named C2G2. +Specifically, we propose a two-stage temporal dependency enhancement strategy +motivated by latent diffusion models. We further introduce two key features to +C2G2, namely a speaker-specific decoder to generate speaker-related real-length +skeletons and a repainting strategy for flexible gesture generation/editing. +Extensive experiments on benchmark gesture datasets verify the effectiveness of +our proposed C2G2 compared with several state-of-the-art baselines. The link of +the project demo page can be found at https://c2g2-gesture.github.io/c2_gesture + +
+
+ comment: 12 pages, 6 figures, 7 tables +
+
+
+
+
+ + ☆ Few-Shot Object Detection via Synthetic Features with Optimal Transport + + +
+ Few-shot object detection aims to simultaneously localize and classify the +objects in an image with limited training samples. However, most existing +few-shot object detection methods focus on extracting the features of a few +samples of novel classes that lack diversity. Hence, they may not be sufficient +to capture the data distribution. To address that limitation, in this paper, we +propose a novel approach in which we train a generator to generate synthetic +data for novel classes. Still, directly training a generator on the novel class +is not effective due to the lack of novel data. To overcome that issue, we +leverage the large-scale dataset of base classes. Our overarching goal is to +train a generator that captures the data variations of the base dataset. We +then transform the captured variations into novel classes by generating +synthetic data with the trained generator. To encourage the generator to +capture data variations on base classes, we propose to train the generator with +an optimal transport loss that minimizes the optimal transport distance between +the distributions of real and synthetic data. Extensive experiments on two +benchmark datasets demonstrate that the proposed method outperforms the state +of the art. Source code will be available. + +
+
+
+
+
+ + ☆ PBFormer: Capturing Complex Scene Text Shape with Polynomial Band + Transformer ACM MM 2023 + + +
+ We present PBFormer, an efficient yet powerful scene text detector that +unifies the transformer with a novel text shape representation Polynomial Band +(PB). The representation has four polynomial curves to fit a text's top, +bottom, left, and right sides, which can capture a text with a complex shape by +varying polynomial coefficients. PB has appealing features compared with +conventional representations: 1) It can model different curvatures with a fixed +number of parameters, while polygon-points-based methods need to utilize a +different number of points. 2) It can distinguish adjacent or overlapping texts +as they have apparent different curve coefficients, while segmentation-based or +points-based methods suffer from adhesive spatial positions. PBFormer combines +the PB with the transformer, which can directly generate smooth text contours +sampled from predicted curves without interpolation. A parameter-free +cross-scale pixel attention (CPA) module is employed to highlight the feature +map of a suitable scale while suppressing the other feature maps. The simple +operation can help detect small-scale texts and is compatible with the +one-stage DETR framework, where no postprocessing exists for NMS. Furthermore, +PBFormer is trained with a shape-contained loss, which not only enforces the +piecewise alignment between the ground truth and the predicted curves but also +makes curves' positions and shapes consistent with each other. Without bells +and whistles about text pre-training, our method is superior to the previous +state-of-the-art text detectors on the arbitrary-shaped text datasets. + +
+
+ comment: 9 pages, 8 figures, accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ WSAM: Visual Explanations from Style Augmentation as Adversarial + Attacker and Their Influence in Image Classification + + +
+ Currently, style augmentation is capturing attention due to convolutional +neural networks (CNN) being strongly biased toward recognizing textures rather +than shapes. Most existing styling methods either perform a low-fidelity style +transfer or a weak style representation in the embedding vector. This paper +outlines a style augmentation algorithm using stochastic-based sampling with +noise addition to improving randomization on a general linear transformation +for style transfer. With our augmentation strategy, all models not only present +incredible robustness against image stylizing but also outperform all previous +methods and surpass the state-of-the-art performance for the STL-10 dataset. In +addition, we present an analysis of the model interpretations under different +style variations. At the same time, we compare comprehensive experiments +demonstrating the performance when applied to deep neural architectures in +training settings. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ ICARUS: An Android-Based Unmanned Aerial Vehicle (UAV) Search and Rescue + Eye in the Sky + + +
+ The purpose of this paper is to develop an unmanned aerial vehicle (UAV) +using a quadcopter with the capability of video surveillance, map coordinates, +a deployable parachute with a medicine kit or a food pack as a payload, a +collision warning system, remotely controlled, integrated with an android +application to assist in search and rescue operations. + Applied research for the development of the functional prototype, +quantitative and descriptive statistics to summarize data by describing the +relationship between variables in a sample or population. The quadcopter +underwent an evaluation using a survey instrument to test its acceptability +using predefined variables to select respondents within Caloocan City and +Quezon City, Philippines. + Demographic profiles and known issues and concerns were answered by 30 +respondents. The results were summarized and distributed in Tables 1 and 2. + In terms of demographic profiles, the number of SAR operators within the +specified areas is distributed equally, most are male, single, and within the +age bracket of 31 and above. In issues and concerns, the most common type of +search and rescue was ground search and rescue. Human error is the primary +cause of most injuries in operating units. The prototype was useful and +everyone agreed, in terms of acceptability, drone technology will improve +search and rescue operations. + The innovative way of utilizing Android and drone technology is a new step +towards the improvement of SAR operations in the Philippines. + The LiPo battery must be replaced with a higher capacity and the drone +operator should undergo a training course and secure a permit from the Civil +Aviation Authority of the Philippines (CAAP). + +
+
+ comment: 15 pages, 14 figures, Special Issue: IRCCETE 2023 +
+
+
+
+
+ + ☆ Vision Grid Transformer for Document Layout Analysis ICCV2023 + + +
+ Document pre-trained models and grid-based models have proven to be very +effective on various tasks in Document AI. However, for the document layout +analysis (DLA) task, existing document pre-trained models, even those +pre-trained in a multi-modal fashion, usually rely on either textual features +or visual features. Grid-based models for DLA are multi-modality but largely +neglect the effect of pre-training. To fully leverage multi-modal information +and exploit pre-training techniques to learn better representation for DLA, in +this paper, we present VGT, a two-stream Vision Grid Transformer, in which Grid +Transformer (GiT) is proposed and pre-trained for 2D token-level and +segment-level semantic understanding. Furthermore, a new dataset named D$^4$LA, +which is so far the most diverse and detailed manually-annotated benchmark for +document layout analysis, is curated and released. Experiment results have +illustrated that the proposed VGT model achieves new state-of-the-art results +on DLA tasks, e.g. PubLayNet ($95.7\%$$\rightarrow$$96.2\%$), DocBank +($79.6\%$$\rightarrow$$84.1\%$), and D$^4$LA ($67.7\%$$\rightarrow$$68.8\%$). +The code and models as well as the D$^4$LA dataset will be made publicly +available ~\url{https://github.com/AlibabaResearch/AdvancedLiterateMachinery}. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Reprogramming under constraints: Revisiting efficient and reliable + transferability of lottery tickets + + +
+ In the era of foundation models with huge pre-training budgets, the +downstream tasks have been shifted to the narrative of efficient and fast +adaptation. For classification-based tasks in the domain of computer vision, +the two most efficient approaches have been linear probing (LP) and visual +prompting/reprogramming (VP); the former aims to learn a classifier in the form +of a linear head on the features extracted by the pre-trained model, while the +latter maps the input data to the domain of the source data on which the model +was originally pre-trained on. Although extensive studies have demonstrated the +differences between LP and VP in terms of downstream performance, we explore +the capabilities of the two aforementioned methods via the sparsity axis: (a) +Data sparsity: the impact of few-shot adaptation and (b) Model sparsity: the +impact of lottery tickets (LT). We demonstrate that LT are not universal +reprogrammers, i.e., for certain target datasets, reprogramming an LT yields +significantly lower performance than the reprogrammed dense model although +their corresponding upstream performance is similar. Further, we demonstrate +that the calibration of dense models is always superior to that of their +lottery ticket counterparts under both LP and VP regimes. Our empirical study +opens a new avenue of research into VP for sparse models and encourages further +understanding of the performance beyond the accuracy achieved by VP under +constraints of sparsity. Code and logs can be accessed at +\url{https://github.com/landskape-ai/Reprogram_LT}. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ CEFHRI: A Communication Efficient Federated Learning Framework for + Recognizing Industrial Human-Robot Interaction IROS 2023 + + +
+ Human-robot interaction (HRI) is a rapidly growing field that encompasses +social and industrial applications. Machine learning plays a vital role in +industrial HRI by enhancing the adaptability and autonomy of robots in complex +environments. However, data privacy is a crucial concern in the interaction +between humans and robots, as companies need to protect sensitive data while +machine learning algorithms require access to large datasets. Federated +Learning (FL) offers a solution by enabling the distributed training of models +without sharing raw data. Despite extensive research on Federated learning (FL) +for tasks such as natural language processing (NLP) and image classification, +the question of how to use FL for HRI remains an open research problem. The +traditional FL approach involves transmitting large neural network parameter +matrices between the server and clients, which can lead to high communication +costs and often becomes a bottleneck in FL. This paper proposes a +communication-efficient FL framework for human-robot interaction (CEFHRI) to +address the challenges of data heterogeneity and communication costs. The +framework leverages pre-trained models and introduces a trainable +spatiotemporal adapter for video understanding tasks in HRI. Experimental +results on three human-robot interaction benchmark datasets: HRI30, InHARD, and +COIN demonstrate the superiority of CEFHRI over full fine-tuning in terms of +communication costs. The proposed methodology provides a secure and efficient +approach to HRI federated learning, particularly in industrial environments +with data privacy concerns and limited communication bandwidth. Our code is +available at +https://github.com/umarkhalidAI/CEFHRI-Efficient-Federated-Learning. + +
+
+ comment: Accepted in IROS 2023 +
+
+
+
+
+ + ☆ Read-only Prompt Optimization for Vision-Language Few-shot Learning ICCV2023 + + +
+ In recent years, prompt tuning has proven effective in adapting pre-trained +vision-language models to downstream tasks. These methods aim to adapt the +pre-trained models by introducing learnable prompts while keeping pre-trained +weights frozen. However, learnable prompts can affect the internal +representation within the self-attention module, which may negatively impact +performance variance and generalization, especially in data-deficient settings. +To address these issues, we propose a novel approach, Read-only Prompt +Optimization (RPO). RPO leverages masked attention to prevent the internal +representation shift in the pre-trained model. Further, to facilitate the +optimization of RPO, the read-only prompts are initialized based on special +tokens of the pre-trained model. Our extensive experiments demonstrate that RPO +outperforms CLIP and CoCoOp in base-to-new generalization and domain +generalization while displaying better robustness. Also, the proposed method +achieves better generalization on extremely data-deficient settings, while +improving parameter efficiency and computational overhead. Code is available at +https://github.com/mlvlab/RPO. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ☆ Multimodal Foundation Models For Echocardiogram Interpretation + + +
+ Multimodal deep learning foundation models can learn the relationship between +images and text. In the context of medical imaging, mapping images to language +concepts reflects the clinical task of diagnostic image interpretation, however +current general-purpose foundation models do not perform well in this context +because their training corpus have limited medical text and images. To address +this challenge and account for the range of cardiac physiology, we leverage +1,032,975 cardiac ultrasound videos and corresponding expert interpretations to +develop EchoCLIP, a multimodal foundation model for echocardiography. EchoCLIP +displays strong zero-shot (not explicitly trained) performance in cardiac +function assessment (external validation left ventricular ejection fraction +mean absolute error (MAE) of 7.1%) and identification of implanted intracardiac +devices (areas under the curve (AUC) between 0.84 and 0.98 for pacemakers and +artificial heart valves). We also developed a long-context variant (EchoCLIP-R) +with a custom echocardiography report text tokenizer which can accurately +identify unique patients across multiple videos (AUC of 0.86), identify +clinical changes such as orthotopic heart transplants (AUC of 0.79) or cardiac +surgery (AUC 0.77), and enable robust image-to-text search (mean cross-modal +retrieval rank in the top 1% of candidate text reports). These emergent +capabilities can be used for preliminary assessment and summarization of +echocardiographic findings. + +
+
+
+
+
+ + ☆ Bridging Distribution Learning and Image Clustering in High-dimensional + Space + + +
+ Distribution learning focuses on learning the probability density function +from a set of data samples. In contrast, clustering aims to group similar +objects together in an unsupervised manner. Usually, these two tasks are +considered unrelated. However, the relationship between the two may be +indirectly correlated, with Gaussian Mixture Models (GMM) acting as a bridge. +In this paper, we focus on exploring the correlation between distribution +learning and clustering, with the motivation to fill the gap between these two +fields, utilizing an autoencoder (AE) to encode images into a high-dimensional +latent space. Then, Monte-Carlo Marginalization (MCMarg) and Kullback-Leibler +(KL) divergence loss are used to fit the Gaussian components of the GMM and +learn the data distribution. Finally, image clustering is achieved through each +Gaussian component of GMM. Yet, the "curse of dimensionality" poses severe +challenges for most clustering algorithms. Compared with the classic +Expectation-Maximization (EM) Algorithm, experimental results show that MCMarg +and KL divergence can greatly alleviate the difficulty. Based on the +experimental results, we believe distribution learning can exploit the +potential of GMM in image clustering within high-dimensional space. + +
+
+
+
+
+ + ☆ Unveiling Camouflage: A Learnable Fourier-based Augmentation for + Camouflaged Object Detection and Instance Segmentation + + +
+ Camouflaged object detection (COD) and camouflaged instance segmentation +(CIS) aim to recognize and segment objects that are blended into their +surroundings, respectively. While several deep neural network models have been +proposed to tackle those tasks, augmentation methods for COD and CIS have not +been thoroughly explored. Augmentation strategies can help improve the +performance of models by increasing the size and diversity of the training data +and exposing the model to a wider range of variations in the data. Besides, we +aim to automatically learn transformations that help to reveal the underlying +structure of camouflaged objects and allow the model to learn to better +identify and segment camouflaged objects. To achieve this, we propose a +learnable augmentation method in the frequency domain for COD and CIS via +Fourier transform approach, dubbed CamoFourier. Our method leverages a +conditional generative adversarial network and cross-attention mechanism to +generate a reference image and an adaptive hybrid swapping with parameters to +mix the low-frequency component of the reference image and the high-frequency +component of the input image. This approach aims to make camouflaged objects +more visible for detection and segmentation models. Without bells and whistles, +our proposed augmentation method boosts the performance of camouflaged object +detectors and camouflaged instance segmenters by large margins. + +
+
+
+
+
+ + ☆ Detection of Mild Cognitive Impairment Using Facial Features in Video + Conversations + + +
+ Early detection of Mild Cognitive Impairment (MCI) leads to early +interventions to slow the progression from MCI into dementia. Deep Learning +(DL) algorithms could help achieve early non-invasive, low-cost detection of +MCI. This paper presents the detection of MCI in older adults using DL models +based only on facial features extracted from video-recorded conversations at +home. We used the data collected from the I-CONECT behavioral intervention +study (NCT02871921), where several sessions of semi-structured interviews +between socially isolated older individuals and interviewers were video +recorded. We develop a framework that extracts spatial holistic facial features +using a convolutional autoencoder and temporal information using transformers. +Our proposed DL model was able to detect the I-CONECT study participants' +cognitive conditions (MCI vs. those with normal cognition (NC)) using facial +features. The segments and sequence information of the facial features improved +the prediction performance compared with the non-temporal features. The +detection accuracy using this combined method reached 88% whereas 84% is the +accuracy without applying the segments and sequences information of the facial +features within a video on a certain theme. + +
+
+
+
+
+ + ☆ RACR-MIL: Weakly Supervised Skin Cancer Grading using Rank-Aware + Contextual Reasoning on Whole Slide Images AAAI + + +
+ Cutaneous squamous cell cancer (cSCC) is the second most common skin cancer +in the US. It is diagnosed by manual multi-class tumor grading using a tissue +whole slide image (WSI), which is subjective and suffers from inter-pathologist +variability. We propose an automated weakly-supervised grading approach for +cSCC WSIs that is trained using WSI-level grade and does not require +fine-grained tumor annotations. The proposed model, RACR-MIL, transforms each +WSI into a bag of tiled patches and leverages attention-based multiple-instance +learning to assign a WSI-level grade. We propose three key innovations to +address general as well as cSCC-specific challenges in tumor grading. First, we +leverage spatial and semantic proximity to define a WSI graph that encodes both +local and non-local dependencies between tumor regions and leverage graph +attention convolution to derive contextual patch features. Second, we introduce +a novel ordinal ranking constraint on the patch attention network to ensure +that higher-grade tumor regions are assigned higher attention. Third, we use +tumor depth as an auxiliary task to improve grade classification in a multitask +learning framework. RACR-MIL achieves 2-9% improvement in grade classification +over existing weakly-supervised approaches on a dataset of 718 cSCC tissue +images and localizes the tumor better. The model achieves 5-20% higher accuracy +in difficult-to-classify high-risk grade classes and is robust to class +imbalance. + +
+
+ comment: 7 pages main text, 2 page references, 3 page appendix; submitted to + AAAI +
+
+
+
+
+ + ☆ Prototype Fission: Closing Set for Robust Open-set Semi-supervised + Learning + + +
+ Semi-supervised Learning (SSL) has been proven vulnerable to +out-of-distribution (OOD) samples in realistic large-scale unsupervised +datasets due to over-confident pseudo-labeling OODs as in-distribution (ID). A +key underlying problem is class-wise latent space spreading from closed seen +space to open unseen space, and the bias is further magnified in SSL's +self-training loops. To close the ID distribution set so that OODs are better +rejected for safe SSL, we propose Prototype Fission(PF) to divide class-wise +latent spaces into compact sub-spaces by automatic fine-grained latent space +mining, driven by coarse-grained labels only. Specifically, we form multiple +unique learnable sub-class prototypes for each class, optimized towards both +diversity and consistency. The Diversity Modeling term encourages samples to be +clustered by one of the multiple sub-class prototypes, while the Consistency +Modeling term clusters all samples of the same class to a global prototype. +Instead of "opening set", i.e., modeling OOD distribution, Prototype Fission +"closes set" and makes it hard for OOD samples to fit in sub-class latent +space. Therefore, PF is compatible with existing methods for further +performance gains. Extensive experiments validate the effectiveness of our +method in open-set SSL settings in terms of successfully forming sub-classes, +discriminating OODs from IDs and improving overall accuracy. Codes will be +released. + +
+
+
+
+
+ + ☆ Learning Sequential Information in Task-based fMRI for Synthetic Data + Augmentation MICCAI + + +
+ Insufficiency of training data is a persistent issue in medical image +analysis, especially for task-based functional magnetic resonance images (fMRI) +with spatio-temporal imaging data acquired using specific cognitive tasks. In +this paper, we propose an approach for generating synthetic fMRI sequences that +can then be used to create augmented training datasets in downstream learning +tasks. To synthesize high-resolution task-specific fMRI, we adapt the +$\alpha$-GAN structure, leveraging advantages of both GAN and variational +autoencoder models, and propose different alternatives in aggregating temporal +information. The synthetic images are evaluated from multiple perspectives +including visualizations and an autism spectrum disorder (ASD) classification +task. The results show that the synthetic task-based fMRI can provide effective +data augmentation in learning the ASD classification task. + +
+
+ comment: Accepted by Machine Learning in Clinical Neuroimaging 2023 (MICCAI + workshop), preprint version +
+
+
+
+
+ + ☆ A Pseudo-Boolean Polynomials Approach for Image Edge Detection + + +
+ We introduce a novel approach for image edge detection based on +pseudo-Boolean polynomials for image patches. We show that patches covering +edge regions in the image result in pseudo-Boolean polynomials with higher +degrees compared to patches that cover blob regions. The proposed approach is +based on reduction of polynomial degree and equivalence properties of +penalty-based pseudo-Boolean polynomials. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Efficient Ray Sampling for Radiance Fields Reconstruction + + +
+ Accelerating neural radiance fields training is of substantial practical +value, as the ray sampling strategy profoundly impacts network convergence. +More efficient ray sampling can thus directly enhance existing NeRF models' +training efficiency. We therefore propose a novel ray sampling approach for +neural radiance fields that improves training efficiency while retaining +photorealistic rendering results. First, we analyze the relationship between +the pixel loss distribution of sampled rays and rendering quality. This reveals +redundancy in the original NeRF's uniform ray sampling. Guided by this finding, +we develop a sampling method leveraging pixel regions and depth boundaries. Our +main idea is to sample fewer rays in training views, yet with each ray more +informative for scene fitting. Sampling probability increases in pixel areas +exhibiting significant color and depth variation, greatly reducing wasteful +rays from other regions without sacrificing precision. Through this method, not +only can the convergence of the network be accelerated, but the spatial +geometry of a scene can also be perceived more accurately. Rendering outputs +are enhanced, especially for texture-complex regions. Experiments demonstrate +that our method significantly outperforms state-of-the-art techniques on public +benchmark datasets. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ DebSDF: Delving into the Details and Bias of Neural Indoor Scene + Reconstruction + + +
+ In recent years, the neural implicit surface has emerged as a powerful +representation for multi-view surface reconstruction due to its simplicity and +state-of-the-art performance. However, reconstructing smooth and detailed +surfaces in indoor scenes from multi-view images presents unique challenges. +Indoor scenes typically contain large texture-less regions, making the +photometric loss unreliable for optimizing the implicit surface. Previous work +utilizes monocular geometry priors to improve the reconstruction in indoor +scenes. However, monocular priors often contain substantial errors in thin +structure regions due to domain gaps and the inherent inconsistencies when +derived independently from different views. This paper presents \textbf{DebSDF} +to address these challenges, focusing on the utilization of uncertainty in +monocular priors and the bias in SDF-based volume rendering. We propose an +uncertainty modeling technique that associates larger uncertainties with larger +errors in the monocular priors. High-uncertainty priors are then excluded from +optimization to prevent bias. This uncertainty measure also informs an +importance-guided ray sampling and adaptive smoothness regularization, +enhancing the learning of fine structures. We further introduce a bias-aware +signed distance function to density transformation that takes into account the +curvature and the angle between the view direction and the SDF normals to +reconstruct fine details better. Our approach has been validated through +extensive experiments on several challenging datasets, demonstrating improved +qualitative and quantitative results in reconstructing thin structures in +indoor scenes, thereby outperforming previous work. + +
+
+
+
+
+ + ☆ Document AI: A Comparative Study of Transformer-Based, Graph-Based + Models, and Convolutional Neural Networks For Document Layout Analysis + + +
+ Document AI aims to automatically analyze documents by leveraging natural +language processing and computer vision techniques. One of the major tasks of +Document AI is document layout analysis, which structures document pages by +interpreting the content and spatial relationships of layout, image, and text. +This task can be image-centric, wherein the aim is to identify and label +various regions such as authors and paragraphs, or text-centric, where the +focus is on classifying individual words in a document. Although there are +increasingly sophisticated methods for improving layout analysis, doubts remain +about the extent to which their findings can be generalized to a broader +context. Specifically, prior work developed systems based on very different +architectures, such as transformer-based, graph-based, and CNNs. However, no +work has mentioned the effectiveness of these models in a comparative analysis. +Moreover, while language-independent Document AI models capable of knowledge +transfer have been developed, it remains to be investigated to what degree they +can effectively transfer knowledge. In this study, we aim to fill these gaps by +conducting a comparative evaluation of state-of-the-art models in document +layout analysis and investigating the potential of cross-lingual layout +analysis by utilizing machine translation techniques. + +
+
+
+
+
+ + ☆ Shatter and Gather: Learning Referring Image Segmentation with Text + Supervision ICCV 2023 + + +
+ Referring image segmentation, the task of segmenting any arbitrary entities +described in free-form texts, opens up a variety of vision applications. +However, manual labeling of training data for this task is prohibitively +costly, leading to lack of labeled data for training. We address this issue by +a weakly supervised learning approach using text descriptions of training +images as the only source of supervision. To this end, we first present a new +model that discovers semantic entities in input image and then combines such +entities relevant to text query to predict the mask of the referent. We also +present a new loss function that allows the model to be trained without any +further supervision. Our method was evaluated on four public benchmarks for +referring image segmentation, where it clearly outperformed the existing method +for the same task and recent open-vocabulary segmentation models on all the +benchmarks. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Fairness-aware Vision Transformer via Debiased Self-Attention + + +
+ Vision Transformer (ViT) has recently gained significant interest in solving +computer vision (CV) problems due to its capability of extracting informative +features and modeling long-range dependencies through the self-attention +mechanism. To fully realize the advantages of ViT in real-world applications, +recent works have explored the trustworthiness of ViT, including its robustness +and explainability. However, another desiderata, fairness has not yet been +adequately addressed in the literature. We establish that the existing +fairness-aware algorithms (primarily designed for CNNs) do not perform well on +ViT. This necessitates the need for developing our novel framework via Debiased +Self-Attention (DSA). DSA is a fairness-through-blindness approach that +enforces ViT to eliminate spurious features correlated with the sensitive +attributes for bias mitigation. Notably, adversarial examples are leveraged to +locate and mask the spurious features in the input image patches. In addition, +DSA utilizes an attention weights alignment regularizer in the training +objective to encourage learning informative features for target prediction. +Importantly, our DSA framework leads to improved fairness guarantees over prior +works on multiple prediction tasks without compromising target prediction +performance. + +
+
+
+
+
+ + ♻ ☆ An Empirical Investigation of the Role of Pre-training in Lifelong + Learning + + +
+ The lifelong learning paradigm in machine learning is an attractive +alternative to the more prominent isolated learning scheme not only due to its +resemblance to biological learning but also its potential to reduce energy +waste by obviating excessive model re-training. A key challenge to this +paradigm is the phenomenon of catastrophic forgetting. With the increasing +popularity and success of pre-trained models in machine learning, we pose the +question: What role does pre-training play in lifelong learning, specifically +with respect to catastrophic forgetting? We investigate existing methods in the +context of large, pre-trained models and evaluate their performance on a +variety of text and image classification tasks, including a large-scale study +using a novel data set of 15 diverse NLP tasks. Across all settings, we observe +that generic pre-training implicitly alleviates the effects of catastrophic +forgetting when learning multiple tasks sequentially compared to randomly +initialized models. We then further investigate why pre-training alleviates +forgetting in this setting. We study this phenomenon by analyzing the loss +landscape, finding that pre-trained weights appear to ease forgetting by +leading to wider minima. Based on this insight, we propose jointly optimizing +for current task loss and loss basin sharpness to explicitly encourage wider +basins during sequential fine-tuning. We show that this optimization approach +outperforms several state-of-the-art task-sequential continual learning +algorithms across multiple settings, occasionally even without retaining a +memory that scales in size with the number of tasks. + +
+
+
+
+
+ + ♻ ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking +closer to real-world applications, both in the nature of data tested ($X$: +multi-channel, multi-paged, multi-industry; $Y$: class distributions and label +set variety) and in classification tasks considered ($f$: multi-page document, +page stream, and document bundle classification, ...). We identify the lack of +public multi-page document classification datasets, formalize different +classification tasks arising in application scenarios, and motivate the value +of targeting efficient multi-page document representations. An experimental +study on proposed multi-page document classification datasets demonstrates that +current benchmarks have become irrelevant and need to be updated to evaluate +complete documents, as they naturally occur in practice. This reality check +also calls for more mature evaluation methodologies, covering calibration +evaluation, inference complexity (time-memory), and a range of realistic +distribution shifts (e.g., born-digital vs. scanning noise, shifting page +order). Our study ends on a hopeful note by recommending concrete avenues for +future improvements.} + +
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ♻ ☆ Learning Content-enhanced Mask Transformer for Domain Generalized + Urban-Scene Segmentation + + +
+ Domain-generalized urban-scene semantic segmentation (USSS) aims to learn +generalized semantic predictions across diverse urban-scene styles. Unlike +domain gap challenges, USSS is unique in that the semantic categories are often +similar in different urban scenes, while the styles can vary significantly due +to changes in urban landscapes, weather conditions, lighting, and other +factors. Existing approaches typically rely on convolutional neural networks +(CNNs) to learn the content of urban scenes. + In this paper, we propose a Content-enhanced Mask TransFormer (CMFormer) for +domain-generalized USSS. The main idea is to enhance the focus of the +fundamental component, the mask attention mechanism, in Transformer +segmentation models on content information. To achieve this, we introduce a +novel content-enhanced mask attention mechanism. It learns mask queries from +both the image feature and its down-sampled counterpart, as lower-resolution +image features usually contain more robust content information and are less +sensitive to style variations. These features are fused into a Transformer +decoder and integrated into a multi-resolution content-enhanced mask attention +learning scheme. + Extensive experiments conducted on various domain-generalized urban-scene +segmentation datasets demonstrate that the proposed CMFormer significantly +outperforms existing CNN-based methods for domain-generalized semantic +segmentation, achieving improvements of up to 14.00\% in terms of mIoU (mean +intersection over union). The source code for CMFormer will be made available +at this +\href{https://github.com/BiQiWHU/domain-generalized-urban-scene-segmentation}{repository}. + +
+
+ comment: 18 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Exploring the Relationship between Samples and Masks for Robust Defect + Localization + + +
+ Defect detection aims to detect and localize regions out of the normal +distribution.Previous approaches model normality and compare it with the input +to identify defective regions, potentially limiting their generalizability.This +paper proposes a one-stage framework that detects defective patterns directly +without the modeling process.This ability is adopted through the joint efforts +of three parties: a generative adversarial network (GAN), a newly proposed +scaled pattern loss, and a dynamic masked cycle-consistent auxiliary network. +Explicit information that could indicate the position of defects is +intentionally excluded to avoid learning any direct mapping.Experimental +results on the texture class of the challenging MVTec AD dataset show that the +proposed method is 2.9\% higher than the SOTA methods in F1-Score, while +substantially outperforming SOTA methods in generalizability. + +
+
+
+
+
+ + ♻ ☆ Few-shot $\mathbf{1/a}$ Anomalies Feedback : Damage Vision Mining + Opportunity and Embedding Feature Imbalance + + +
+ Over the past decade, previous balanced datasets have been used to advance +deep learning algorithms for industrial applications. In urban infrastructures +and living environments, damage data mining cannot avoid imbalanced data issues +because of rare unseen events and the high-quality status of improved +operations. For visual inspection, the deteriorated class acquired from the +surface of concrete and steel components are occasionally imbalanced. From +numerous related surveys, we conclude that imbalanced data problems can be +categorised into four types: 1) missing range of target and label valuables, 2) +majority-minority class imbalance, 3) foreground background of spatial +imbalance, and 4) long-tailed class of pixel-wise imbalance. Since 2015, many +imbalanced studies have been conducted using deep-learning approaches, +including regression, image classification, object detection, and semantic +segmentation. However, anomaly detection for imbalanced data is not well known. +In this study, we highlight a one-class anomaly detection application, whether +anomalous class or not, and demonstrate clear examples of imbalanced vision +datasets: medical disease, hazardous behaviour, material deterioration, plant +disease, river sludge, and disaster damage. We provide key results on the +advantage of damage-vision mining, hypothesising that the more effective the +range of the positive ratio, the higher the accuracy gain of the anomalies +feedback. In our imbalanced studies, compared with the balanced case with a +positive ratio of $1/1$, we find that there is an applicable positive ratio +$1/a$ where the accuracy is consistently high. However, the extremely +imbalanced range is from one shot to $1/2a$, the accuracy of which is inferior +to that of the applicable ratio. In contrast, with a positive ratio ranging +over $2/a$, it shifts in the over-mining phase without an effective gain in +accuracy. + +
+
+ comment: 34 pages, 53 figures, 28 tables +
+
+
+
+
+ + ♻ ☆ Reliable Multimodality Eye Disease Screening via Mixture of Student's t + Distributions MICCAI 2023 + + +
+ Multimodality eye disease screening is crucial in ophthalmology as it +integrates information from diverse sources to complement their respective +performances. However, the existing methods are weak in assessing the +reliability of each unimodality, and directly fusing an unreliable modality may +cause screening errors. To address this issue, we introduce a novel +multimodality evidential fusion pipeline for eye disease screening, EyeMoSt, +which provides a measure of confidence for unimodality and elegantly integrates +the multimodality information from a multi-distribution fusion perspective. +Specifically, our model estimates both local uncertainty for unimodality and +global uncertainty for the fusion modality to produce reliable classification +results. More importantly, the proposed mixture of Student's $t$ distributions +adaptively integrates different modalities to endow the model with heavy-tailed +properties, increasing robustness and reliability. Our experimental findings on +both public and in-house datasets show that our model is more reliable than +current methods. Additionally, EyeMost has the potential ability to serve as a +data quality discriminator, enabling reliable decision-making for multimodality +eye disease screening. + +
+
+ comment: MICCAI 2023 (Early accept):11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Uncertainty-inspired Open Set Learning for Retinal Anomaly + Identification + + +
+ Failure to recognize samples from the classes unseen during training is a +major limitation of artificial intelligence in the real-world implementation +for recognition and classification of retinal anomalies. We established an +uncertainty-inspired open-set (UIOS) model, which was trained with fundus +images of 9 retinal conditions. Besides assessing the probability of each +category, UIOS also calculated an uncertainty score to express its confidence. +Our UIOS model with thresholding strategy achieved an F1 score of 99.55%, +97.01% and 91.91% for the internal testing set, external target categories +(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1 +score of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS +correctly predicted high uncertainty scores, which would prompt the need for a +manual check in the datasets of non-target categories retinal diseases, +low-quality fundus images, and non-fundus images. UIOS provides a robust method +for real-world screening of retinal anomalies. + +
+
+
+
+
+ + ♻ ☆ TALL: Thumbnail Layout for Deepfake Video Detection ICCV 2023 + + +
+ The growing threats of deepfakes to society and cybersecurity have raised +enormous public concerns, and increasing efforts have been devoted to this +critical topic of deepfake video detection. Existing video methods achieve good +performance but are computationally intensive. This paper introduces a simple +yet effective strategy named Thumbnail Layout (TALL), which transforms a video +clip into a pre-defined layout to realize the preservation of spatial and +temporal dependencies. Specifically, consecutive frames are masked in a fixed +position in each frame to improve generalization, then resized to sub-images +and rearranged into a pre-defined layout as the thumbnail. TALL is +model-agnostic and extremely simple by only modifying a few lines of code. +Inspired by the success of vision transformers, we incorporate TALL into Swin +Transformer, forming an efficient and effective method TALL-Swin. Extensive +experiments on intra-dataset and cross-dataset validate the validity and +superiority of TALL and SOTA TALL-Swin. TALL-Swin achieves 90.79$\%$ AUC on the +challenging cross-dataset task, FaceForensics++ $\to$ Celeb-DF. The code is +available at https://github.com/rainy-xu/TALL4Deepfake. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Intracranial Hemorrhage Segmentation using Head-Wise + Gradient-Infused Self-Attention Maps from a Swin Transformer in Categorical + Learning + + +
+ Intracranial hemorrhage (ICH) is a life-threatening medical emergency that +requires timely and accurate diagnosis for effective treatment and improved +patient survival rates. While deep learning techniques have emerged as the +leading approach for medical image analysis and processing, the most commonly +employed supervised learning often requires large, high-quality annotated +datasets that can be costly to obtain, particularly for pixel/voxel-wise image +segmentation. To address this challenge and facilitate ICH treatment decisions, +we introduce a novel weakly supervised method for ICH segmentation, utilizing a +Swin transformer trained on an ICH classification task with categorical labels. +Our approach leverages a hierarchical combination of head-wise gradient-infused +self-attention maps to generate accurate image segmentation. Additionally, we +conducted an exploratory study on different learning strategies and showed that +binary ICH classification has a more positive impact on self-attention maps +compared to full ICH subtyping. With a mean Dice score of 0.44, our technique +achieved similar ICH segmentation performance as the popular U-Net and +Swin-UNETR models with full supervision and outperformed a similar weakly +supervised approach using GradCAM, demonstrating the excellent potential of the +proposed framework in challenging medical image segmentation tasks. Our code is +available at https://github.com/HealthX-Lab/HGI-SAM. + +
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2023:012 +
+
+
+
+
+ + ♻ ☆ TeViS:Translating Text Synopses to Video Storyboards + + +
+ A video storyboard is a roadmap for video creation which consists of +shot-by-shot images to visualize key plots in a text synopsis. Creating video +storyboards, however, remains challenging which not only requires cross-modal +association between high-level texts and images but also demands long-term +reasoning to make transitions smooth across shots. In this paper, we propose a +new task called Text synopsis to Video Storyboard (TeViS) which aims to +retrieve an ordered sequence of images as the video storyboard to visualize the +text synopsis. We construct a MovieNet-TeViS dataset based on the public +MovieNet dataset. It contains 10K text synopses each paired with keyframes +manually selected from corresponding movies by considering both relevance and +cinematic coherence. To benchmark the task, we present strong CLIP-based +baselines and a novel VQ-Trans. VQ-Trans first encodes text synopsis and images +into a joint embedding space and uses vector quantization (VQ) to improve the +visual representation. Then, it auto-regressively generates a sequence of +visual features for retrieval and ordering. Experimental results demonstrate +that VQ-Trans significantly outperforms prior methods and the CLIP-based +baselines. Nevertheless, there is still a large gap compared to human +performance suggesting room for promising future work. The code and data are +available at: \url{https://ruc-aimind.github.io/projects/TeViS/} + +
+
+ comment: Accepted to ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ Atlas-Based Interpretable Age Prediction In Whole-Body MR Images + + +
+ Age prediction is an important part of medical assessments and research. It +can aid in detecting diseases as well as abnormal ageing by highlighting the +discrepancy between chronological and biological age. To gain a comprehensive +understanding of age-related changes observed in various body parts, we +investigate them on a larger scale by using whole-body images. We utilise the +Grad-CAM interpretability method to determine the body areas most predictive of +a person's age. We expand our analysis beyond individual subjects by employing +registration techniques to generate population-wide interpretability maps. +Furthermore, we set state-of-the-art whole-body age prediction with a model +that achieves a mean absolute error of 2.76 years. Our findings reveal three +primary areas of interest: the spine, the autochthonous back muscles, and the +cardiac region, which exhibits the highest importance. + +
+
+
+
+
+ + ♻ ☆ Unified and Dynamic Graph for Temporal Character Grouping in Long Videos + + +
+ Video temporal character grouping locates appearing moments of major +characters within a video according to their identities. To this end, recent +works have evolved from unsupervised clustering to graph-based supervised +clustering. However, graph methods are built upon the premise of fixed affinity +graphs, bringing many inexact connections. Besides, they extract multi-modal +features with kinds of models, which are unfriendly to deployment. In this +paper, we present a unified and dynamic graph (UniDG) framework for temporal +character grouping. This is accomplished firstly by a unified representation +network that learns representations of multiple modalities within the same +space and still preserves the modality's uniqueness simultaneously. Secondly, +we present a dynamic graph clustering where the neighbors of different +quantities are dynamically constructed for each node via a cyclic matching +strategy, leading to a more reliable affinity graph. Thirdly, a progressive +association method is introduced to exploit spatial and temporal contexts among +different modalities, allowing multi-modal clustering results to be well fused. +As current datasets only provide pre-extracted features, we evaluate our UniDG +method on a collected dataset named MTCG, which contains each character's +appearing clips of face and body and speaking voice tracks. We also evaluate +our key components on existing clustering and retrieval datasets to verify the +generalization ability. Experimental results manifest that our method can +achieve promising results and outperform several state-of-the-art approaches. + +
+
+
+
+
+ + ♻ ☆ A Unified Query-based Paradigm for Camouflaged Instance Segmentation ACM MM2023 + + +
+ Due to the high similarity between camouflaged instances and the background, +the recently proposed camouflaged instance segmentation (CIS) faces challenges +in accurate localization and instance segmentation. To this end, inspired by +query-based transformers, we propose a unified query-based multi-task learning +framework for camouflaged instance segmentation, termed UQFormer, which builds +a set of mask queries and a set of boundary queries to learn a shared composed +query representation and efficiently integrates global camouflaged object +region and boundary cues, for simultaneous instance segmentation and instance +boundary detection in camouflaged scenarios. Specifically, we design a composed +query learning paradigm that learns a shared representation to capture object +region and boundary features by the cross-attention interaction of mask queries +and boundary queries in the designed multi-scale unified learning transformer +decoder. Then, we present a transformer-based multi-task learning framework for +simultaneous camouflaged instance segmentation and camouflaged instance +boundary detection based on the learned composed query representation, which +also forces the model to learn a strong instance-level query representation. +Notably, our model views the instance segmentation as a query-based direct set +prediction problem, without other post-processing such as non-maximal +suppression. Compared with 14 state-of-the-art approaches, our UQFormer +significantly improves the performance of camouflaged instance segmentation. +Our code will be available at https://github.com/dongbo811/UQFormer. + +
+
+ comment: This paper has been accepted by ACM MM2023 +
+
+
+
+
+ + ♻ ☆ Fast Neural Scene Flow + + +
+ Neural Scene Flow Prior (NSFP) is of significant interest to the vision +community due to its inherent robustness to out-of-distribution (OOD) effects +and its ability to deal with dense lidar points. The approach utilizes a +coordinate neural network to estimate scene flow at runtime, without any +training. However, it is up to 100 times slower than current state-of-the-art +learning methods. In other applications such as image, video, and radiance +function reconstruction innovations in speeding up the runtime performance of +coordinate networks have centered upon architectural changes. In this paper, we +demonstrate that scene flow is different -- with the dominant computational +bottleneck stemming from the loss function itself (i.e., Chamfer distance). +Further, we rediscover the distance transform (DT) as an efficient, +correspondence-free loss function that dramatically speeds up the runtime +optimization. Our fast neural scene flow (FNSF) approach reports for the first +time real-time performance comparable to learning methods, without any training +or OOD bias on two of the largest open autonomous driving (AV) lidar datasets +Waymo Open and Argoverse. + +
+
+ comment: 17 pages, 11 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Ensemble of Anchor-Free Models for Robust Bangla Document Layout + Segmentation + + +
+ In this research paper, we introduce a novel approach designed for the +purpose of segmenting the layout of Bangla documents. Our methodology involves +the utilization of a sophisticated ensemble of YOLOv8 models, which were +trained for the DL Sprint 2.0 - BUET CSE Fest 2023 Competition focused on +Bangla document layout segmentation. Our primary emphasis lies in enhancing +various aspects of the task, including techniques such as image augmentation, +model architecture, and the incorporation of model ensembles. We deliberately +reduce the quality of a subset of document images to enhance the resilience of +model training, thereby resulting in an improvement in our cross-validation +score. By employing Bayesian optimization, we determine the optimal confidence +and Intersection over Union (IoU) thresholds for our model ensemble. Through +our approach, we successfully demonstrate the effectiveness of anchor-free +models in achieving robust layout segmentation in Bangla documents. + +
+
+ comment: 4 pages, 5 figures, 6 Tables +
+
+
+
+
+ + ♻ ☆ HeadSculpt: Crafting 3D Head Avatars with Text + + +
+ Recently, text-guided 3D generative methods have made remarkable advancements +in producing high-quality textures and geometry, capitalizing on the +proliferation of large vision-language and image diffusion models. However, +existing methods still struggle to create high-fidelity 3D head avatars in two +aspects: (1) They rely mostly on a pre-trained text-to-image diffusion model +whilst missing the necessary 3D awareness and head priors. This makes them +prone to inconsistency and geometric distortions in the generated avatars. (2) +They fall short in fine-grained editing. This is primarily due to the inherited +limitations from the pre-trained 2D image diffusion models, which become more +pronounced when it comes to 3D head avatars. In this work, we address these +challenges by introducing a versatile coarse-to-fine pipeline dubbed HeadSculpt +for crafting (i.e., generating and editing) 3D head avatars from textual +prompts. Specifically, we first equip the diffusion model with 3D awareness by +leveraging landmark-based control and a learned textual embedding representing +the back view appearance of heads, enabling 3D-consistent head avatar +generations. We further propose a novel identity-aware editing score +distillation strategy to optimize a textured mesh with a high-resolution +differentiable rendering technique. This enables identity preservation while +following the editing instruction. We showcase HeadSculpt's superior fidelity +and editing capabilities through comprehensive experiments and comparisons with +existing methods. + +
+
+ comment: Webpage: https://brandonhan.uk/HeadSculpt/ +
+
+
+
+
+ + ♻ ☆ Deep Curvilinear Editing: Commutative and Nonlinear Image Manipulation + for Pretrained Deep Generative Model CVPR2023 + + +
+ Semantic editing of images is the fundamental goal of computer vision. +Although deep learning methods, such as generative adversarial networks (GANs), +are capable of producing high-quality images, they often do not have an +inherent way of editing generated images semantically. Recent studies have +investigated a way of manipulating the latent variable to determine the images +to be generated. However, methods that assume linear semantic arithmetic have +certain limitations in terms of the quality of image editing, whereas methods +that discover nonlinear semantic pathways provide non-commutative editing, +which is inconsistent when applied in different orders. This study proposes a +novel method called deep curvilinear editing (DeCurvEd) to determine semantic +commuting vector fields on the latent space. We theoretically demonstrate that +owing to commutativity, the editing of multiple attributes depends only on the +quantities and not on the order. Furthermore, we experimentally demonstrate +that compared to previous methods, the nonlinear and commutative nature of +DeCurvEd facilitates the disentanglement of image attributes and provides +higher-quality editing. + +
+
+ comment: 15 pages. The last update made no changes except for adding the + following link to the CVF repository: + https://openaccess.thecvf.com/content/CVPR2023/html/Aoshima_Deep_Curvilinear_Editing_Commutative_and_Nonlinear_Image_Manipulation_for_Pretrained_CVPR_2023_paper.html. + Here, you can find our code to reproduce our results +
+
+
+
+
+ + ♻ ☆ Learning Clothing and Pose Invariant 3D Shape Representation for + Long-Term Person Re-Identification ICCV 2023 + + +
+ Long-Term Person Re-Identification (LT-ReID) has become increasingly crucial +in computer vision and biometrics. In this work, we aim to extend LT-ReID +beyond pedestrian recognition to include a wider range of real-world human +activities while still accounting for cloth-changing scenarios over large time +gaps. This setting poses additional challenges due to the geometric +misalignment and appearance ambiguity caused by the diversity of human pose and +clothing. To address these challenges, we propose a new approach 3DInvarReID +for (i) disentangling identity from non-identity components (pose, clothing +shape, and texture) of 3D clothed humans, and (ii) reconstructing accurate 3D +clothed body shapes and learning discriminative features of naked body shapes +for person ReID in a joint manner. To better evaluate our study of LT-ReID, we +collect a real-world dataset called CCDA, which contains a wide variety of +human activities and clothing changes. Experimentally, we show the superior +performance of our approach for person ReID. + +
+
+ comment: 10 pages, 7 figures, accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR + Semantic Segmentation ICCV 2023 + + +
+ The ability to deploy robots that can operate safely in diverse environments +is crucial for developing embodied intelligent agents. As a community, we have +made tremendous progress in within-domain LiDAR semantic segmentation. However, +do these methods generalize across domains? To answer this question, we design +the first experimental setup for studying domain generalization (DG) for LiDAR +semantic segmentation (DG-LSS). Our results confirm a significant gap between +methods, evaluated in a cross-domain setting: for example, a model trained on +the source dataset (SemanticKITTI) obtains $26.53$ mIoU on the target data, +compared to $48.49$ mIoU obtained by the model trained on the target domain +(nuScenes). To tackle this gap, we propose the first method specifically +designed for DG-LSS, which obtains $34.88$ mIoU on the target domain, +outperforming all baselines. Our method augments a sparse-convolutional +encoder-decoder 3D segmentation network with an additional, dense 2D +convolutional decoder that learns to classify a birds-eye view of the point +cloud. This simple auxiliary task encourages the 3D network to learn features +that are robust to sensor placement shifts and resolution, and are transferable +across domains. With this work, we aim to inspire the community to develop and +evaluate future models in such cross-domain conditions. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Colab NAS: Obtaining lightweight task-specific convolutional neural + networks following Occam's razor + + +
+ The current trend of applying transfer learning from convolutional neural +networks (CNNs) trained on large datasets can be an overkill when the target +application is a custom and delimited problem, with enough data to train a +network from scratch. On the other hand, the training of custom and lighter +CNNs requires expertise, in the from-scratch case, and or high-end resources, +as in the case of hardware-aware neural architecture search (HW NAS), limiting +access to the technology by non-habitual NN developers. + For this reason, we present ColabNAS, an affordable HW NAS technique for +producing lightweight task-specific CNNs. Its novel derivative-free search +strategy, inspired by Occam's razor, allows to obtain state-of-the-art results +on the Visual Wake Word dataset, a standard TinyML benchmark, in just 3.1 GPU +hours using free online GPU services such as Google Colaboratory and Kaggle +Kernel. + +
+
+
+
+
+ + ♻ ☆ Sat2Density: Faithful Density Learning from Satellite-Ground Image Pairs ICCV 2023 + + +
+ This paper aims to develop an accurate 3D geometry representation of +satellite images using satellite-ground image pairs. Our focus is on the +challenging problem of 3D-aware ground-views synthesis from a satellite image. +We draw inspiration from the density field representation used in volumetric +neural rendering and propose a new approach, called Sat2Density. Our method +utilizes the properties of ground-view panoramas for the sky and non-sky +regions to learn faithful density fields of 3D scenes in a geometric +perspective. Unlike other methods that require extra depth information during +training, our Sat2Density can automatically learn accurate and faithful 3D +geometry via density representation without depth supervision. This advancement +significantly improves the ground-view panorama synthesis task. Additionally, +our study provides a new geometric perspective to understand the relationship +between satellite and ground-view images in 3D space. + +
+
+ comment: ICCV 2023, project page: https://sat2density.github.io/, code: + https://github.com/qianmingduowan/Sat2Density +
+
+
+
+
+ + ♻ ☆ Compositional Semantic Mix for Domain Adaptation in Point Cloud + Segmentation + + +
+ Deep-learning models for 3D point cloud semantic segmentation exhibit limited +generalization capabilities when trained and tested on data captured with +different sensors or in varying environments due to domain shift. Domain +adaptation methods can be employed to mitigate this domain shift, for instance, +by simulating sensor noise, developing domain-agnostic generators, or training +point cloud completion networks. Often, these methods are tailored for range +view maps or necessitate multi-modal input. In contrast, domain adaptation in +the image domain can be executed through sample mixing, which emphasizes input +data manipulation rather than employing distinct adaptation modules. In this +study, we introduce compositional semantic mixing for point cloud domain +adaptation, representing the first unsupervised domain adaptation technique for +point cloud segmentation based on semantic and geometric sample mixing. We +present a two-branch symmetric network architecture capable of concurrently +processing point clouds from a source domain (e.g. synthetic) and point clouds +from a target domain (e.g. real-world). Each branch operates within one domain +by integrating selected data fragments from the other domain and utilizing +semantic information derived from source labels and target (pseudo) labels. +Additionally, our method can leverage a limited number of human point-level +annotations (semi-supervised) to further enhance performance. We assess our +approach in both synthetic-to-real and real-to-real scenarios using LiDAR +datasets and demonstrate that it significantly outperforms state-of-the-art +methods in both unsupervised and semi-supervised settings. + +
+
+ comment: TPAMI. arXiv admin note: text overlap with arXiv:2207.09778 +
+
+
+
+
+ + ♻ ☆ Cross-Domain Few-Shot Classification via Inter-Source Stylization + + +
+ The goal of Cross-Domain Few-Shot Classification (CDFSC) is to accurately +classify a target dataset with limited labelled data by exploiting the +knowledge of a richly labelled auxiliary dataset, despite the differences +between the domains of the two datasets. Some existing approaches require +labelled samples from multiple domains for model training. However, these +methods fail when the sample labels are scarce. To overcome this challenge, +this paper proposes a solution that makes use of multiple source domains +without the need for additional labeling costs. Specifically, one of the source +domains is completely tagged, while the others are untagged. An Inter-Source +Stylization Network (ISSNet) is then introduced to enhance stylisation across +multiple source domains, enriching data distribution and model's generalization +capabilities. Experiments on 8 target datasets show that ISSNet leverages +unlabelled data from multiple source data and significantly reduces the +negative impact of domain gaps on classification performance compared to +several baseline methods. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ Open Gaze: Open Source eye tracker for smartphone devices using Deep + Learning + + +
+ Eye tracking has been a pivotal tool in diverse fields such as vision +research, language analysis, and usability assessment. The majority of prior +investigations, however, have concentrated on expansive desktop displays +employing specialized, costly eye tracking hardware that lacks scalability. +Remarkably little insight exists into ocular movement patterns on smartphones, +despite their widespread adoption and significant usage. In this manuscript, we +present an open-source implementation of a smartphone-based gaze tracker that +emulates the methodology proposed by a GooglePaper (whose source code remains +proprietary). Our focus is on attaining accuracy comparable to that attained +through the GooglePaper's methodology, without the necessity for supplementary +hardware. Through the integration of machine learning techniques, we unveil an +accurate eye tracking solution that is native to smartphones. Our approach +demonstrates precision akin to the state-of-the-art mobile eye trackers, which +are characterized by a cost that is two orders of magnitude higher. Leveraging +the vast MIT GazeCapture dataset, which is available through registration on +the dataset's website, we successfully replicate crucial findings from previous +studies concerning ocular motion behavior in oculomotor tasks and saliency +analyses during natural image observation. Furthermore, we emphasize the +applicability of smartphone-based gaze tracking in discerning reading +comprehension challenges. Our findings exhibit the inherent potential to +amplify eye movement research by significant proportions, accommodating +participation from thousands of subjects with explicit consent. This +scalability not only fosters advancements in vision research, but also extends +its benefits to domains such as accessibility enhancement and healthcare +applications. + +
+
+ comment: 26 pages , 15 figures +
+
+
+
+
+ + ♻ ☆ Learning A Coarse-to-Fine Diffusion Transformer for Image Restoration + + +
+ Recent years have witnessed the remarkable performance of diffusion models in +various vision tasks. However, for image restoration that aims to recover clear +images with sharper details from given degraded observations, diffusion-based +methods may fail to recover promising results due to inaccurate noise +estimation. Moreover, simple constraining noises cannot effectively learn +complex degradation information, which subsequently hinders the model capacity. +To solve the above problems, we propose a coarse-to-fine diffusion Transformer +(C2F-DFT) for image restoration. Specifically, our C2F-DFT contains diffusion +self-attention (DFSA) and diffusion feed-forward network (DFN) within a new +coarse-to-fine training scheme. The DFSA and DFN respectively capture the +long-range diffusion dependencies and learn hierarchy diffusion representation +to facilitate better restoration. In the coarse training stage, our C2F-DFT +estimates noises and then generates the final clean image by a sampling +algorithm. To further improve the restoration quality, we propose a simple yet +effective fine training scheme. It first exploits the coarse-trained diffusion +model with fixed steps to generate restoration results, which then would be +constrained with corresponding ground-truth ones to optimize the models to +remedy the unsatisfactory results affected by inaccurate noise estimation. +Extensive experiments show that C2F-DFT significantly outperforms +diffusion-based restoration method IR-SDE and achieves competitive performance +compared with Transformer-based state-of-the-art methods on $3$ tasks, +including deraining, deblurring, and real denoising. The code is available at +https://github.com/wlydlut/C2F-DFT. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Efficient Representation of Natural Image Patches + + +
+ In the complex domain of neural information processing, discerning +fundamental principles from ancillary details remains a significant challenge. +While there is extensive knowledge about the anatomy and physiology of the +early visual system, a comprehensive computational theory remains elusive. Can +we gain insights into the underlying principles of a biological system by +abstracting away from its detailed implementation and focusing on the +fundamental problems that the system is designed to solve? Utilizing an +abstract model based on minimal yet realistic assumptions, we show how to +achieve the early visual system's two ultimate objectives: efficient +information transmission and sensor probability distribution modeling. We show +that optimizing for information transmission does not yield optimal probability +distribution modeling. We illustrate, using a two-pixel (2D) system and image +patches, that an efficient representation can be realized via nonlinear +population code driven by two types of biologically plausible loss functions +that depend solely on output. After unsupervised learning, our abstract IPU +model bears remarkable resemblances to biological systems, despite not +mimicking many features of real neurons, such as spiking activity. A +preliminary comparison with a contemporary deep learning model suggests that +the IPU model offers a significant efficiency advantage. Our model provides +novel insights into the computational theory of early visual systems as well as +a potential new approach to enhance the efficiency of deep learning models. + +
+
+
+
+
+ + ♻ ☆ IntrinsicNeRF: Learning Intrinsic Neural Radiance Fields for Editable + Novel View Synthesis ICCV2023 + + +
+ Existing inverse rendering combined with neural rendering methods can only +perform editable novel view synthesis on object-specific scenes, while we +present intrinsic neural radiance fields, dubbed IntrinsicNeRF, which introduce +intrinsic decomposition into the NeRF-based neural rendering method and can +extend its application to room-scale scenes. Since intrinsic decomposition is a +fundamentally under-constrained inverse problem, we propose a novel +distance-aware point sampling and adaptive reflectance iterative clustering +optimization method, which enables IntrinsicNeRF with traditional intrinsic +decomposition constraints to be trained in an unsupervised manner, resulting in +multi-view consistent intrinsic decomposition results. To cope with the problem +that different adjacent instances of similar reflectance in a scene are +incorrectly clustered together, we further propose a hierarchical clustering +method with coarse-to-fine optimization to obtain a fast hierarchical indexing +representation. It supports compelling real-time augmented applications such as +recoloring and illumination variation. Extensive experiments and editing +samples on both object-specific/room-scale scenes and synthetic/real-word data +demonstrate that we can obtain consistent intrinsic decomposition results and +high-fidelity novel view synthesis even for challenging sequences. + +
+
+ comment: Accepted to ICCV2023, Project webpage: + https://zju3dv.github.io/intrinsic_nerf/, code: + https://github.com/zju3dv/IntrinsicNeRF +
+
+
+
+
+ + ♻ ☆ WALDO: Future Video Synthesis using Object Layer Decomposition and + Parametric Flow Prediction ICCV 2023 + + +
+ This paper presents WALDO (WArping Layer-Decomposed Objects), a novel +approach to the prediction of future video frames from past ones. Individual +images are decomposed into multiple layers combining object masks and a small +set of control points. The layer structure is shared across all frames in each +video to build dense inter-frame connections. Complex scene motions are modeled +by combining parametric geometric transformations associated with individual +layers, and video synthesis is broken down into discovering the layers +associated with past frames, predicting the corresponding transformations for +upcoming ones and warping the associated object regions accordingly, and +filling in the remaining image parts. Extensive experiments on multiple +benchmarks including urban videos (Cityscapes and KITTI) and videos featuring +nonrigid motions (UCF-Sports and H3.6M), show that our method consistently +outperforms the state of the art by a significant margin in every case. Code, +pretrained models, and video samples synthesized by our approach can be found +in the project webpage https://16lemoing.github.io/waldo. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Sample4Geo: Hard Negative Sampling For Cross-View Geo-Localisation + + +
+ Cross-View Geo-Localisation is still a challenging task where additional +modules, specific pre-processing or zooming strategies are necessary to +determine accurate positions of images. Since different views have different +geometries, pre-processing like polar transformation helps to merge them. +However, this results in distorted images which then have to be rectified. +Adding hard negatives to the training batch could improve the overall +performance but with the default loss functions in geo-localisation it is +difficult to include them. In this article, we present a simplified but +effective architecture based on contrastive learning with symmetric InfoNCE +loss that outperforms current state-of-the-art results. Our framework consists +of a narrow training pipeline that eliminates the need of using aggregation +modules, avoids further pre-processing steps and even increases the +generalisation capability of the model to unknown regions. We introduce two +types of sampling strategies for hard negatives. The first explicitly exploits +geographically neighboring locations to provide a good starting point. The +second leverages the visual similarity between the image embeddings in order to +mine hard negative samples. Our work shows excellent performance on common +cross-view datasets like CVUSA, CVACT, University-1652 and VIGOR. A comparison +between cross-area and same-area settings demonstrate the good generalisation +capability of our model. + +
+
+
+
+
+ + ♻ ☆ Confidence Attention and Generalization Enhanced Distillation for + Continuous Video Domain Adaptation + + +
+ Continuous Video Domain Adaptation (CVDA) is a scenario where a source model +is required to adapt to a series of individually available changing target +domains continuously without source data or target supervision. It has wide +applications, such as robotic vision and autonomous driving. The main +underlying challenge of CVDA is to learn helpful information only from the +unsupervised target data while avoiding forgetting previously learned knowledge +catastrophically, which is out of the capability of previous Video-based +Unsupervised Domain Adaptation methods. Therefore, we propose a +Confidence-Attentive network with geneRalization enhanced self-knowledge +disTillation (CART) to address the challenge in CVDA. Firstly, to learn from +unsupervised domains, we propose to learn from pseudo labels. However, in +continuous adaptation, prediction errors can accumulate rapidly in pseudo +labels, and CART effectively tackles this problem with two key modules. +Specifically, The first module generates refined pseudo labels using model +predictions and deploys a novel attentive learning strategy. The second module +compares the outputs of augmented data from the current model to the outputs of +weakly augmented data from the source model, forming a novel consistency +regularization on the model to alleviate the accumulation of prediction errors. +Extensive experiments suggest that the CVDA performance of CART outperforms +existing methods by a considerable margin. + +
+
+ comment: 16 pages, 9 tables, 10 figures +
+
+
+
+
+ + ♻ ☆ A Conditional Denoising Diffusion Probabilistic Model for Radio + Interferometric Image Reconstruction ECAI 2023 + + +
+ In radio astronomy, signals from radio telescopes are transformed into images +of observed celestial objects, or sources. However, these images, called dirty +images, contain real sources as well as artifacts due to signal sparsity and +other factors. Therefore, radio interferometric image reconstruction is +performed on dirty images, aiming to produce clean images in which artifacts +are reduced and real sources are recovered. So far, existing methods have +limited success on recovering faint sources, preserving detailed structures, +and eliminating artifacts. In this paper, we present VIC-DDPM, a Visibility and +Image Conditioned Denoising Diffusion Probabilistic Model. Our main idea is to +use both the original visibility data in the spectral domain and dirty images +in the spatial domain to guide the image generation process with DDPM. This +way, we can leverage DDPM to generate fine details and eliminate noise, while +utilizing visibility data to separate signals from noise and retaining spatial +information in dirty images. We have conducted experiments in comparison with +both traditional methods and recent deep learning based approaches. Our results +show that our method significantly improves the resulting images by reducing +artifacts, preserving fine details, and recovering dim sources. This +advancement further facilitates radio astronomical data analysis tasks on +celestial phenomena. + +
+
+ comment: Accepted by ECAI 2023 +
+
+
+
+
+ + ♻ ☆ Risk-optimized Outlier Removal for Robust Point Cloud Classification + + +
+ With the growth of 3D sensing technology, deep learning system for 3D point +clouds has become increasingly important, especially in applications like +autonomous vehicles where safety is a primary concern. However, there are also +growing concerns about the reliability of these systems when they encounter +noisy point clouds, whether occurring naturally or introduced with malicious +intent. This paper highlights the challenges of point cloud classification +posed by various forms of noise, from simple background noise to malicious +backdoor attacks that can intentionally skew model predictions. While there's +an urgent need for optimized point cloud denoising, current point outlier +removal approaches, an essential step for denoising, rely heavily on +handcrafted strategies and are not adapted for higher-level tasks, such as +classification. To address this issue, we introduce an innovative point outlier +cleansing method that harnesses the power of downstream classification models. +By employing gradient-based attribution analysis, we define a novel concept: +point risk. Drawing inspiration from tail risk minimization in finance, we +recast the outlier removal process as an optimization problem, named PointCVaR. +Extensive experiments show that our proposed technique not only robustly +filters diverse point cloud outliers but also consistently and significantly +enhances existing robust methods for point cloud classification. + +
+
+
+
+
+ + ♻ ☆ DiffusionDepth: Diffusion Denoising Approach for Monocular Depth + Estimation + + +
+ Monocular depth estimation is a challenging task that predicts the pixel-wise +depth from a single 2D image. Current methods typically model this problem as a +regression or classification task. We propose DiffusionDepth, a new approach +that reformulates monocular depth estimation as a denoising diffusion process. +It learns an iterative denoising process to `denoise' random depth distribution +into a depth map with the guidance of monocular visual conditions. The process +is performed in the latent space encoded by a dedicated depth encoder and +decoder. Instead of diffusing ground truth (GT) depth, the model learns to +reverse the process of diffusing the refined depth of itself into random depth +distribution. This self-diffusion formulation overcomes the difficulty of +applying generative models to sparse GT depth scenarios. The proposed approach +benefits this task by refining depth estimation step by step, which is superior +for generating accurate and highly detailed depth maps. Experimental results on +KITTI and NYU-Depth-V2 datasets suggest that a simple yet efficient diffusion +approach could reach state-of-the-art performance in both indoor and outdoor +scenarios with acceptable inference time. + +
+
+
+
+
+ + ♻ ☆ Bi-Modality Medical Image Synthesis Using Semi-Supervised Sequential + Generative Adversarial Networks + + +
+ In this paper, we propose a bi-modality medical image synthesis approach +based on sequential generative adversarial network (GAN) and semi-supervised +learning. Our approach consists of two generative modules that synthesize +images of the two modalities in a sequential order. A method for measuring the +synthesis complexity is proposed to automatically determine the synthesis order +in our sequential GAN. Images of the modality with a lower complexity are +synthesized first, and the counterparts with a higher complexity are generated +later. Our sequential GAN is trained end-to-end in a semi-supervised manner. In +supervised training, the joint distribution of bi-modality images are learned +from real paired images of the two modalities by explicitly minimizing the +reconstruction losses between the real and synthetic images. To avoid +overfitting limited training images, in unsupervised training, the marginal +distribution of each modality is learned based on unpaired images by minimizing +the Wasserstein distance between the distributions of real and fake images. We +comprehensively evaluate the proposed model using two synthesis tasks based on +three types of evaluate metrics and user studies. Visual and quantitative +results demonstrate the superiority of our method to the state-of-the-art +methods, and reasonable visual quality and clinical significance. Code is made +publicly available at +https://github.com/hustlinyi/Multimodal-Medical-Image-Synthesis. + +
+
+
+
+
+ + ♻ ☆ Anticipating Driving Behavior through Deep Learning-Based Policy + Prediction + + +
+ In this endeavor, we developed a comprehensive system that processes +integrated visual features derived from video frames captured by a regular +camera, along with depth details obtained from a point cloud scanner. This +system is designed to anticipate driving actions, encompassing both vehicle +speed and steering angle. To ensure its reliability, we conducted assessments +where we juxtaposed the projected outcomes with the established norms adhered +to by skilled real-world drivers. Our evaluation outcomes indicate that the +forecasts achieve a noteworthy level of accuracy in a minimum of half the test +scenarios (ranging around 50-80%, contingent on the specific model). Notably, +the utilization of amalgamated features yielded superior performance in +comparison to using video frames in isolation, as demonstrated by most of the +cases. + +
+
+ comment: 5 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ All-in-SAM: from Weak Annotation to Pixel-wise Nuclei Segmentation with + Prompt-based Finetuning + + +
+ The Segment Anything Model (SAM) is a recently proposed prompt-based +segmentation model in a generic zero-shot segmentation approach. With the +zero-shot segmentation capacity, SAM achieved impressive flexibility and +precision on various segmentation tasks. However, the current pipeline requires +manual prompts during the inference stage, which is still resource intensive +for biomedical image segmentation. In this paper, instead of using prompts +during the inference stage, we introduce a pipeline that utilizes the SAM, +called all-in-SAM, through the entire AI development workflow (from annotation +generation to model finetuning) without requiring manual prompts during the +inference stage. Specifically, SAM is first employed to generate pixel-level +annotations from weak prompts (e.g., points, bounding box). Then, the +pixel-level annotations are used to finetune the SAM segmentation model rather +than training from scratch. Our experimental results reveal two key findings: +1) the proposed pipeline surpasses the state-of-the-art (SOTA) methods in a +nuclei segmentation task on the public Monuseg dataset, and 2) the utilization +of weak and few annotations for SAM finetuning achieves competitive performance +compared to using strong pixel-wise annotated data. + +
+
+
+
+
+ + ♻ ☆ High-Resolution Document Shadow Removal via A Large-Scale Real-World + Dataset and A Frequency-Aware Shadow Erasing Net ICCV2023 + + +
+ Shadows often occur when we capture the documents with casual equipment, +which influences the visual quality and readability of the digital copies. +Different from the algorithms for natural shadow removal, the algorithms in +document shadow removal need to preserve the details of fonts and figures in +high-resolution input. Previous works ignore this problem and remove the +shadows via approximate attention and small datasets, which might not work in +real-world situations. We handle high-resolution document shadow removal +directly via a larger-scale real-world dataset and a carefully designed +frequency-aware network. As for the dataset, we acquire over 7k couples of +high-resolution (2462 x 3699) images of real-world document pairs with various +samples under different lighting circumstances, which is 10 times larger than +existing datasets. As for the design of the network, we decouple the +high-resolution images in the frequency domain, where the low-frequency details +and high-frequency boundaries can be effectively learned via the carefully +designed network structure. Powered by our network and dataset, the proposed +method clearly shows a better performance than previous methods in terms of +visual quality and numerical results. The code, models, and dataset are +available at: https://github.com/CXH-Research/DocShadow-SD7K + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ ROSIA: Rotation-Search-Based Star Identification Algorithm + + +
+ This paper presents a rotation-search-based approach for addressing the star +identification (Star-ID) problem. The proposed algorithm, ROSIA, is a +heuristics-free algorithm that seeks the optimal rotation that maximally aligns +the input and catalog stars in their respective coordinates. ROSIA searches the +rotation space systematically with the Branch-and-Bound (BnB) method. Crucially +affecting the runtime feasibility of ROSIA is the upper bound function that +prioritizes the search space. In this paper, we make a theoretical contribution +by proposing a tight (provable) upper bound function that enables a 400x +speed-up compared to an existing formulation. Coupling the bounding function +with an efficient evaluation scheme that leverages stereographic projection and +the R-tree data structure, ROSIA achieves feasible operational speed on +embedded processors with state-of-the-art performances under different sources +of noise. The source code of ROSIA is available at +https://github.com/ckchng/ROSIA. + +
+
+ comment: 21 pages, 16 figures, Accepted to IEEE Transactions on Aerospace and + Electronic Systems +
+
+
+
+
+ + ♻ ☆ Real-time Strawberry Detection Based on Improved YOLOv5s Architecture + for Robotic Harvesting in open-field environment + + +
+ This study proposed a YOLOv5-based custom object detection model to detect +strawberries in an outdoor environment. The original architecture of the +YOLOv5s was modified by replacing the C3 module with the C2f module in the +backbone network, which provided a better feature gradient flow. Secondly, the +Spatial Pyramid Pooling Fast in the final layer of the backbone network of +YOLOv5s was combined with Cross Stage Partial Net to improve the generalization +ability over the strawberry dataset in this study. The proposed architecture +was named YOLOv5s-Straw. The RGB images dataset of the strawberry canopy with +three maturity classes (immature, nearly mature, and mature) was collected in +open-field environment and augmented through a series of operations including +brightness reduction, brightness increase, and noise adding. To verify the +superiority of the proposed method for strawberry detection in open-field +environment, four competitive detection models (YOLOv3-tiny, YOLOv5s, +YOLOv5s-C2f, and YOLOv8s) were trained, and tested under the same computational +environment and compared with YOLOv5s-Straw. The results showed that the +highest mean average precision of 80.3% was achieved using the proposed +architecture whereas the same was achieved with YOLOv3-tiny, YOLOv5s, +YOLOv5s-C2f, and YOLOv8s were 73.4%, 77.8%, 79.8%, 79.3%, respectively. +Specifically, the average precision of YOLOv5s-Straw was 82.1% in the immature +class, 73.5% in the nearly mature class, and 86.6% in the mature class, which +were 2.3% and 3.7%, respectively, higher than that of the latest YOLOv8s. The +model included 8.6*10^6 network parameters with an inference speed of 18ms per +image while the inference speed of YOLOv8s had a slower inference speed of +21.0ms and heavy parameters of 11.1*10^6, which indicates that the proposed +model is fast enough for real time strawberry detection and localization for +the robotic picking. + +
+
+ comment: 20 pages; 15 figures +
+
+
+
+
+ + ♻ ☆ Benchmarking Robustness of AI-Enabled Multi-sensor Fusion Systems: + Challenges and Opportunities + + +
+ Multi-Sensor Fusion (MSF) based perception systems have been the foundation +in supporting many industrial applications and domains, such as self-driving +cars, robotic arms, and unmanned aerial vehicles. Over the past few years, the +fast progress in data-driven artificial intelligence (AI) has brought a +fast-increasing trend to empower MSF systems by deep learning techniques to +further improve performance, especially on intelligent systems and their +perception systems. Although quite a few AI-enabled MSF perception systems and +techniques have been proposed, up to the present, limited benchmarks that focus +on MSF perception are publicly available. Given that many intelligent systems +such as self-driving cars are operated in safety-critical contexts where +perception systems play an important role, there comes an urgent need for a +more in-depth understanding of the performance and reliability of these MSF +systems. To bridge this gap, we initiate an early step in this direction and +construct a public benchmark of AI-enabled MSF-based perception systems +including three commonly adopted tasks (i.e., object detection, object +tracking, and depth completion). Based on this, to comprehensively understand +MSF systems' robustness and reliability, we design 14 common and realistic +corruption patterns to synthesize large-scale corrupted datasets. We further +perform a systematic evaluation of these systems through our large-scale +evaluation. Our results reveal the vulnerability of the current AI-enabled MSF +perception systems, calling for researchers and practitioners to take +robustness and reliability into account when designing AI-enabled MSF. + +
+
+ comment: To appear in ESEC/FSE 2023 +
+
+
+
+
+ + ♻ ☆ Streaming Object Detection on Fisheye Cameras for Automatic Parking + + +
+ Fisheye cameras are widely employed in automatic parking, and the video +stream object detection (VSOD) of the fisheye camera is a fundamental +perception function to ensure the safe operation of vehicles. In past research +work, the difference between the output of the deep learning model and the +actual situation at the current moment due to the existence of delay of the +perception system is generally ignored. But the environment will inevitably +change within the delay time which may cause a potential safety hazard. In this +paper, we propose a real-time detection framework equipped with a dual-flow +perception module (dynamic and static flows) that can predict the future and +alleviate the time-lag problem. Meanwhile, we use a new scheme to evaluate +latency and accuracy. The standard bounding box is unsuitable for the object in +fisheye camera images due to the strong radial distortion of the fisheye camera +and the primary detection objects of parking perception are vehicles and +pedestrians, so we adopt the rotate bounding box and propose a new periodic +angle loss function to regress the angle of the box, which is the simple and +accurate representation method of objects. The instance segmentation ground +truth is used to supervise the training. Experiments demonstrate the +effectiveness of our approach. Code is released at: +https://gitee.com/hiyanyx/fisheye-streaming-perception. + +
+
+
+
+
+ + ♻ ☆ Human from Blur: Human Pose Tracking from Blurry Images + + +
+ We propose a method to estimate 3D human poses from substantially blurred +images. The key idea is to tackle the inverse problem of image deblurring by +modeling the forward problem with a 3D human model, a texture map, and a +sequence of poses to describe human motion. The blurring process is then +modeled by a temporal image aggregation step. Using a differentiable renderer, +we can solve the inverse problem by backpropagating the pixel-wise reprojection +error to recover the best human motion representation that explains a single or +multiple input images. Since the image reconstruction loss alone is +insufficient, we present additional regularization terms. To the best of our +knowledge, we present the first method to tackle this problem. Our method +consistently outperforms other methods on significantly blurry inputs since +they lack one or multiple key functionalities that our method unifies, i.e. +image deblurring with sub-frame accuracy and explicit 3D modeling of non-rigid +human motion. + +
+
+
+
+
+ + ♻ ☆ Parkinson gait modelling from an anomaly deep representation + + +
+ Parkinson's Disease (PD) is associated with gait movement disorders, such as +bradykinesia, stiffness, tremors and postural instability, caused by +progressive dopamine deficiency. Today, some approaches have implemented +learning representations to quantify kinematic patterns during locomotion, +supporting clinical procedures such as diagnosis and treatment planning. These +approaches assumes a large amount of stratified and labeled data to optimize +discriminative representations. Nonetheless these considerations may restrict +the approaches to be operable in real scenarios during clinical practice. This +work introduces a self-supervised generative representation to learn +gait-motion-related patterns, under the pretext of video reconstruction and an +anomaly detection framework. This architecture is trained following a one-class +weakly supervised learning to avoid inter-class variance and approach the +multiple relationships that represent locomotion. The proposed approach was +validated with 14 PD patients and 23 control subjects, and trained with the +control population only, achieving an AUC of 95%, homocedasticity level of 70% +and shapeness level of 70% in the classification task considering its +generalization. + +
+
+ comment: Journal not submitted to any editorial +
+
+
+
+
+ + ♻ ☆ GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content SP + + +
+ The mobile cloud gaming industry has been rapidly growing over the last +decade. When streaming gaming videos are transmitted to customers' client +devices from cloud servers, algorithms that can monitor distorted video quality +without having any reference video available are desirable tools. However, +creating No-Reference Video Quality Assessment (NR VQA) models that can +accurately predict the quality of streaming gaming videos rendered by computer +graphics engines is a challenging problem, since gaming content generally +differs statistically from naturalistic videos, often lacks detail, and +contains many smooth regions. Until recently, the problem has been further +complicated by the lack of adequate subjective quality databases of mobile +gaming content. We have created a new gaming-specific NR VQA model called the +Gaming Video Quality Evaluator (GAMIVAL), which combines and leverages the +advantages of spatial and temporal gaming distorted scene statistics models, a +neural noise model, and deep semantic features. Using a support vector +regression (SVR) as a regressor, GAMIVAL achieves superior performance on the +new LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database. + +
+
+ comment: Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been + made available online: https://github.com/lskdream/GAMIVAL +
+
+
+
+
+ + ♻ ☆ Robust affine point matching via quadratic assignment on Grassmannians + + +
+ Robust Affine matching with Grassmannians (RAG) is a new algorithm to perform +affine registration of point clouds. The algorithm is based on minimizing the +Frobenius distance between two elements of the Grassmannian. For this purpose, +an indefinite relaxation of the Quadratic Assignment Problem (QAP) is used, and +several approaches to affine feature matching are studied and compared. +Experiments demonstrate that RAG is more robust to noise and point discrepancy +than previous methods. + +
+
+ comment: 8 pages, 23 figures; GitHub repository at + (https://github.com/sashakolpakov/rag) +
+
+
+
+
+ + ♻ ☆ GazeGNN: A Gaze-Guided Graph Neural Network for Chest X-ray + Classification WACV 2024 + + +
+ Eye tracking research is important in computer vision because it can help us +understand how humans interact with the visual world. Specifically for +high-risk applications, such as in medical imaging, eye tracking can help us to +comprehend how radiologists and other medical professionals search, analyze, +and interpret images for diagnostic and clinical purposes. Hence, the +application of eye tracking techniques in disease classification has become +increasingly popular in recent years. Contemporary works usually transform gaze +information collected by eye tracking devices into visual attention maps (VAMs) +to supervise the learning process. However, this is a time-consuming +preprocessing step, which stops us from applying eye tracking to radiologists' +daily work. To solve this problem, we propose a novel gaze-guided graph neural +network (GNN), GazeGNN, to leverage raw eye-gaze data without being converted +into VAMs. In GazeGNN, to directly integrate eye gaze into image +classification, we create a unified representation graph that models both +images and gaze pattern information. With this benefit, we develop a real-time, +real-world, end-to-end disease classification algorithm for the first time in +the literature. This achievement demonstrates the practicality and feasibility +of integrating real-time eye tracking techniques into the daily work of +radiologists. To our best knowledge, GazeGNN is the first work that adopts GNN +to integrate image and eye-gaze data. Our experiments on the public chest X-ray +dataset show that our proposed method exhibits the best classification +performance compared to existing methods. The code is available at +https://github.com/ukaukaaaa/GazeGNN. + +
+
+ comment: WACV 2024 +
+
+
+
+
+ + ♻ ☆ CASSPR: Cross Attention Single Scan Place Recognition ICCV2023 + + +
+ Place recognition based on point clouds (LiDAR) is an important component for +autonomous robots or self-driving vehicles. Current SOTA performance is +achieved on accumulated LiDAR submaps using either point-based or voxel-based +structures. While voxel-based approaches nicely integrate spatial context +across multiple scales, they do not exhibit the local precision of point-based +methods. As a result, existing methods struggle with fine-grained matching of +subtle geometric features in sparse single-shot Li- DAR scans. To overcome +these limitations, we propose CASSPR as a method to fuse point-based and +voxel-based approaches using cross attention transformers. CASSPR leverages a +sparse voxel branch for extracting and aggregating information at lower +resolution and a point-wise branch for obtaining fine-grained local +information. CASSPR uses queries from one branch to try to match structures in +the other branch, ensuring that both extract self-contained descriptors of the +point cloud (rather than one branch dominating), but using both to inform the +output global descriptor of the point cloud. Extensive experiments show that +CASSPR surpasses the state-of-the-art by a large margin on several datasets +(Oxford RobotCar, TUM, USyd). For instance, it achieves AR@1 of 85.6% on the +TUM dataset, surpassing the strongest prior model by ~15%. Our code is publicly +available. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ MetaCOG: Learning a Metacognition to Recover What Objects Are Actually + There + + +
+ Humans not only form representations about the world based on what we see, +but also learn meta-cognitive representations about how our own vision works. +This enables us to recognize when our vision is unreliable (e.g., when we +realize that we are experiencing a visual illusion) and enables us to question +what we see. Inspired by this human capacity, we present MetaCOG: a model that +increases the robustness of object detectors by learning representations of +their reliability, and does so without feedback. Specifically, MetaCOG is a +hierarchical probabilistic model that expresses a joint distribution over the +objects in a 3D scene and the outputs produced by a detector. When paired with +an off-the-shelf object detector, MetaCOG takes detections as input and infers +the detector's tendencies to miss objects of certain categories and to +hallucinate objects that are not actually present, all without access to +ground-truth object labels. When paired with three modern neural object +detectors, MetaCOG learns useful and accurate meta-cognitive representations, +resulting in improved performance on the detection task. Additionally, we show +that MetaCOG is robust to varying levels of error in the detections. Our +results are a proof-of-concept for a novel approach to the problem of +correcting a faulty vision system's errors. The model code, datasets, results, +and demos are available: +https://osf.io/8b9qt/?view_only=8c1b1c412c6b4e1697e3c7859be2fce6 + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Robust Long-Tailed Learning via Label-Aware Bounded CVaR + + +
+ Data in the real-world classification problems are always imbalanced or +long-tailed, wherein the majority classes have the most of the samples that +dominate the model training. In such setting, the naive model tends to have +poor performance on the minority classes. Previously, a variety of loss +modifications have been proposed to address the long-tailed leaning problem, +while these methods either treat the samples in the same class +indiscriminatingly or lack a theoretical guarantee. In this paper, we propose +two novel approaches based on CVaR (Conditional Value at Risk) to improve the +performance of long-tailed learning with a solid theoretical ground. +Specifically, we firstly introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss +to overcome the pessimistic result of the original CVaR, and further design the +optimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we +additionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to +stabilize the optimization process, where we also offer the theoretical +support. Extensive experiments on real-world datasets with long-tailed label +distributions verify the superiority of our proposed methods. + +
+
+
+
+
+ + ☆ A Multi-Perspective Learning to Rank Approach to Support Children's + Information Seeking in the Classroom + + +
+ We introduce a novel re-ranking model that aims to augment the functionality +of standard search engines to support classroom search activities for children +(ages 6 to 11). This model extends the known listwise learning-to-rank +framework by balancing risk and reward. Doing so enables the model to +prioritize Web resources of high educational alignment, appropriateness, and +adequate readability by analyzing the URLs, snippets, and page titles of Web +resources retrieved by a given mainstream search engine. Experimental results, +including an ablation study and comparisons with existing baselines, showcase +the correctness of the proposed model. The outcomes of this work demonstrate +the value of considering multiple perspectives inherent to the classroom +setting, e.g., educational alignment, readability, and objectionability, when +applied to the design of algorithms that can better support children's +information discovery. + +
+
+ comment: Extended version of the manuscript to appear in proceedings of the + 22nd IEEE/WIC International Conference on Web Intelligence and Intelligent + Agent Technology +
+
+
+
+
+ + ☆ Knowledge-based Multiple Adaptive Spaces Fusion for Recommendation + + +
+ Since Knowledge Graphs (KGs) contain rich semantic information, recently +there has been an influx of KG-enhanced recommendation methods. Most of +existing methods are entirely designed based on euclidean space without +considering curvature. However, recent studies have revealed that a tremendous +graph-structured data exhibits highly non-euclidean properties. Motivated by +these observations, in this work, we propose a knowledge-based multiple +adaptive spaces fusion method for recommendation, namely MCKG. Unlike existing +methods that solely adopt a specific manifold, we introduce the unified space +that is compatible with hyperbolic, euclidean and spherical spaces. +Furthermore, we fuse the multiple unified spaces in an attention manner to +obtain the high-quality embeddings for better knowledge propagation. In +addition, we propose a geometry-aware optimization strategy which enables the +pull and push processes benefited from both hyperbolic and spherical spaces. +Specifically, in hyperbolic space, we set smaller margins in the area near to +the origin, which is conducive to distinguishing between highly similar +positive items and negative ones. At the same time, we set larger margins in +the area far from the origin to ensure the model has sufficient error +tolerance. The similar manner also applies to spherical spaces. Extensive +experiments on three real-world datasets demonstrate that the MCKG has a +significant improvement over state-of-the-art recommendation methods. Further +ablation experiments verify the importance of multi-space fusion and +geometry-aware optimization strategy, justifying the rationality and +effectiveness of MCKG. + +
+
+
+
+
+ + ☆ Classification-Aware Neural Topic Model Combined With Interpretable + Analysis -- For Conflict Classification + + +
+ A large number of conflict events are affecting the world all the time. In +order to analyse such conflict events effectively, this paper presents a +Classification-Aware Neural Topic Model (CANTM-IA) for Conflict Information +Classification and Topic Discovery. The model provides a reliable +interpretation of classification results and discovered topics by introducing +interpretability analysis. At the same time, interpretation is introduced into +the model architecture to improve the classification performance of the model +and to allow interpretation to focus further on the details of the data. +Finally, the model architecture is optimised to reduce the complexity of the +model. + +
+
+ comment: Accepted by RANLP 2023 +
+
+
+
+
+ + ☆ Providing Previously Unseen Users Fair Recommendations Using Variational + Autoencoders RecSys 2023 + + +
+ An emerging definition of fairness in machine learning requires that models +are oblivious to demographic user information, e.g., a user's gender or age +should not influence the model. Personalized recommender systems are +particularly prone to violating this definition through their explicit user +focus and user modelling. Explicit user modelling is also an aspect that makes +many recommender systems incapable of providing hitherto unseen users with +recommendations. We propose novel approaches for mitigating discrimination in +Variational Autoencoder-based recommender systems by limiting the encoding of +demographic information. The approaches are capable of, and evaluated on, +providing users that are not represented in the training data with fair +recommendations. + +
+
+ comment: Appearing in RecSys 2023 proceedings +
+
+
+
+
+ + ☆ CAGRA: Highly Parallel Graph Construction and Approximate Nearest + Neighbor Search for GPUs + + +
+ Approximate Nearest Neighbor Search (ANNS) plays a critical role in various +disciplines spanning data mining and artificial intelligence, from information +retrieval and computer vision to natural language processing and recommender +systems. Data volumes have soared in recent years and the computational cost of +an exhaustive exact nearest neighbor search is often prohibitive, necessitating +the adoption of approximate techniques. The balanced performance and recall of +graph-based approaches have more recently garnered significant attention in +ANNS algorithms, however, only a few studies have explored harnessing the power +of GPUs and multi-core processors despite the widespread use of massively +parallel and general-purpose computing. To bridge this gap, we introduce a +novel parallel computing hardware-based proximity graph and search algorithm. +By leveraging the high-performance capabilities of modern hardware, our +approach achieves remarkable efficiency gains. In particular, our method +surpasses existing CPU and GPU-based methods in constructing the proximity +graph, demonstrating higher throughput in both large- and small-batch searches +while maintaining compatible accuracy. In graph construction time, our method, +CAGRA, is 2.2~27x faster than HNSW, which is one of the CPU SOTA +implementations. In large-batch query throughput in the 90% to 95% recall +range, our method is 33~77x faster than HNSW, and is 3.8~8.8x faster than the +SOTA implementations for GPU. For a single query, our method is 3.4~53x faster +than HNSW at 95% recall. + +
+
+
+
+
+ + ☆ Killing two birds with one stone: Can an audio captioning system also be + used for audio-text retrieval? + + +
+ Automated Audio Captioning (AAC) aims to develop systems capable of +describing an audio recording using a textual sentence. In contrast, Audio-Text +Retrieval (ATR) systems seek to find the best matching audio recording(s) for a +given textual query (Text-to-Audio) or vice versa (Audio-to-Text). These tasks +require different types of systems: AAC employs a sequence-to-sequence model, +while ATR utilizes a ranking model that compares audio and text representations +within a shared projection subspace. However, this work investigates the +relationship between AAC and ATR by exploring the ATR capabilities of an +unmodified AAC system, without fine-tuning for the new task. Our AAC system +consists of an audio encoder (ConvNeXt-Tiny) trained on AudioSet for audio +tagging, and a transformer decoder responsible for generating sentences. For +AAC, it achieves a high SPIDEr-FL score of 0.298 on Clotho and 0.472 on +AudioCaps on average. For ATR, we propose using the standard Cross-Entropy loss +values obtained for any audio/caption pair. Experimental results on the Clotho +and AudioCaps datasets demonstrate decent recall values using this simple +approach. For instance, we obtained a Text-to-Audio R@1 value of 0.382 for +Au-dioCaps, which is above the current state-of-the-art method without external +data. Interestingly, we observe that normalizing the loss values was necessary +for Audio-to-Text retrieval. + +
+
+ comment: cam ready version (14/08/23) +
+
+
+
+
+ + ☆ STEC: See-Through Transformer-based Encoder for CTR Prediction + + +
+ Click-Through Rate (CTR) prediction holds a pivotal place in online +advertising and recommender systems since CTR prediction performance directly +influences the overall satisfaction of the users and the revenue generated by +companies. Even so, CTR prediction is still an active area of research since it +involves accurately modelling the preferences of users based on sparse and +high-dimensional features where the higher-order interactions of multiple +features can lead to different outcomes. Most CTR prediction models have relied +on a single fusion and interaction learning strategy. The few CTR prediction +models that have utilized multiple interaction modelling strategies have +treated each interaction to be self-contained. In this paper, we propose a +novel model named STEC that reaps the benefits of multiple interaction learning +approaches in a single unified architecture. Additionally, our model introduces +residual connections from different orders of interactions which boosts the +performance by allowing lower level interactions to directly affect the +predictions. Through extensive experiments on four real-world datasets, we +demonstrate that STEC outperforms existing state-of-the-art approaches for CTR +prediction thanks to its greater expressive capabilities. + +
+
+
+
+
+ + ☆ Improving Neural Ranking Models with Traditional IR Methods + + +
+ Neural ranking methods based on large transformer models have recently gained +significant attention in the information retrieval community, and have been +adopted by major commercial solutions. Nevertheless, they are computationally +expensive to create, and require a great deal of labeled data for specialized +corpora. In this paper, we explore a low resource alternative which is a +bag-of-embedding model for document retrieval and find that it is competitive +with large transformer models fine tuned on information retrieval tasks. Our +results show that a simple combination of TF-IDF, a traditional keyword +matching method, with a shallow embedding model provides a low cost path to +compete well with the performance of complex neural ranking models on 3 +datasets. Furthermore, adding TF-IDF measures improves the performance of +large-scale fine tuned models on these tasks. + +
+
+ comment: Short paper, 4 pages +
+
+
+
+
+ + ☆ CAPS: A Practical Partition Index for Filtered Similarity Search + + +
+ With the surging popularity of approximate near-neighbor search (ANNS), +driven by advances in neural representation learning, the ability to serve +queries accompanied by a set of constraints has become an area of intense +interest. While the community has recently proposed several algorithms for +constrained ANNS, almost all of these methods focus on integration with +graph-based indexes, the predominant class of algorithms achieving +state-of-the-art performance in latency-recall tradeoffs. In this work, we take +a different approach and focus on developing a constrained ANNS algorithm via +space partitioning as opposed to graphs. To that end, we introduce Constrained +Approximate Partitioned Search (CAPS), an index for ANNS with filters via space +partitions that not only retains the benefits of a partition-based algorithm +but also outperforms state-of-the-art graph-based constrained search techniques +in recall-latency tradeoffs, with only 10% of the index size. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Continual Learning for Generative Retrieval over Dynamic Corpora CIKM 2023 + + +
+ Generative retrieval (GR) directly predicts the identifiers of relevant +documents (i.e., docids) based on a parametric model. It has achieved solid +performance on many ad-hoc retrieval tasks. So far, these tasks have assumed a +static document collection. In many practical scenarios, however, document +collections are dynamic, where new documents are continuously added to the +corpus. The ability to incrementally index new documents while preserving the +ability to answer queries with both previously and newly indexed relevant +documents is vital to applying GR models. In this paper, we address this +practical continual learning problem for GR. We put forward a novel +Continual-LEarner for generatiVE Retrieval (CLEVER) model and make two major +contributions to continual learning for GR: (i) To encode new documents into +docids with low computational cost, we present Incremental Product +Quantization, which updates a partial quantization codebook according to two +adaptive thresholds; and (ii) To memorize new documents for querying without +forgetting previous knowledge, we propose a memory-augmented learning +mechanism, to form meaningful connections between old and new documents. +Empirical results demonstrate the effectiveness and efficiency of the proposed +model. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ☆ Vector Search with OpenAI Embeddings: Lucene Is All You Need + + +
+ We provide a reproducible, end-to-end demonstration of vector search with +OpenAI embeddings using Lucene on the popular MS MARCO passage ranking test +collection. The main goal of our work is to challenge the prevailing narrative +that a dedicated vector store is necessary to take advantage of recent advances +in deep neural networks as applied to search. Quite the contrary, we show that +hierarchical navigable small-world network (HNSW) indexes in Lucene are +adequate to provide vector search capabilities in a standard bi-encoder +architecture. This suggests that, from a simple cost-benefit analysis, there +does not appear to be a compelling reason to introduce a dedicated vector store +into a modern "AI stack" for search, since such applications have already +received substantial investments in existing, widely deployed infrastructure. + +
+
+
+
+
+ + ☆ Ensuring User-side Fairness in Dynamic Recommender Systems + + +
+ User-side group fairness is crucial for modern recommender systems, as it +aims to alleviate performance disparity between groups of users defined by +sensitive attributes such as gender, race, or age. We find that the disparity +tends to persist or even increase over time. This calls for effective ways to +address user-side fairness in a dynamic environment, which has been +infrequently explored in the literature. However, fairness-constrained +re-ranking, a typical method to ensure user-side fairness (i.e., reducing +performance disparity), faces two fundamental challenges in the dynamic +setting: (1) non-differentiability of the ranking-based fairness constraint, +which hinders the end-to-end training paradigm, and (2) time-inefficiency, +which impedes quick adaptation to changes in user preferences. In this paper, +we propose FAir Dynamic rEcommender (FADE), an end-to-end framework with +fine-tuning strategy to dynamically alleviate performance disparity. To tackle +the above challenges, FADE uses a novel fairness loss designed to be +differentiable and lightweight to fine-tune model parameters to ensure both +user-side fairness and high-quality recommendations. Via extensive experiments +on the real-world dataset, we empirically demonstrate that FADE effectively and +efficiently reduces performance disparity, and furthermore, FADE improves +overall recommendation quality over time compared to not using any new data. + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Dimensionality Reduction Using pseudo-Boolean polynomials For Cluster + Analysis + + +
+ We introduce usage of a reduction property of penalty-based formulation of +pseudo-Boolean polynomials as a mechanism for invariant dimensionality +reduction in cluster analysis processes. In our experiments, we show that +multidimensional data, like 4-dimensional Iris Flower dataset can be reduced to +2-dimensional space while the 30-dimensional Wisconsin Diagnostic Breast Cancer +(WDBC) dataset can be reduced to 3-dimensional space, and by searching lines or +planes that lie between reduced samples we can extract clusters in a linear and +unbiased manner with competitive accuracies, reproducibility and clear +interpretation. + +
+
+ comment: 14 pages, 4 figures, submitted to the International Conference Data + Analysis, Optimization and Their Applications on the Occasion of Boris + Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region, + Moscow Institute of Physics and Technology + https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php +
+
+
+
+
+ + ☆ Chunked Lists versus Extensible Arrays for Text Inversion + + +
+ In our 2017 work on in-memory list-based text inversion [Hawking and +Billerbeck. Efficient In-Memory, List-Based Text Inversion. ADCS 2017] we +compared memory use and indexing speed of a considerable number of variants of +chunked linked lists. In the present work we compare the best performing of +those variants (FBB - dynamic Fibonacci chunking) with the extensible SQ array +technique (SQA) presented in [Moffat and Mackenzie. Immediate-Access Indexing +Using Space-Efficient Extensible Arrays. ADCS 2023]. + +
+
+ comment: 2 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a meticulously curated, comprehensive +instruction dataset expressly designed for the biomolecular realm. +Mol-Instructions is composed of three pivotal components: molecule-oriented +instructions, protein-oriented instructions, and biomolecular text +instructions, each curated to enhance the understanding and prediction +capabilities of LLMs concerning biomolecular features and behaviors. Through +extensive instruction tuning experiments on the representative LLM, we +underscore the potency of Mol-Instructions to enhance the adaptability and +cognitive acuity of large models within the complex sphere of biomolecular +studies, thereby promoting advancements in the biomolecular research community. +Mol-Instructions is made publicly accessible for future research endeavors and +will be subjected to continual updates for enhanced applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions. Add + quantitative evaluations +
+
+
+
+
+ + ♻ ☆ Political Sentiment Analysis of Persian Tweets Using CNN-LSTM Model + + +
+ Sentiment analysis is the process of identifying and categorizing people's +emotions or opinions regarding various topics. The analysis of Twitter +sentiment has become an increasingly popular topic in recent years. In this +paper, we present several machine learning and a deep learning model to +analysis sentiment of Persian political tweets. Our analysis was conducted +using Bag of Words and ParsBERT for word representation. We applied Gaussian +Naive Bayes, Gradient Boosting, Logistic Regression, Decision Trees, Random +Forests, as well as a combination of CNN and LSTM to classify the polarities of +tweets. The results of this study indicate that deep learning with ParsBERT +embedding performs better than machine learning. The CNN-LSTM model had the +highest classification accuracy with 89 percent on the first dataset and 71 +percent on the second dataset. Due to the complexity of Persian, it was a +difficult task to achieve this level of efficiency. The main objective of our +research was to reduce the training time while maintaining the model's +performance. As a result, several adjustments were made to the model +architecture and parameters. In addition to achieving the objective, the +performance was slightly improved as well. + +
+
+
+
+
+ + ♻ ☆ Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language + Models + + +
+ Dense retrieval (DR) converts queries and documents into dense embeddings and +measures the similarity between queries and documents in vector space. One of +the challenges in DR is the lack of domain-specific training data. While DR +models can learn from large-scale public datasets like MS MARCO through +transfer learning, evidence shows that not all DR models and domains can +benefit from transfer learning equally. Recently, some researchers have +resorted to large language models (LLMs) to improve the zero-shot and few-shot +DR models. However, the hard prompts or human-written prompts utilized in these +works cannot guarantee the good quality of generated weak queries. To tackle +this, we propose soft prompt tuning for augmenting DR (SPTAR): For each task, +we leverage soft prompt-tuning to optimize a task-specific soft prompt on +limited ground truth data and then prompt the LLMs to tag unlabeled documents +with weak queries, yielding enough weak document-query pairs to train +task-specific dense retrievers. We design a filter to select high-quality +example document-query pairs in the prompt to further improve the quality of +weak tagged queries. To the best of our knowledge, there is no prior work +utilizing soft prompt tuning to augment DR models. The experiments demonstrate +that SPTAR outperforms the unsupervised baselines BM25 and the recently +proposed LLMs-based augmentation method for DR. + +
+
+ comment: fix typos +
+
+
+
+
+ + ♻ ☆ Dual-Granularity Contrastive Learning for Session-based Recommendation + + +
+ Session-based recommendation systems(SBRS) are more suitable for the current +e-commerce and streaming media recommendation scenarios and thus have become a +hot topic. The data encountered by SBRS is typically highly sparse, which also +serves as one of the bottlenecks limiting the accuracy of recommendations. So +Contrastive Learning(CL) is applied in SBRS owing to its capability of +improving embedding learning under the condition of sparse data. However, +existing CL strategies are limited in their ability to enforce finer-grained +(e.g., factor-level) comparisons and, as a result, are unable to capture subtle +differences between instances. More than that, these strategies usually use +item or segment dropout as a means of data augmentation which may result in +sparser data and thus ineffective self-supervised signals. By addressing the +two aforementioned limitations, we introduce a novel multi-granularity CL +framework. Specifically, two extra augmented embedding convolution channels +with different granularities are constructed and the embeddings learned by them +are compared with those learned from original view to complete the CL tasks. At +factor-level, we employ Disentangled Representation Learning to obtain +finer-grained data(e.g. factor-level embeddings), with which we can construct +factor-level convolution channels. At item-level, the star graph is deployed as +the augmented data and graph convolution on it can ensure the effectiveness of +self-supervised signals. Compare the learned embeddings of these two views with +the learned embeddings of the basic view to achieve CL at two granularities. +Finally, the more precise item-level and factor-level embeddings obtained are +referenced to generate personalized recommendations for the user. The proposed +model is validated through extensive experiments on two benchmark datasets, +showcasing superior performance compared to existing methods. + +
+
+
+
+
+ + ♻ ☆ RecXplainer: Amortized Attribute-based Personalized Explanations for + Recommender Systems NeurIPS 2022 + + +
+ Recommender systems influence many of our interactions in the digital world +-- impacting how we shop for clothes, sorting what we see when browsing YouTube +or TikTok, and determining which restaurants and hotels we are shown when using +hospitality platforms. Modern recommender systems are large, opaque models +trained on a mixture of proprietary and open-source datasets. Naturally, issues +of trust arise on both the developer and user side: is the system working +correctly, and why did a user receive (or not receive) a particular +recommendation? Providing an explanation alongside a recommendation alleviates +some of these concerns. The status quo for auxiliary recommender system +feedback is either user-specific explanations (e.g., "users who bought item B +also bought item A") or item-specific explanations (e.g., "we are recommending +item A because you watched/bought item B"). However, users bring personalized +context into their search experience, valuing an item as a function of that +item's attributes and their own personal preferences. In this work, we propose +RecXplainer, a novel method for generating fine-grained explanations based on a +user's preferences over the attributes of recommended items. We evaluate +RecXplainer on five real-world and large-scale recommendation datasets using +five different kinds of recommender systems to demonstrate the efficacy of +RecXplainer in capturing users' preferences over item attributes and using them +to explain recommendations. We also compare RecXplainer to five baselines and +show RecXplainer's exceptional performance on ten metrics. + +
+
+ comment: Awarded the Best Student Paper at TEA Workshop at NeurIPS 2022 +
+
+
+
+
+
+
+
+ + Machine Learning 157 + +
+
+
+ + ☆ 3D Adversarial Augmentations for Robust Out-of-Domain Predictions + + +
+ Since real-world training datasets cannot properly sample the long tail of +the underlying data distribution, corner cases and rare out-of-domain samples +can severely hinder the performance of state-of-the-art models. This problem +becomes even more severe for dense tasks, such as 3D semantic segmentation, +where points of non-standard objects can be confidently associated to the wrong +class. In this work, we focus on improving the generalization to out-of-domain +data. We achieve this by augmenting the training set with adversarial examples. +First, we learn a set of vectors that deform the objects in an adversarial +fashion. To prevent the adversarial examples from being too far from the +existing data distribution, we preserve their plausibility through a series of +constraints, ensuring sensor-awareness and shapes smoothness. Then, we perform +adversarial augmentation by applying the learned sample-independent vectors to +the available objects when training a model. We conduct extensive experiments +across a variety of scenarios on data from KITTI, Waymo, and CrashD for 3D +object detection, and on data from SemanticKITTI, Waymo, and nuScenes for 3D +semantic segmentation. Despite training on a standard single dataset, our +approach substantially improves the robustness and generalization of both 3D +object detection and 3D semantic segmentation methods to out-of-domain data. + +
+
+ comment: 37 pages, 12 figures +
+
+
+
+
+ + ☆ An Adaptive Tangent Feature Perspective of Neural Networks + + +
+ In order to better understand feature learning in neural networks, we propose +a framework for understanding linear models in tangent feature space where the +features are allowed to be transformed during training. We consider linear +transformations of features, resulting in a joint optimization over parameters +and transformations with a bilinear interpolation constraint. We show that this +optimization problem has an equivalent linearly constrained optimization with +structured regularization that encourages approximately low rank solutions. +Specializing to neural network structure, we gain insights into how the +features and thus the kernel function change, providing additional nuance to +the phenomenon of kernel alignment when the target function is poorly +represented using tangent features. In addition to verifying our theoretical +observations in real neural networks on a simple regression problem, we +empirically show that an adaptive feature implementation of tangent feature +classification has an order of magnitude lower sample complexity than the fixed +tangent feature model on MNIST and CIFAR-10. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ Policy composition in reinforcement learning via multi-objective policy + optimization + + +
+ We enable reinforcement learning agents to learn successful behavior policies +by utilizing relevant pre-existing teacher policies. The teacher policies are +introduced as objectives, in addition to the task objective, in a +multi-objective policy optimization setting. Using the Multi-Objective Maximum +a Posteriori Policy Optimization algorithm +\citep{abdolmaleki2020distributional}, we show that teacher policies can help +speed up learning, particularly in the absence of shaping rewards. In two +domains with continuous observation and action spaces, our agents successfully +compose teacher policies in sequence and in parallel, and are also able to +further extend the policies of the teachers in order to solve the task. + Depending on the specified combination of task and teacher(s), teacher(s) may +naturally act to limit the final performance of an agent. The extent to which +agents are required to adhere to teacher policies are determined by +hyperparameters which determine both the effect of teachers on learning speed +and the eventual performance of the agent on the task. In the {\tt humanoid} +domain \citep{deepmindcontrolsuite2018}, we also equip agents with the ability +to control the selection of teachers. With this ability, agents are able to +meaningfully compose from the teacher policies to achieve a superior task +reward on the {\tt walk} task than in cases without access to the teacher +policies. We show the resemblance of composed task policies with the +corresponding teacher policies through videos. + +
+
+
+
+
+ + ☆ Input margins can predict generalization too + + +
+ Understanding generalization in deep neural networks is an active area of +research. A promising avenue of exploration has been that of margin +measurements: the shortest distance to the decision boundary for a given sample +or its representation internal to the network. While margins have been shown to +be correlated with the generalization ability of a model when measured at its +hidden representations (hidden margins), no such link between large margins and +generalization has been established for input margins. We show that while input +margins are not generally predictive of generalization, they can be if the +search space is appropriately constrained. We develop such a measure based on +input margins, which we refer to as `constrained margins'. The predictive power +of this new measure is demonstrated on the 'Predicting Generalization in Deep +Learning' (PGDL) dataset and contrasted with hidden representation margins. We +find that constrained margins achieve highly competitive scores and outperform +other margin measurements in general. This provides a novel insight on the +relationship between generalization and classification margins, and highlights +the importance of considering the data manifold for investigations of +generalization in DNNs. + +
+
+
+
+
+ + ☆ A Comparative Study of Loss Functions: Traffic Predictions in Regular + and Congestion Scenarios + + +
+ Spatiotemporal graph neural networks have achieved state-of-the-art +performance in traffic forecasting. However, they often struggle to forecast +congestion accurately due to the limitations of traditional loss functions. +While accurate forecasting of regular traffic conditions is crucial, a reliable +AI system must also accurately forecast congestion scenarios to maintain safe +and efficient transportation. In this paper, we explore various loss functions +inspired by heavy tail analysis and imbalanced classification problems to +address this issue. We evaluate the efficacy of these loss functions in +forecasting traffic speed, with an emphasis on congestion scenarios. Through +extensive experiments on real-world traffic datasets, we discovered that when +optimizing for Mean Absolute Error (MAE), the MAE-Focal Loss function stands +out as the most effective. When optimizing Mean Squared Error (MSE), Gumbel +Loss proves to be the superior choice. These choices effectively forecast +traffic congestion events without compromising the accuracy of regular traffic +speed forecasts. This research enhances deep learning models' capabilities in +forecasting sudden speed changes due to congestion and underscores the need for +more research in this direction. By elevating the accuracy of congestion +forecasting, we advocate for AI systems that are reliable, secure, and +resilient in practical traffic management scenarios. + +
+
+
+
+
+ + ☆ Canonical Factors for Hybrid Neural Fields ICCV 2023 + + +
+ Factored feature volumes offer a simple way to build more compact, efficient, +and intepretable neural fields, but also introduce biases that are not +necessarily beneficial for real-world data. In this work, we (1) characterize +the undesirable biases that these architectures have for axis-aligned signals +-- they can lead to radiance field reconstruction differences of as high as 2 +PSNR -- and (2) explore how learning a set of canonicalizing transformations +can improve representations by removing these biases. We prove in a +two-dimensional model problem that simultaneously learning these +transformations together with scene appearance succeeds with drastically +improved efficiency. We validate the resulting architectures, which we call +TILTED, using image, signed distance, and radiance field reconstruction tasks, +where we observe improvements across quality, robustness, compactness, and +runtime. Results demonstrate that TILTED can enable capabilities comparable to +baselines that are 2x larger, while highlighting weaknesses of neural field +evaluation procedures. + +
+
+ comment: ICCV 2023. Project webpage: https://brentyi.github.io/tilted/ +
+
+
+
+
+ + ☆ From SMOTE to Mixup for Deep Imbalanced Classification + + +
+ Given imbalanced data, it is hard to train a good classifier using deep +learning because of the poor generalization of minority classes. Traditionally, +the well-known synthetic minority oversampling technique (SMOTE) for data +augmentation, a data mining approach for imbalanced learning, has been used to +improve this generalization. However, it is unclear whether SMOTE also benefits +deep learning. In this work, we study why the original SMOTE is insufficient +for deep learning, and enhance SMOTE using soft labels. Connecting the +resulting soft SMOTE with Mixup, a modern data augmentation technique, leads to +a unified framework that puts traditional and modern data augmentation +techniques under the same umbrella. A careful study within this framework shows +that Mixup improves generalization by implicitly achieving uneven margins +between majority and minority classes. We then propose a novel margin-aware +Mixup technique that more explicitly achieves uneven margins. Extensive +experimental results demonstrate that our proposed technique yields +state-of-the-art performance on deep imbalanced classification while achieving +superior performance on extremely imbalanced data. The code is open-sourced in +our developed package https://github.com/ntucllab/imbalanced-DL to foster +future research in this direction. + +
+
+ comment: 25 pages, 3 figures +
+
+
+
+
+ + ☆ When Do Program-of-Thoughts Work for Reasoning? + + +
+ The reasoning capabilities of Large Language Models (LLMs) play a pivotal +role in the realm of embodied artificial intelligence. Although there are +effective methods like program-of-thought prompting for LLMs which uses +programming language to tackle complex reasoning tasks, the specific impact of +code data on the improvement of reasoning capabilities remains under-explored. +To address this gap, we propose complexity-impacted reasoning score (CIRS), +which combines structural and logical attributes, to measure the correlation +between code and reasoning abilities. Specifically, we use the abstract syntax +tree to encode the structural information and calculate logical complexity by +considering the difficulty and the cyclomatic complexity. Through an empirical +analysis, we find not all code data of complexity can be learned or understood +by LLMs. Optimal level of complexity is critical to the improvement of +reasoning abilities by program-aided prompting. Then we design an +auto-synthesizing and stratifying algorithm, and apply it to instruction +generation for mathematical reasoning and code data filtering for code +generation tasks. Extensive results demonstrates the effectiveness of our +proposed approach. Code will be integrated into the EasyInstruct framework at +https://github.com/zjunlp/EasyInstruct. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Random feature approximation for general spectral methods + + +
+ Random feature approximation is arguably one of the most popular techniques +to speed up kernel methods in large scale algorithms and provides a theoretical +approach to the analysis of deep neural networks. We analyze generalization +properties for a large class of spectral regularization methods combined with +random features, containing kernel methods with implicit regularization such as +gradient descent or explicit methods like Tikhonov regularization. For our +estimators we obtain optimal learning rates over regularity classes (even for +classes that are not included in the reproducing kernel Hilbert space), which +are defined through appropriate source conditions. This improves or completes +previous results obtained in related settings for specific kernel algorithms. + +
+
+
+
+
+ + ☆ Probabilistic solar flare forecasting using historical magnetogram data + + +
+ Solar flare forecasting research using machine learning (ML) has focused on +high resolution magnetogram data from the SDO/HMI era covering Solar Cycle 24 +and the start of Solar Cycle 25, with some efforts looking back to SOHO/MDI for +data from Solar Cycle 23. In this paper, we consider over 4 solar cycles of +daily historical magnetogram data from multiple instruments. This is the first +attempt to take advantage of this historical data for ML-based flare +forecasting. We apply a convolutional neural network (CNN) to extract features +from full-disk magnetograms together with a logistic regression model to +incorporate scalar features based on magnetograms and flaring history. We use +an ensemble approach to generate calibrated probabilistic forecasts of M-class +or larger flares in the next 24 hours. Overall, we find that including +historical data improves forecasting skill and reliability. We show that single +frame magnetograms do not contain significantly more relevant information than +can be summarized in a small number of scalar features, and that flaring +history has greater predictive power than our CNN-extracted features. This +indicates the importance of including temporal information in flare forecasting +models. + +
+
+ comment: 22 pages, 16 figures, accepted to ApJ +
+
+
+
+
+ + ☆ Robust Long-Tailed Learning via Label-Aware Bounded CVaR + + +
+ Data in the real-world classification problems are always imbalanced or +long-tailed, wherein the majority classes have the most of the samples that +dominate the model training. In such setting, the naive model tends to have +poor performance on the minority classes. Previously, a variety of loss +modifications have been proposed to address the long-tailed leaning problem, +while these methods either treat the samples in the same class +indiscriminatingly or lack a theoretical guarantee. In this paper, we propose +two novel approaches based on CVaR (Conditional Value at Risk) to improve the +performance of long-tailed learning with a solid theoretical ground. +Specifically, we firstly introduce a Label-Aware Bounded CVaR (LAB-CVaR) loss +to overcome the pessimistic result of the original CVaR, and further design the +optimal weight bounds for LAB-CVaR theoretically. Based on LAB-CVaR, we +additionally propose a LAB-CVaR with logit adjustment (LAB-CVaR-logit) loss to +stabilize the optimization process, where we also offer the theoretical +support. Extensive experiments on real-world datasets with long-tailed label +distributions verify the superiority of our proposed methods. + +
+
+
+
+
+ + ☆ The CausalBench challenge: A machine learning contest for gene network + inference from single-cell perturbation data + + +
+ In drug discovery, mapping interactions between genes within cellular systems +is a crucial early step. This helps formulate hypotheses regarding molecular +mechanisms that could potentially be targeted by future medicines. The +CausalBench Challenge was an initiative to invite the machine learning +community to advance the state of the art in constructing gene-gene interaction +networks. These networks, derived from large-scale, real-world datasets of +single cells under various perturbations, are crucial for understanding the +causal mechanisms underlying disease biology. Using the framework provided by +the CausalBench benchmark, participants were tasked with enhancing the capacity +of the state of the art methods to leverage large-scale genetic perturbation +data. This report provides an analysis and summary of the methods submitted +during the challenge to give a partial image of the state of the art at the +time of the challenge. The winning solutions significantly improved performance +compared to previous baselines, establishing a new state of the art for this +critical task in biology and medicine. + +
+
+
+
+
+ + ☆ Decentralized Multi-agent Reinforcement Learning based State-of-Charge + Balancing Strategy for Distributed Energy Storage System + + +
+ This paper develops a Decentralized Multi-Agent Reinforcement Learning +(Dec-MARL) method to solve the SoC balancing problem in the distributed energy +storage system (DESS). First, the SoC balancing problem is formulated into a +finite Markov decision process with action constraints derived from demand +balance, which can be solved by Dec-MARL. Specifically, the first-order average +consensus algorithm is utilized to expand the observations of the DESS state in +a fully-decentralized way, and the initial actions (i.e., output power) are +decided by the agents (i.e., energy storage units) according to these +observations. In order to get the final actions in the allowable range, a +counterfactual demand balance algorithm is proposed to balance the total demand +and the initial actions. Next, the agents execute the final actions and get +local rewards from the environment, and the DESS steps into the next state. +Finally, through the first-order average consensus algorithm, the agents get +the average reward and the expended observation of the next state for later +training. By the above procedure, Dec-MARL reveals outstanding performance in a +fully-decentralized system without any expert experience or constructing any +complicated model. Besides, it is flexible and can be extended to other +decentralized multi-agent systems straightforwardly. Extensive simulations have +validated the effectiveness and efficiency of Dec-MARL. + +
+
+
+
+
+ + ☆ Shape-Margin Knowledge Augmented Network for Thyroid Nodule Segmentation + and Diagnosis + + +
+ Thyroid nodule segmentation is a crucial step in the diagnostic procedure of +physicians and computer-aided diagnosis systems. Mostly, current studies treat +segmentation and diagnosis as independent tasks without considering the +correlation between these tasks. The sequence steps of these independent tasks +in computer-aided diagnosis systems may lead to the accumulation of errors. +Therefore, it is worth combining them as a whole through exploring the +relationship between thyroid nodule segmentation and diagnosis. According to +the thyroid imaging reporting and data system (TI-RADS), the assessment of +shape and margin characteristics is the prerequisite for the discrimination of +benign and malignant thyroid nodules. These characteristics can be observed in +the thyroid nodule segmentation masks. Inspired by the diagnostic procedure of +TI-RADS, this paper proposes a shape-margin knowledge augmented network +(SkaNet) for simultaneously thyroid nodule segmentation and diagnosis. Due to +the similarity in visual features between segmentation and diagnosis, SkaNet +shares visual features in the feature extraction stage and then utilizes a +dual-branch architecture to perform thyroid nodule segmentation and diagnosis +tasks simultaneously. To enhance effective discriminative features, an +exponential mixture module is devised, which incorporates convolutional feature +maps and self-attention maps by exponential weighting. Then, SkaNet is jointly +optimized by a knowledge augmented multi-task loss function with a constraint +penalty term. It embeds shape and margin characteristics through numerical +computation and models the relationship between the thyroid nodule diagnosis +results and segmentation masks. + +
+
+
+
+
+ + ☆ Multi-Response Heteroscedastic Gaussian Process Models and Their + Inference + + +
+ Despite the widespread utilization of Gaussian process models for versatile +nonparametric modeling, they exhibit limitations in effectively capturing +abrupt changes in function smoothness and accommodating relationships with +heteroscedastic errors. Addressing these shortcomings, the heteroscedastic +Gaussian process (HeGP) regression seeks to introduce flexibility by +acknowledging the variability of residual variances across covariates in the +regression model. In this work, we extend the HeGP concept, expanding its scope +beyond regression tasks to encompass classification and state-space models. To +achieve this, we propose a novel framework where the Gaussian process is +coupled with a covariate-induced precision matrix process, adopting a mixture +formulation. This approach enables the modeling of heteroscedastic covariance +functions across covariates. To mitigate the computational challenges posed by +sampling, we employ variational inference to approximate the posterior and +facilitate posterior predictive modeling. Additionally, our training process +leverages an EM algorithm featuring closed-form M-step updates to efficiently +evaluate the heteroscedastic covariance function. A notable feature of our +model is its consistent performance on multivariate responses, accommodating +various types (continuous or categorical) seamlessly. Through a combination of +simulations and real-world applications in climatology, we illustrate the +model's prowess and advantages. By overcoming the limitations of traditional +Gaussian process models, our proposed framework offers a robust and versatile +tool for a wide array of applications. + +
+
+ comment: submitted to the Journal of the American Statistical Association + (JASA) +
+
+
+
+
+ + ☆ Efficient Model Personalization in Federated Learning via + Client-Specific Prompt Generation ICCV 2023 + + +
+ Federated learning (FL) emerges as a decentralized learning framework which +trains models from multiple distributed clients without sharing their data to +preserve privacy. Recently, large-scale pre-trained models (e.g., Vision +Transformer) have shown a strong capability of deriving robust representations. +However, the data heterogeneity among clients, the limited computation +resources, and the communication bandwidth restrict the deployment of +large-scale models in FL frameworks. To leverage robust representations from +large-scale models while enabling efficient model personalization for +heterogeneous clients, we propose a novel personalized FL framework of +client-specific Prompt Generation (pFedPG), which learns to deploy a +personalized prompt generator at the server for producing client-specific +visual prompts that efficiently adapts frozen backbones to local data +distributions. Our proposed framework jointly optimizes the stages of +personalized prompt adaptation locally and personalized prompt generation +globally. The former aims to train visual prompts that adapt foundation models +to each client, while the latter observes local optimization directions to +generate personalized prompts for all clients. Through extensive experiments on +benchmark datasets, we show that our pFedPG is favorable against +state-of-the-art personalized FL methods under various types of data +heterogeneity, allowing computation and communication efficient model +personalization. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Heterogeneous Multi-Task Gaussian Cox Processes + + +
+ This paper presents a novel extension of multi-task Gaussian Cox processes +for modeling multiple heterogeneous correlated tasks jointly, e.g., +classification and regression, via multi-output Gaussian processes (MOGP). A +MOGP prior over the parameters of the dedicated likelihoods for classification, +regression and point process tasks can facilitate sharing of information +between heterogeneous tasks, while allowing for nonparametric parameter +estimation. To circumvent the non-conjugate Bayesian inference in the MOGP +modulated heterogeneous multi-task framework, we employ the data augmentation +technique and derive a mean-field approximation to realize closed-form +iterative updates for estimating model parameters. We demonstrate the +performance and inference on both 1D synthetic data as well as 2D urban data of +Vancouver. + +
+
+
+
+
+ + ☆ Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation + + +
+ Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL +task. However, the absence of a systematical benchmark inhibits the development +of designing effective, efficient and economic LLM-based Text-to-SQL solutions. +To address this challenge, in this paper, we first conduct a systematical and +extensive comparison over existing prompt engineering methods, including +question representation, example selection and example organization, and with +these experimental results, we elaborates their pros and cons. Based on these +findings, we propose a new integrated solution, named DAIL-SQL, which refreshes +the Spider leaderboard with 86.6% execution accuracy and sets a new bar. +Towards an efficient and economic LLM-based Text-to-SQL solution, we emphasize +the token efficiency in prompt engineering and compare the prior studies under +this metric. Additionally, we investigate open-source LLMs in in-context +learning, and further enhance their performance with task-specific supervised +fine-tuning. Our explorations highlight open-source LLMs' potential in +Text-to-SQL, as well as the advantages and disadvantages of the task-specific +supervised fine-tuning. We hope that our work provides a deeper understanding +of Text-to-SQL with LLMs, and inspire further investigations and broad +applications. + +
+
+ comment: We have released code on https://github.com/BeachWang/DAIL-SQL +
+
+
+
+
+ + ☆ Lie-Poisson Neural Networks (LPNets): Data-Based Computing of + Hamiltonian Systems with Symmetries + + +
+ An accurate data-based prediction of the long-term evolution of Hamiltonian +systems requires a network that preserves the appropriate structure under each +time step. Every Hamiltonian system contains two essential ingredients: the +Poisson bracket and the Hamiltonian. Hamiltonian systems with symmetries, whose +paradigm examples are the Lie-Poisson systems, have been shown to describe a +broad category of physical phenomena, from satellite motion to underwater +vehicles, fluids, geophysical applications, complex fluids, and plasma physics. +The Poisson bracket in these systems comes from the symmetries, while the +Hamiltonian comes from the underlying physics. We view the symmetry of the +system as primary, hence the Lie-Poisson bracket is known exactly, whereas the +Hamiltonian is regarded as coming from physics and is considered not known, or +known approximately. Using this approach, we develop a network based on +transformations that exactly preserve the Poisson bracket and the special +functions of the Lie-Poisson systems (Casimirs) to machine precision. We +present two flavors of such systems: one, where the parameters of +transformations are computed from data using a dense neural network (LPNets), +and another, where the composition of transformations is used as building +blocks (G-LPNets). We also show how to adapt these methods to a larger class of +Poisson brackets. We apply the resulting methods to several examples, such as +rigid body (satellite) motion, underwater vehicles, a particle in a magnetic +field, and others. The methods developed in this paper are important for the +construction of accurate data-based methods for simulating the long-term +dynamics of physical systems. + +
+
+ comment: 57 pages, 13 figures +
+
+
+
+
+ + ☆ Imperceptible Adversarial Attack on Deep Neural Networks from Image + Boundary + + +
+ Although Deep Neural Networks (DNNs), such as the convolutional neural +networks (CNN) and Vision Transformers (ViTs), have been successfully applied +in the field of computer vision, they are demonstrated to be vulnerable to +well-sought Adversarial Examples (AEs) that can easily fool the DNNs. The +research in AEs has been active, and many adversarial attacks and explanations +have been proposed since they were discovered in 2014. The mystery of the AE's +existence is still an open question, and many studies suggest that DNN training +algorithms have blind spots. The salient objects usually do not overlap with +boundaries; hence, the boundaries are not the DNN model's attention. +Nevertheless, recent studies show that the boundaries can dominate the behavior +of the DNN models. Hence, this study aims to look at the AEs from a different +perspective and proposes an imperceptible adversarial attack that systemically +attacks the input image boundary for finding the AEs. The experimental results +have shown that the proposed boundary attacking method effectively attacks six +CNN models and the ViT using only 32% of the input image content (from the +boundaries) with an average success rate (SR) of 95.2% and an average peak +signal-to-noise ratio of 41.37 dB. Correlation analyses are conducted, +including the relation between the adversarial boundary's width and the SR and +how the adversarial boundary changes the DNN model's attention. This paper's +discoveries can potentially advance the understanding of AEs and provide a +different perspective on how AEs can be constructed. + +
+
+
+
+
+ + ☆ Enhancing Robot Learning through Learned Human-Attention Feature Maps ICRA 2023 + + +
+ Robust and efficient learning remains a challenging problem in robotics, in +particular with complex visual inputs. Inspired by human attention mechanism, +with which we quickly process complex visual scenes and react to changes in the +environment, we think that embedding auxiliary information about focus point +into robot learning would enhance efficiency and robustness of the learning +process. In this paper, we propose a novel approach to model and emulate the +human attention with an approximate prediction model. We then leverage this +output and feed it as a structured auxiliary feature map into downstream +learning tasks. We validate this idea by learning a prediction model from +human-gaze recordings of manual driving in the real world. We test our approach +on two learning tasks - object detection and imitation learning. Our +experiments demonstrate that the inclusion of predicted human attention leads +to improved robustness of the trained models to out-of-distribution samples and +faster learning in low-data regime settings. Our work highlights the potential +of incorporating structured auxiliary information in representation learning +for robotics and opens up new avenues for research in this direction. All code +and data are available online. + +
+
+ comment: This work has been accepted for the RAP4Robots workshop at ICRA 2023 + in London +
+
+
+
+
+ + ☆ Occlusion-Aware Deep Convolutional Neural Network via Homogeneous + Tanh-transforms for Face Parsing + + +
+ Face parsing infers a pixel-wise label map for each semantic facial +component. Previous methods generally work well for uncovered faces, however +overlook the facial occlusion and ignore some contextual area outside a single +face, especially when facial occlusion has become a common situation during the +COVID-19 epidemic. Inspired by the illumination theory of image, we propose a +novel homogeneous tanh-transforms for image preprocessing, which made up of +four tanh-transforms, that fuse the central vision and the peripheral vision +together. Our proposed method addresses the dilemma of face parsing under +occlusion and compresses more information of surrounding context. Based on +homogeneous tanh-transforms, we propose an occlusion-aware convolutional neural +network for occluded face parsing. It combines the information both in +Tanh-polar space and Tanh-Cartesian space, capable of enhancing receptive +fields. Furthermore, we introduce an occlusion-aware loss to focus on the +boundaries of occluded regions. The network is simple and flexible, and can be +trained end-to-end. To facilitate future research of occluded face parsing, we +also contribute a new cleaned face parsing dataset, which is manually purified +from several academic or industrial datasets, including CelebAMask-HQ, +Short-video Face Parsing as well as Helen dataset and will make it public. +Experiments demonstrate that our method surpasses state-of-art methods of face +parsing under occlusion. + +
+
+
+
+
+ + ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their 'exposure bias' problem, described as the input mismatch between training +and sampling, lacks in-depth exploration. In this paper, we systematically +investigate the exposure bias problem in diffusion models by first analytically +modelling the sampling distribution, based on which we then attribute the +prediction error at each sampling step as the root cause of the exposure bias +issue. Furthermore, we discuss potential solutions to this issue and propose an +intuitive metric for it. Along with the elucidation of exposure bias, we +propose a simple, yet effective, training-free method called Epsilon Scaling to +alleviate the exposure bias. We show that Epsilon Scaling explicitly moves the +sampling trajectory closer to the vector field learned in the training phase by +scaling down the network output (Epsilon), mitigating the input mismatch +between training and sampling. Experiments on various diffusion frameworks +(ADM, DDPM/DDIM, LDM), unconditional and conditional settings, and +deterministic vs. stochastic sampling verify the effectiveness of our method. + +
+
+ comment: 7 pages, code available soon +
+
+
+
+
+ + ☆ On-Device Learning with Binary Neural Networks + + +
+ Existing Continual Learning (CL) solutions only partially address the +constraints on power, memory and computation of the deep learning models when +deployed on low-power embedded CPUs. In this paper, we propose a CL solution +that embraces the recent advancements in CL field and the efficiency of the +Binary Neural Networks (BNN), that use 1-bit for weights and activations to +efficiently execute deep learning models. We propose a hybrid quantization of +CWR* (an effective CL approach) that considers differently forward and backward +pass in order to retain more precision during gradient update step and at the +same time minimizing the latency overhead. The choice of a binary network as +backbone is essential to meet the constraints of low power devices and, to the +best of authors' knowledge, this is the first attempt to prove on-device +learning with BNN. The experimental validation carried out confirms the +validity and the suitability of the proposed method. + +
+
+
+
+
+ + ☆ Towards quantitative precision for ECG analysis: Leveraging state space + models, self-supervision and patient metadata + + +
+ Deep learning has emerged as the preferred modeling approach for automatic +ECG analysis. In this study, we investigate three elements aimed at improving +the quantitative accuracy of such systems. These components consistently +enhance performance beyond the existing state-of-the-art, which is +predominantly based on convolutional models. Firstly, we explore more +expressive architectures by exploiting structured state space models (SSMs). +These models have shown promise in capturing long-term dependencies in time +series data. By incorporating SSMs into our approach, we not only achieve +better performance, but also gain insights into long-standing questions in the +field. Specifically, for standard diagnostic tasks, we find no advantage in +using higher sampling rates such as 500Hz compared to 100Hz. Similarly, +extending the input size of the model beyond 3 seconds does not lead to +significant improvements. Secondly, we demonstrate that self-supervised +learning using contrastive predictive coding can further improve the +performance of SSMs. By leveraging self-supervision, we enable the model to +learn more robust and representative features, leading to improved analysis +accuracy. Lastly, we depart from synthetic benchmarking scenarios and +incorporate basic demographic metadata alongside the ECG signal as input. This +inclusion of patient metadata departs from the conventional practice of relying +solely on the signal itself. Remarkably, this addition consistently yields +positive effects on predictive performance. We firmly believe that all three +components should be considered when developing next-generation ECG analysis +algorithms. + +
+
+ comment: extended version of arXiv:2211.07579 +
+
+
+
+
+ + ☆ Structural Node Embeddings with Homomorphism Counts + + +
+ Graph homomorphism counts, first explored by Lov\'asz in 1967, have recently +garnered interest as a powerful tool in graph-based machine learning. Grohe +(PODS 2020) proposed the theoretical foundations for using homomorphism counts +in machine learning on graph level as well as node level tasks. By their very +nature, these capture local structural information, which enables the creation +of robust structural embeddings. While a first approach for graph level tasks +has been made by Nguyen and Maehara (ICML 2020), we experimentally show the +effectiveness of homomorphism count based node embeddings. Enriched with node +labels, node weights, and edge weights, these offer an interpretable +representation of graph data, allowing for enhanced explainability of machine +learning models. + We propose a theoretical framework for isomorphism-invariant homomorphism +count based embeddings which lend themselves to a wide variety of downstream +tasks. Our approach capitalises on the efficient computability of graph +homomorphism counts for bounded treewidth graph classes, rendering it a +practical solution for real-world applications. We demonstrate their +expressivity through experiments on benchmark datasets. Although our results do +not match the accuracy of state-of-the-art neural architectures, they are +comparable to other advanced graph learning models. Remarkably, our approach +demarcates itself by ensuring explainability for each individual feature. By +integrating interpretable machine learning algorithms like SVMs or Random +Forests, we establish a seamless, end-to-end explainable pipeline. Our study +contributes to the advancement of graph-based techniques that offer both +performance and interpretability. + +
+
+
+
+
+ + ☆ Let There Be Sound: Reconstructing High Quality Speech from Silent + Videos + + +
+ The goal of this work is to reconstruct high quality speech from lip motions +alone, a task also known as lip-to-speech. A key challenge of lip-to-speech +systems is the one-to-many mapping caused by (1) the existence of homophenes +and (2) multiple speech variations, resulting in a mispronounced and +over-smoothed speech. In this paper, we propose a novel lip-to-speech system +that significantly improves the generation quality by alleviating the +one-to-many mapping problem from multiple perspectives. Specifically, we +incorporate (1) self-supervised speech representations to disambiguate +homophenes, and (2) acoustic variance information to model diverse speech +styles. Additionally, to better solve the aforementioned problem, we employ a +flow based post-net which captures and refines the details of the generated +speech. We perform extensive experiments and demonstrate that our method +achieves the generation quality close to that of real human utterance, +outperforming existing methods in terms of speech naturalness and +intelligibility by a large margin. Synthesised samples are available at the +anonymous demo page: https://mm.kaist.ac.kr/projects/LTBS. + +
+
+ comment: 10 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ The Relative Gaussian Mechanism and its Application to Private Gradient + Descent + + +
+ The Gaussian Mechanism (GM), which consists in adding Gaussian noise to a +vector-valued query before releasing it, is a standard privacy protection +mechanism. In particular, given that the query respects some L2 sensitivity +property (the L2 distance between outputs on any two neighboring inputs is +bounded), GM guarantees R\'enyi Differential Privacy (RDP). Unfortunately, +precisely bounding the L2 sensitivity can be hard, thus leading to loose +privacy bounds. In this work, we consider a Relative L2 sensitivity assumption, +in which the bound on the distance between two query outputs may also depend on +their norm. Leveraging this assumption, we introduce the Relative Gaussian +Mechanism (RGM), in which the variance of the noise depends on the norm of the +output. We prove tight bounds on the RDP parameters under relative L2 +sensitivity, and characterize the privacy loss incurred by using +output-dependent noise. In particular, we show that RGM naturally adapts to a +latent variable that would control the norm of the output. Finally, we +instantiate our framework to show tight guarantees for Private Gradient +Descent, a problem that naturally fits our relative L2 sensitivity assumption. + +
+
+
+
+
+ + ☆ Reliability Gaps Between Groups in COMPAS Dataset + + +
+ This paper investigates the inter-rater reliability of risk assessment +instruments (RAIs). The main question is whether different, socially salient +groups are affected differently by a lack of inter-rater reliability of RAIs, +that is, whether mistakes with respect to different groups affects them +differently. The question is investigated with a simulation study of the COMPAS +dataset. A controlled degree of noise is injected into the input data of a +predictive model; the noise can be interpreted as a synthetic rater that makes +mistakes. The main finding is that there are systematic differences in output +reliability between groups in the COMPAS dataset. The sign of the difference +depends on the kind of inter-rater statistic that is used (Cohen's Kappa, +Byrt's PABAK, ICC), and in particular whether or not a correction of +predictions prevalences of the groups is used. + +
+
+ comment: 15 pages + appendix +
+
+
+
+
+ + ☆ Assessing Cyclostationary Malware Detection via Feature Selection and + Classification + + +
+ Cyclostationarity involves periodic statistical variations in signals and +processes, commonly used in signal analysis and network security. In the +context of attacks, cyclostationarity helps detect malicious behaviors within +network traffic, such as traffic patterns in Distributed Denial of Service +(DDoS) attacks or hidden communication channels in malware. This approach +enhances security by identifying abnormal patterns and informing Network +Intrusion Detection Systems (NIDSs) to recognize potential attacks, enhancing +protection against both known and novel threats. This research focuses on +identifying cyclostationary malware behavior and its detection. The main goal +is to pinpoint essential cyclostationary features used in NIDSs. These features +are extracted using algorithms such as Boruta and Principal Component Analysis +(PCA), and then categorized to find the most significant cyclostationary +patterns. The aim of this article is to reveal periodically changing malware +behaviors through cyclostationarity. The study highlights the importance of +spotting cyclostationary malware in NIDSs by using established datasets like +KDD99, NSL-KDD, and the UGRansome dataset. The UGRansome dataset is designed +for anomaly detection research and includes both normal and abnormal network +threat categories of zero-day attacks. A comparison is made using the Random +Forest (RF) and Support Vector Machine (SVM) algorithms, while also evaluating +the effectiveness of Boruta and PCA. The findings show that PCA is more +promising than using Boruta alone for extracting cyclostationary network +feature patterns. Additionally, the analysis identifies the internet protocol +as the most noticeable cyclostationary feature pattern used by malware. +Notably, the UGRansome dataset outperforms the KDD99 and NSL-KDD, achieving 99% +accuracy in signature malware detection using the RF algorithm and 98% with the +SVM. + +
+
+
+
+
+ + ☆ Classification-Aware Neural Topic Model Combined With Interpretable + Analysis -- For Conflict Classification + + +
+ A large number of conflict events are affecting the world all the time. In +order to analyse such conflict events effectively, this paper presents a +Classification-Aware Neural Topic Model (CANTM-IA) for Conflict Information +Classification and Topic Discovery. The model provides a reliable +interpretation of classification results and discovered topics by introducing +interpretability analysis. At the same time, interpretation is introduced into +the model architecture to improve the classification performance of the model +and to allow interpretation to focus further on the details of the data. +Finally, the model architecture is optimised to reduce the complexity of the +model. + +
+
+ comment: Accepted by RANLP 2023 +
+
+
+
+
+ + ☆ Providing Previously Unseen Users Fair Recommendations Using Variational + Autoencoders RecSys 2023 + + +
+ An emerging definition of fairness in machine learning requires that models +are oblivious to demographic user information, e.g., a user's gender or age +should not influence the model. Personalized recommender systems are +particularly prone to violating this definition through their explicit user +focus and user modelling. Explicit user modelling is also an aspect that makes +many recommender systems incapable of providing hitherto unseen users with +recommendations. We propose novel approaches for mitigating discrimination in +Variational Autoencoder-based recommender systems by limiting the encoding of +demographic information. The approaches are capable of, and evaluated on, +providing users that are not represented in the training data with fair +recommendations. + +
+
+ comment: Appearing in RecSys 2023 proceedings +
+
+
+
+
+ + ☆ Evaluating Explanation Methods for Multivariate Time Series + Classification ALT + + +
+ Multivariate time series classification is an important computational task +arising in applications where data is recorded over time and over multiple +channels. For example, a smartwatch can record the acceleration and orientation +of a person's motion, and these signals are recorded as multivariate time +series. We can classify this data to understand and predict human movement and +various properties such as fitness levels. In many applications classification +alone is not enough, we often need to classify but also understand what the +model learns (e.g., why was a prediction given, based on what information in +the data). The main focus of this paper is on analysing and evaluating +explanation methods tailored to Multivariate Time Series Classification (MTSC). +We focus on saliency-based explanation methods that can point out the most +relevant channels and time series points for the classification decision. We +analyse two popular and accurate multivariate time series classifiers, ROCKET +and dResNet, as well as two popular explanation methods, SHAP and dCAM. We +study these methods on 3 synthetic datasets and 2 real-world datasets and +provide a quantitative and qualitative analysis of the explanations provided. +We find that flattening the multivariate datasets by concatenating the channels +works as well as using multivariate classifiers directly and adaptations of +SHAP for MTSC work quite well. Additionally, we also find that the popular +synthetic datasets we used are not suitable for time series analysis. + +
+
+ comment: Accepted at AALTD '23 +
+
+
+
+
+ + ☆ Ensemble of Counterfactual Explainers + + +
+ In eXplainable Artificial Intelligence (XAI), several counterfactual +explainers have been proposed, each focusing on some desirable properties of +counterfactual instances: minimality, actionability, stability, diversity, +plausibility, discriminative power. We propose an ensemble of counterfactual +explainers that boosts weak explainers, which provide only a subset of such +properties, to a powerful method covering all of them. The ensemble runs weak +explainers on a sample of instances and of features, and it combines their +results by exploiting a diversity-driven selection function. The method is +model-agnostic and, through a wrapping approach based on autoencoders, it is +also data-agnostic. + +
+
+
+
+
+ + ☆ Is visual explanation with Grad-CAM more reliable for deeper neural + networks? a case study with automatic pneumothorax diagnosis + + +
+ While deep learning techniques have provided the state-of-the-art performance +in various clinical tasks, explainability regarding their decision-making +process can greatly enhance the credence of these methods for safer and quicker +clinical adoption. With high flexibility, Gradient-weighted Class Activation +Mapping (Grad-CAM) has been widely adopted to offer intuitive visual +interpretation of various deep learning models' reasoning processes in +computer-assisted diagnosis. However, despite the popularity of the technique, +there is still a lack of systematic study on Grad-CAM's performance on +different deep learning architectures. In this study, we investigate its +robustness and effectiveness across different popular deep learning models, +with a focus on the impact of the networks' depths and architecture types, by +using a case study of automatic pneumothorax diagnosis in X-ray scans. Our +results show that deeper neural networks do not necessarily contribute to a +strong improvement of pneumothorax diagnosis accuracy, and the effectiveness of +GradCAM also varies among different network architectures. + +
+
+
+
+
+ + ☆ ABS-SGD: A Delayed Synchronous Stochastic Gradient Descent Algorithm + with Adaptive Batch Size for Heterogeneous GPU Clusters + + +
+ As the size of models and datasets grows, it has become increasingly common +to train models in parallel. However, existing distributed stochastic gradient +descent (SGD) algorithms suffer from insufficient utilization of computational +resources and poor convergence in heterogeneous clusters. In this paper, we +propose a delayed synchronous SGD algorithm with adaptive batch size (ABS-SGD) +for heterogeneous GPU clusters. In ABS-SGD, workers perform global +synchronization to accumulate delayed gradients and use the accumulated delayed +gradients to update parameters. While workers are performing global +synchronization for delayed gradients, they perform the computation of the next +batch without specifying batch size in advance, which lasts until the next +global synchronization starts, realizing the full utilization of computational +resources. Since the gradient delay is only one iteration, the stale gradient +problem can be alleviated. We theoretically prove the convergence of ABS-SGD in +heterogeneous clusters. Extensive experiments in three types of heterogeneous +clusters demonstrate that ABS-SGD can make full use of computational resources +and accelerate model convergence: When training ResNet18 network with 4 +workers, ABS-SGD increases the convergence speed by 1.30x on average compared +with the best baseline algorithm. + +
+
+ comment: 15 pages, 3 figures +
+
+
+
+
+ + ☆ On the improvement of model-predictive controllers + + +
+ This article investigates synthetic model-predictive control (MPC) problems +to demonstrate that an increased precision of the internal prediction model +(PM) automatially entails an improvement of the controller as a whole. In +contrast to reinforcement learning (RL), MPC uses the PM to predict subsequent +states of the controlled system (CS), instead of directly recommending suitable +actions. To assess how the precision of the PM translates into the quality of +the model-predictive controller, we compare a DNN-based PM to the optimal +baseline PM for three well-known control problems of varying complexity. The +baseline PM achieves perfect accuracy by accessing the simulation of the CS +itself. Based on the obtained results, we argue that an improvement of the PM +will always improve the controller as a whole, without considering the impact +of other components such as action selection (which, in this article, relies on +evolutionary optimization). + +
+
+
+
+
+ + ☆ Uncertainty Aware Training to Improve Deep Learning Model Calibration + for Classification of Cardiac MR Images + + +
+ Quantifying uncertainty of predictions has been identified as one way to +develop more trustworthy artificial intelligence (AI) models beyond +conventional reporting of performance metrics. When considering their role in a +clinical decision support setting, AI classification models should ideally +avoid confident wrong predictions and maximise the confidence of correct +predictions. Models that do this are said to be well-calibrated with regard to +confidence. However, relatively little attention has been paid to how to +improve calibration when training these models, i.e., to make the training +strategy uncertainty-aware. In this work we evaluate three novel +uncertainty-aware training strategies comparing against two state-of-the-art +approaches. We analyse performance on two different clinical applications: +cardiac resynchronisation therapy (CRT) response prediction and coronary artery +disease (CAD) diagnosis from cardiac magnetic resonance (CMR) images. The +best-performing model in terms of both classification accuracy and the most +common calibration measure, expected calibration error (ECE) was the Confidence +Weight method, a novel approach that weights the loss of samples to explicitly +penalise confident incorrect predictions. The method reduced the ECE by 17% for +CRT response prediction and by 22% for CAD diagnosis when compared to a +baseline classifier in which no uncertainty-aware strategy was included. In +both applications, as well as reducing the ECE there was a slight increase in +accuracy from 69% to 70% and 70% to 72% for CRT response prediction and CAD +diagnosis respectively. However, our analysis showed a lack of consistency in +terms of optimal models when using different calibration measures. This +indicates the need for careful consideration of performance metrics when +training and selecting models for complex high-risk applications in healthcare. + +
+
+
+
+
+ + ☆ Biquality Learning: a Framework to Design Algorithms Dealing with + Closed-Set Distribution Shifts + + +
+ Training machine learning models from data with weak supervision and dataset +shifts is still challenging. Designing algorithms when these two situations +arise has not been explored much, and existing algorithms cannot always handle +the most complex distributional shifts. We think the biquality data setup is a +suitable framework for designing such algorithms. Biquality Learning assumes +that two datasets are available at training time: a trusted dataset sampled +from the distribution of interest and the untrusted dataset with dataset shifts +and weaknesses of supervision (aka distribution shifts). The trusted and +untrusted datasets available at training time make designing algorithms dealing +with any distribution shifts possible. We propose two methods, one inspired by +the label noise literature and another by the covariate shift literature for +biquality learning. We experiment with two novel methods to synthetically +introduce concept drift and class-conditional shifts in real-world datasets +across many of them. We opened some discussions and assessed that developing +biquality learning algorithms robust to distributional changes remains an +interesting problem for future research. + +
+
+
+
+
+ + ☆ Evaluation and Analysis of Hallucination in Large Vision-Language Models + + +
+ Large Vision-Language Models (LVLMs) have recently achieved remarkable +success. However, LVLMs are still plagued by the hallucination problem, which +limits the practicality in many scenarios. Hallucination refers to the +information of LVLMs' responses that does not exist in the visual input, which +poses potential risks of substantial consequences. There has been limited work +studying hallucination evaluation in LVLMs. In this paper, we propose +Hallucination Evaluation based on Large Language Models (HaELM), an LLM-based +hallucination evaluation framework. HaELM achieves an approximate 95% +performance comparable to ChatGPT and has additional advantages including low +cost, reproducibility, privacy preservation and local deployment. Leveraging +the HaELM, we evaluate the hallucination in current LVLMs. Furthermore, we +analyze the factors contributing to hallucination in LVLMs and offer helpful +suggestions to mitigate the hallucination problem. Our training data and human +annotation hallucination data will be made public soon. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Mixup-Augmented Meta-Learning for Sample-Efficient Fine-Tuning of + Protein Simulators + + +
+ Molecular dynamics simulations have emerged as a fundamental instrument for +studying biomolecules. At the same time, it is desirable to perform simulations +of a collection of particles under various conditions in which the molecules +can fluctuate. In this paper, we explore and adapt the soft prompt-based +learning method to molecular dynamics tasks. Our model can remarkably +generalize to unseen and out-of-distribution scenarios with limited training +data. While our work focuses on temperature as a test case, the versatility of +our approach allows for efficient simulation through any continuous dynamic +conditions, such as pressure and volumes. Our framework has two stages: 1) +Pre-trains with data mixing technique, augments molecular structure data and +temperature prompts, then applies a curriculum learning method by increasing +the ratio of them smoothly. 2) Meta-learning-based fine-tuning framework +improves sample-efficiency of fine-tuning process and gives the soft +prompt-tuning better initialization points. Comprehensive experiments reveal +that our framework excels in accuracy for in-domain data and demonstrates +strong generalization capabilities for unseen and out-of-distribution samples. + +
+
+
+
+
+ + ☆ Stochastic Graph Bandit Learning with Side-Observations + + +
+ In this paper, we investigate the stochastic contextual bandit with general +function space and graph feedback. We propose an algorithm that addresses this +problem by adapting to both the underlying graph structures and reward gaps. To +the best of our knowledge, our algorithm is the first to provide a +gap-dependent upper bound in this stochastic setting, bridging the research gap +left by the work in [35]. In comparison to [31,33,35], our method offers +improved regret upper bounds and does not require knowledge of graphical +quantities. We conduct numerical experiments to demonstrate the computational +efficiency and effectiveness of our approach in terms of regret upper bounds. +These findings highlight the significance of our algorithm in advancing the +field of stochastic contextual bandits with graph feedback, opening up avenues +for practical applications in various domains. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2010.03104 by other authors +
+
+
+
+
+ + ☆ How Faithful are Self-Explainable GNNs? + + +
+ Self-explainable deep neural networks are a recent class of models that can +output ante-hoc local explanations that are faithful to the model's reasoning, +and as such represent a step forward toward filling the gap between +expressiveness and interpretability. Self-explainable graph neural networks +(GNNs) aim at achieving the same in the context of graph data. This begs the +question: do these models fulfill their implicit guarantees in terms of +faithfulness? In this extended abstract, we analyze the faithfulness of several +self-explainable GNNs using different measures of faithfulness, identify +several limitations -- both in the models themselves and in the evaluation +metrics -- and outline possible ways forward. + +
+
+
+
+
+ + ☆ Group-Conditional Conformal Prediction via Quantile Regression + Calibration for Crop and Weed Classification + + +
+ As deep learning predictive models become an integral part of a large +spectrum of precision agricultural systems, a barrier to the adoption of such +automated solutions is the lack of user trust in these highly complex, opaque +and uncertain models. Indeed, deep neural networks are not equipped with any +explicit guarantees that can be used to certify the system's performance, +especially in highly varying uncontrolled environments such as the ones +typically faced in computer vision for agriculture.Fortunately, certain methods +developed in other communities can prove to be important for agricultural +applications. This article presents the conformal prediction framework that +provides valid statistical guarantees on the predictive performance of any +black box prediction machine, with almost no assumptions, applied to the +problem of deep visual classification of weeds and crops in real-world +conditions. The framework is exposed with a focus on its practical aspects and +special attention accorded to the Adaptive Prediction Sets (APS) approach that +delivers marginal guarantees on the model's coverage. Marginal results are then +shown to be insufficient to guarantee performance on all groups of individuals +in the population as characterized by their environmental and pedo-climatic +auxiliary data gathered during image acquisition.To tackle this shortcoming, +group-conditional conformal approaches are presented: the ''classical'' method +that consists of iteratively applying the APS procedure on all groups, and a +proposed elegant reformulation and implementation of the procedure using +quantile regression on group membership indicators. Empirical results showing +the validity of the proposed approach are presented and compared to the +marginal APS then discussed. + +
+
+
+
+
+ + ☆ Can We Rely on AI? + + +
+ Over the last decade, adversarial attack algorithms have revealed +instabilities in deep learning tools. These algorithms raise issues regarding +safety, reliability and interpretability in artificial intelligence; especially +in high risk settings. From a practical perspective, there has been a war of +escalation between those developing attack and defence strategies. At a more +theoretical level, researchers have also studied bigger picture questions +concerning the existence and computability of attacks. Here we give a brief +overview of the topic, focusing on aspects that are likely to be of interest to +researchers in applied and computational mathematics. + +
+
+
+
+
+ + ☆ Using deep learning for an automatic detection and classification of the + vascular bifurcations along the Circle of Willis + + +
+ Most of the intracranial aneurysms (ICA) occur on a specific portion of the +cerebral vascular tree named the Circle of Willis (CoW). More particularly, +they mainly arise onto fifteen of the major arterial bifurcations constituting +this circular structure. Hence, for an efficient and timely diagnosis it is +critical to develop some methods being able to accurately recognize each +Bifurcation of Interest (BoI). Indeed, an automatic extraction of the +bifurcations presenting the higher risk of developing an ICA would offer the +neuroradiologists a quick glance at the most alarming areas. Due to the recent +efforts on Artificial Intelligence, Deep Learning turned out to be the best +performing technology for many pattern recognition tasks. Moreover, various +methods have been particularly designed for medical image analysis purposes. +This study intends to assist the neuroradiologists to promptly locate any +bifurcation presenting a high risk of ICA occurrence. It can be seen as a +Computer Aided Diagnosis scheme, where the Artificial Intelligence facilitates +the access to the regions of interest within the MRI. In this work, we propose +a method for a fully automatic detection and recognition of the bifurcations of +interest forming the Circle of Willis. Several neural networks architectures +have been tested, and we thoroughly evaluate the bifurcation recognition rate. + +
+
+
+
+
+ + ☆ Exploring Model Transferability through the Lens of Potential Energy ICCV 2023 + + +
+ Transfer learning has become crucial in computer vision tasks due to the vast +availability of pre-trained deep learning models. However, selecting the +optimal pre-trained model from a diverse pool for a specific downstream task +remains a challenge. Existing methods for measuring the transferability of +pre-trained models rely on statistical correlations between encoded static +features and task labels, but they overlook the impact of underlying +representation dynamics during fine-tuning, leading to unreliable results, +especially for self-supervised models. In this paper, we present an insightful +physics-inspired approach named PED to address these challenges. We reframe the +challenge of model selection through the lens of potential energy and directly +model the interaction forces that influence fine-tuning dynamics. By capturing +the motion of dynamic representations to decline the potential energy within a +force-driven physical model, we can acquire an enhanced and more stable +observation for estimating transferability. The experimental results on 10 +downstream tasks and 12 self-supervised models demonstrate that our approach +can seamlessly integrate into existing ranking techniques and enhance their +performances, revealing its effectiveness for the model selection task and its +potential for understanding the mechanism in transfer learning. Code will be +available at https://github.com/lixiaotong97/PED. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Advancing Adversarial Robustness Through Adversarial Logit Update + + +
+ Deep Neural Networks are susceptible to adversarial perturbations. +Adversarial training and adversarial purification are among the most widely +recognized defense strategies. Although these methods have different underlying +logic, both rely on absolute logit values to generate label predictions. In +this study, we theoretically analyze the logit difference around successful +adversarial attacks from a theoretical point of view and propose a new +principle, namely Adversarial Logit Update (ALU), to infer adversarial sample's +labels. Based on ALU, we introduce a new classification paradigm that utilizes +pre- and post-purification logit differences for model's adversarial robustness +boost. Without requiring adversarial or additional data for model training, our +clean data synthesis model can be easily applied to various pre-trained models +for both adversarial sample detection and ALU-based data classification. +Extensive experiments on both CIFAR-10, CIFAR-100, and tiny-ImageNet datasets +show that even with simple components, the proposed solution achieves superior +robustness performance compared to state-of-the-art methods against a wide +range of adversarial attacks. Our python implementation is submitted in our +Supplementary document and will be published upon the paper's acceptance. + +
+
+
+
+
+ + ☆ MadSGM: Multivariate Anomaly Detection with Score-based Generative + Models + + +
+ The time-series anomaly detection is one of the most fundamental tasks for +time-series. Unlike the time-series forecasting and classification, the +time-series anomaly detection typically requires unsupervised (or +self-supervised) training since collecting and labeling anomalous observations +are difficult. In addition, most existing methods resort to limited forms of +anomaly measurements and therefore, it is not clear whether they are optimal in +all circumstances. To this end, we present a multivariate time-series anomaly +detector based on score-based generative models, called MadSGM, which considers +the broadest ever set of anomaly measurement factors: i) reconstruction-based, +ii) density-based, and iii) gradient-based anomaly measurements. We also design +a conditional score network and its denoising score matching loss for the +time-series anomaly detection. Experiments on five real-world benchmark +datasets illustrate that MadSGM achieves the most robust and accurate +predictions. + +
+
+
+
+
+ + ☆ OEBench: Investigating Open Environment Challenges in Real-World + Relational Data Streams + + +
+ Relational datasets are widespread in real-world scenarios and are usually +delivered in a streaming fashion. This type of data stream can present unique +challenges, such as distribution drifts, outliers, emerging classes, and +changing features, which have recently been described as open environment +challenges for machine learning. While some work has been done on incremental +learning for data streams, their evaluations are mostly conducted with manually +partitioned datasets. Moreover, while several real-world streaming datasets are +available, it is uncertain whether these open environment challenges are +prevalent and how existing incremental learning algorithms perform on real +datasets. To fill this gap, we develop an Open Environment Benchmark named +OEBench to evaluate open environment challenges in relational data streams. +Specifically, we investigate 55 real-world streaming datasets and establish +that open environment scenarios are indeed widespread in real-world datasets, +which presents significant challenges for stream learning algorithms. Through +benchmarks, we find that increased data quantity may not consistently enhance +the model accuracy when applied in open environment scenarios, where machine +learning models can be significantly compromised by distribution shifts, +anomalies, or untrustworthy data within real-world data streams. The current +techniques are insufficient in effectively mitigating these challenges posed by +open environments. Thus, it is promising to conduct more researches to address +real-world new challenges of open environment scenarios. + +
+
+
+
+
+ + ☆ Taxonomic Loss for Morphological Glossing of Low-Resource Languages + + +
+ Morpheme glossing is a critical task in automated language documentation and +can benefit other downstream applications greatly. While state-of-the-art +glossing systems perform very well for languages with large amounts of existing +data, it is more difficult to create useful models for low-resource languages. +In this paper, we propose the use of a taxonomic loss function that exploits +morphological information to make morphological glossing more performant when +data is scarce. We find that while the use of this loss function does not +outperform a standard loss function with regards to single-label prediction +accuracy, it produces better predictions when considering the top-n predicted +labels. We suggest this property makes the taxonomic loss function useful in a +human-in-the-loop annotation setting. + +
+
+
+
+
+ + ☆ iBARLE: imBalance-Aware Room Layout Estimation + + +
+ Room layout estimation predicts layouts from a single panorama. It requires +datasets with large-scale and diverse room shapes to train the models. However, +there are significant imbalances in real-world datasets including the +dimensions of layout complexity, camera locations, and variation in scene +appearance. These issues considerably influence the model training performance. +In this work, we propose the imBalance-Aware Room Layout Estimation (iBARLE) +framework to address these issues. iBARLE consists of (1) Appearance Variation +Generation (AVG) module, which promotes visual appearance domain +generalization, (2) Complex Structure Mix-up (CSMix) module, which enhances +generalizability w.r.t. room structure, and (3) a gradient-based layout +objective function, which allows more effective accounting for occlusions in +complex layouts. All modules are jointly trained and help each other to achieve +the best performance. Experiments and ablation studies based on +ZInD~\cite{cruz2021zillow} dataset illustrate that iBARLE has state-of-the-art +performance compared with other layout estimation baselines. + +
+
+
+
+
+ + ☆ Large language models converge toward human-like concept organization + + +
+ Large language models show human-like performance in knowledge extraction, +reasoning and dialogue, but it remains controversial whether this performance +is best explained by memorization and pattern matching, or whether it reflects +human-like inferential semantics and world knowledge. Knowledge bases such as +WikiData provide large-scale, high-quality representations of inferential +semantics and world knowledge. We show that large language models learn to +organize concepts in ways that are strikingly similar to how concepts are +organized in such knowledge bases. Knowledge bases model collective, +institutional knowledge, and large language models seem to induce such +knowledge from raw text. We show that bigger and better models exhibit more +human-like concept organization, across four families of language models and +three knowledge graph embeddings. + +
+
+
+
+
+ + ☆ Massively Parallel Continuous Local Search for Hybrid SAT Solving on + GPUs + + +
+ Although state-of-the-art (SOTA) SAT solvers based on conflict-driven clause +learning (CDCL) have achieved remarkable engineering success, their sequential +nature limits the parallelism that may be extracted for acceleration on +platforms such as the graphics processing unit (GPU). In this work, we propose +FastFourierSAT, a highly parallel hybrid SAT solver based on gradient-driven +continuous local search (CLS). This is realized by a novel parallel algorithm +inspired by the Fast Fourier Transform (FFT)-based convolution for computing +the elementary symmetric polynomials (ESPs), which is the major computational +task in previous CLS methods. The complexity of our algorithm matches the best +previous result. Furthermore, the substantial parallelism inherent in our +algorithm can leverage the GPU for acceleration, demonstrating significant +improvement over the previous CLS approaches. We also propose to incorporate +the restart heuristics in CLS to improve search efficiency. We compare our +approach with the SOTA parallel SAT solvers on several benchmarks. Our results +show that FastFourierSAT computes the gradient 100+ times faster than previous +prototypes implemented on CPU. Moreover, FastFourierSAT solves most instances +and demonstrates promising performance on larger-size instances. + +
+
+
+
+
+ + ☆ Exploiting Problem Geometry in Safe Linear Bandits + + +
+ The safe linear bandit problem is a version of the classic linear bandit +problem where the learner's actions must satisfy an uncertain linear constraint +at all rounds. Due its applicability to many real-world settings, this problem +has received considerable attention in recent years. We find that by exploiting +the geometry of the specific problem setting, we can achieve improved regret +guarantees for both well-separated problem instances and action sets that are +finite star convex sets. Additionally, we propose a novel algorithm for this +setting that chooses problem parameters adaptively and enjoys at least as good +regret guarantees as existing algorithms. Lastly, we introduce a generalization +of the safe linear bandit setting where the constraints are convex and adapt +our algorithms and analyses to this setting by leveraging a novel +convex-analysis based approach. Simulation results show improved performance +over existing algorithms for a variety of randomly sampled settings. + +
+
+ comment: 38 pages, 4 figures +
+
+
+
+
+ + ☆ WSAM: Visual Explanations from Style Augmentation as Adversarial + Attacker and Their Influence in Image Classification + + +
+ Currently, style augmentation is capturing attention due to convolutional +neural networks (CNN) being strongly biased toward recognizing textures rather +than shapes. Most existing styling methods either perform a low-fidelity style +transfer or a weak style representation in the embedding vector. This paper +outlines a style augmentation algorithm using stochastic-based sampling with +noise addition to improving randomization on a general linear transformation +for style transfer. With our augmentation strategy, all models not only present +incredible robustness against image stylizing but also outperform all previous +methods and surpass the state-of-the-art performance for the STL-10 dataset. In +addition, we present an analysis of the model interpretations under different +style variations. At the same time, we compare comprehensive experiments +demonstrating the performance when applied to deep neural architectures in +training settings. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Incorporating Neuro-Inspired Adaptability for Continual Learning in + Artificial Intelligence + + +
+ Continual learning aims to empower artificial intelligence (AI) with strong +adaptability to the real world. For this purpose, a desirable solution should +properly balance memory stability with learning plasticity, and acquire +sufficient compatibility to capture the observed distributions. Existing +advances mainly focus on preserving memory stability to overcome catastrophic +forgetting, but remain difficult to flexibly accommodate incremental changes as +biological intelligence (BI) does. By modeling a robust Drosophila learning +system that actively regulates forgetting with multiple learning modules, here +we propose a generic approach that appropriately attenuates old memories in +parameter distributions to improve learning plasticity, and accordingly +coordinates a multi-learner architecture to ensure solution compatibility. +Through extensive theoretical and empirical validation, our approach not only +clearly enhances the performance of continual learning, especially over +synaptic regularization methods in task-incremental settings, but also +potentially advances the understanding of neurological adaptive mechanisms, +serving as a novel paradigm to progress AI and BI together. + +
+
+
+
+
+ + ☆ Constructive Incremental Learning for Fault Diagnosis of Rolling + Bearings with Ensemble Domain Adaptation + + +
+ Given the prevalence of rolling bearing fault diagnosis as a practical issue +across various working conditions, the limited availability of samples +compounds the challenge. Additionally, the complexity of the external +environment and the structure of rolling bearings often manifests faults +characterized by randomness and fuzziness, hindering the effective extraction +of fault characteristics and restricting the accuracy of fault diagnosis. To +overcome these problems, this paper presents a novel approach termed +constructive Incremental learning-based ensemble domain adaptation (CIL-EDA) +approach. Specifically, it is implemented on stochastic configuration networks +(SCN) to constructively improve its adaptive performance in multi-domains. +Concretely, a cloud feature extraction method is employed in conjunction with +wavelet packet decomposition (WPD) to capture the uncertainty of fault +information from multiple resolution aspects. Subsequently, constructive +Incremental learning-based domain adaptation (CIL-DA) is firstly developed to +enhance the cross-domain learning capability of each hidden node through domain +matching and construct a robust fault classifier by leveraging limited labeled +data from both target and source domains. Finally, fault diagnosis results are +obtained by a majority voting of CIL-EDA which integrates CIL-DA and parallel +ensemble learning. Experimental results demonstrate that our CIL-DA outperforms +several domain adaptation methods and CIL-EDA consistently outperforms +state-of-art fault diagnosis methods in few-shot scenarios. + +
+
+
+
+
+ + ☆ Sub-universal variational circuits for combinatorial optimization + problems + + +
+ Quantum variational circuits have gained significant attention due to their +applications in the quantum approximate optimization algorithm and quantum +machine learning research. This work introduces a novel class of classical +probabilistic circuits designed for generating approximate solutions to +combinatorial optimization problems constructed using two-bit stochastic +matrices. Through a numerical study, we investigate the performance of our +proposed variational circuits in solving the Max-Cut problem on various graphs +of increasing sizes. Our classical algorithm demonstrates improved performance +for several graph types to the quantum approximate optimization algorithm. Our +findings suggest that evaluating the performance of quantum variational +circuits against variational circuits with sub-universal gate sets is a +valuable benchmark for identifying areas where quantum variational circuits can +excel. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Efficient labeling of solar flux evolution videos by a deep learning + model + + +
+ Machine learning (ML) is becoming a critical tool for interrogation of large +complex data. Labeling, defined as the process of adding meaningful +annotations, is a crucial step of supervised ML. However, labeling datasets is +time consuming. Here we show that convolutional neural networks (CNNs), trained +on crudely labeled astronomical videos, can be leveraged to improve the quality +of data labeling and reduce the need for human intervention. We use videos of +the solar magnetic field, crudely labeled into two classes: emergence or +non-emergence of bipolar magnetic regions (BMRs), based on their first +detection on the solar disk. We train CNNs using crude labels, manually verify, +correct labeling vs. CNN disagreements, and repeat this process until +convergence. Traditionally, flux emergence labelling is done manually. We find +that a high-quality labeled dataset, derived through this iterative process, +reduces the necessary manual verification by 50%. Furthermore, by gradually +masking the videos and looking for maximum change in CNN inference, we locate +BMR emergence time without retraining the CNN. This demonstrates the +versatility of CNNs for simplifying the challenging task of labeling complex +dynamic events. + +
+
+ comment: 16 pages, 7 figures, published in Nature Astronomy, June 27, 2022 +
+
+
+
+
+ + ☆ Distributed multi-agent target search and tracking with Gaussian process + and reinforcement learning + + +
+ Deploying multiple robots for target search and tracking has many practical +applications, yet the challenge of planning over unknown or partially known +targets remains difficult to address. With recent advances in deep learning, +intelligent control techniques such as reinforcement learning have enabled +agents to learn autonomously from environment interactions with little to no +prior knowledge. Such methods can address the exploration-exploitation tradeoff +of planning over unknown targets in a data-driven manner, eliminating the +reliance on heuristics typical of traditional approaches and streamlining the +decision-making pipeline with end-to-end training. In this paper, we propose a +multi-agent reinforcement learning technique with target map building based on +distributed Gaussian process. We leverage the distributed Gaussian process to +encode belief over the target locations and efficiently plan over unknown +targets. We evaluate the performance and transferability of the trained policy +in simulation and demonstrate the method on a swarm of micro unmanned aerial +vehicles with hardware experiments. + +
+
+ comment: 10 pages, 6 figures; preprint submitted to IJCAS; first two authors + contributed equally +
+
+
+
+
+ + ☆ Reprogramming under constraints: Revisiting efficient and reliable + transferability of lottery tickets + + +
+ In the era of foundation models with huge pre-training budgets, the +downstream tasks have been shifted to the narrative of efficient and fast +adaptation. For classification-based tasks in the domain of computer vision, +the two most efficient approaches have been linear probing (LP) and visual +prompting/reprogramming (VP); the former aims to learn a classifier in the form +of a linear head on the features extracted by the pre-trained model, while the +latter maps the input data to the domain of the source data on which the model +was originally pre-trained on. Although extensive studies have demonstrated the +differences between LP and VP in terms of downstream performance, we explore +the capabilities of the two aforementioned methods via the sparsity axis: (a) +Data sparsity: the impact of few-shot adaptation and (b) Model sparsity: the +impact of lottery tickets (LT). We demonstrate that LT are not universal +reprogrammers, i.e., for certain target datasets, reprogramming an LT yields +significantly lower performance than the reprogrammed dense model although +their corresponding upstream performance is similar. Further, we demonstrate +that the calibration of dense models is always superior to that of their +lottery ticket counterparts under both LP and VP regimes. Our empirical study +opens a new avenue of research into VP for sparse models and encourages further +understanding of the performance beyond the accuracy achieved by VP under +constraints of sparsity. Code and logs can be accessed at +\url{https://github.com/landskape-ai/Reprogram_LT}. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Streaming Compression of Scientific Data via weak-SINDy + + +
+ In this paper a streaming weak-SINDy algorithm is developed specifically for +compressing streaming scientific data. The production of scientific data, +either via simulation or experiments, is undergoing an stage of exponential +growth, which makes data compression important and often necessary for storing +and utilizing large scientific data sets. As opposed to classical ``offline" +compression algorithms that perform compression on a readily available data +set, streaming compression algorithms compress data ``online" while the data +generated from simulation or experiments is still flowing through the system. +This feature makes streaming compression algorithms well-suited for scientific +data compression, where storing the full data set offline is often infeasible. +This work proposes a new streaming compression algorithm, streaming weak-SINDy, +which takes advantage of the underlying data characteristics during +compression. The streaming weak-SINDy algorithm constructs feature matrices and +target vectors in the online stage via a streaming integration method in a +memory efficient manner. The feature matrices and target vectors are then used +in the offline stage to build a model through a regression process that aims to +recover equations that govern the evolution of the data. For compressing +high-dimensional streaming data, we adopt a streaming proper orthogonal +decomposition (POD) process to reduce the data dimension and then use the +streaming weak-SINDy algorithm to compress the temporal data of the POD +expansion. We propose modifications to the streaming weak-SINDy algorithm to +accommodate the dynamically updated POD basis. By combining the built model +from the streaming weak-SINDy algorithm and a small amount of data samples, the +full data flow could be reconstructed accurately at a low memory cost, as shown +in the numerical tests. + +
+
+
+
+
+ + ☆ Robust Open-Set Spoken Language Identification and the CU MultiLang + Dataset + + +
+ Most state-of-the-art spoken language identification models are closed-set; +in other words, they can only output a language label from the set of classes +they were trained on. Open-set spoken language identification systems, however, +gain the ability to detect when an input exhibits none of the original +languages. In this paper, we implement a novel approach to open-set spoken +language identification that uses MFCC and pitch features, a TDNN model to +extract meaningful feature embeddings, confidence thresholding on softmax +outputs, and LDA and pLDA for learning to classify new unknown languages. We +present a spoken language identification system that achieves 91.76% accuracy +on trained languages and has the capability to adapt to unknown languages on +the fly. To that end, we also built the CU MultiLang Dataset, a large and +diverse multilingual speech corpus which was used to train and evaluate our +system. + +
+
+ comment: 6pages, 1 table, 6 figures +
+
+
+
+
+ + ☆ Low-bit Quantization for Deep Graph Neural Networks with + Smoothness-aware Message Propagation CIKM2023 + + +
+ Graph Neural Network (GNN) training and inference involve significant +challenges of scalability with respect to both model sizes and number of +layers, resulting in degradation of efficiency and accuracy for large and deep +GNNs. We present an end-to-end solution that aims to address these challenges +for efficient GNNs in resource constrained environments while avoiding the +oversmoothing problem in deep GNNs. We introduce a quantization based approach +for all stages of GNNs, from message passing in training to node +classification, compressing the model and enabling efficient processing. The +proposed GNN quantizer learns quantization ranges and reduces the model size +with comparable accuracy even under low-bit quantization. To scale with the +number of layers, we devise a message propagation mechanism in training that +controls layer-wise changes of similarities between neighboring nodes. This +objective is incorporated into a Lagrangian function with constraints and a +differential multiplier method is utilized to iteratively find optimal +embeddings. This mitigates oversmoothing and suppresses the quantization error +to a bound. Significant improvements are demonstrated over state-of-the-art +quantization methods and deep GNN approaches in both full-precision and +quantized models. The proposed quantizer demonstrates superior performance in +INT2 configurations across all stages of GNN, achieving a notable level of +accuracy. In contrast, existing quantization approaches fail to generate +satisfactory accuracy levels. Finally, the inference with INT2 and INT4 +representations exhibits a speedup of 5.11 $\times$ and 4.70 $\times$ compared +to full precision counterparts, respectively. + +
+
+ comment: To appear in CIKM2023 +
+
+
+
+
+ + ☆ Improving Reinforcement Learning Training Regimes for Social Robot + Navigation + + +
+ In order for autonomous mobile robots to navigate in human spaces, they must +abide by our social norms. Reinforcement learning (RL) has emerged as an +effective method to train robot navigation policies that are able to respect +these norms. However, a large portion of existing work in the field conducts +both RL training and testing in simplistic environments. This limits the +generalization potential of these models to unseen environments, and the +meaningfulness of their reported results. We propose a method to improve the +generalization performance of RL social navigation methods using curriculum +learning. By employing multiple environment types and by modeling pedestrians +using multiple dynamics models, we are able to progressively diversify and +escalate difficulty in training. Our results show that the use of curriculum +learning in training can be used to achieve better generalization performance +than previous training methods. We also show that results presented in many +existing state-of-the art RL social navigation works do not evaluate their +methods outside of their training environments, and thus do not reflect their +policies' failure to adequately generalize to out-of-distribution scenarios. In +response, we validate our training approach on larger and more crowded testing +environments than those used in training, allowing for more meaningful +measurements of model performance. + +
+
+
+
+
+ + ☆ Bridging Distribution Learning and Image Clustering in High-dimensional + Space + + +
+ Distribution learning focuses on learning the probability density function +from a set of data samples. In contrast, clustering aims to group similar +objects together in an unsupervised manner. Usually, these two tasks are +considered unrelated. However, the relationship between the two may be +indirectly correlated, with Gaussian Mixture Models (GMM) acting as a bridge. +In this paper, we focus on exploring the correlation between distribution +learning and clustering, with the motivation to fill the gap between these two +fields, utilizing an autoencoder (AE) to encode images into a high-dimensional +latent space. Then, Monte-Carlo Marginalization (MCMarg) and Kullback-Leibler +(KL) divergence loss are used to fit the Gaussian components of the GMM and +learn the data distribution. Finally, image clustering is achieved through each +Gaussian component of GMM. Yet, the "curse of dimensionality" poses severe +challenges for most clustering algorithms. Compared with the classic +Expectation-Maximization (EM) Algorithm, experimental results show that MCMarg +and KL divergence can greatly alleviate the difficulty. Based on the +experimental results, we believe distribution learning can exploit the +potential of GMM in image clustering within high-dimensional space. + +
+
+
+
+
+ + ☆ Deep Reinforcement Learning Based Framework for Mobile Energy + Disseminator Dispatching to Charge On-the-Road Electric Vehicles + + +
+ The exponential growth of electric vehicles (EVs) presents novel challenges +in preserving battery health and in addressing the persistent problem of +vehicle range anxiety. To address these concerns, wireless charging, +particularly, Mobile Energy Disseminators (MEDs) have emerged as a promising +solution. The MED is mounted behind a large vehicle and charges all +participating EVs within a radius upstream of it. Unfortuantely, during such +V2V charging, the MED and EVs inadvertently form platoons, thereby occupying +multiple lanes and impairing overall corridor travel efficiency. In addition, +constrained budgets for MED deployment necessitate the development of an +effective dispatching strategy to determine optimal timing and locations for +introducing the MEDs into traffic. This paper proposes a deep reinforcement +learning (DRL) based methodology to develop a vehicle dispatching framework. In +the first component of the framework, we develop a realistic reinforcement +learning environment termed "ChargingEnv" which incorporates a reliable +charging simulation system that accounts for common practical issues in +wireless charging deployment, specifically, the charging panel misalignment. +The second component, the Proximal-Policy Optimization (PPO) agent, is trained +to control MED dispatching through continuous interactions with ChargingEnv. +Numerical experiments were carried out to demonstrate the demonstrate the +efficacy of the proposed MED deployment decision processor. The experiment +results suggest that the proposed model can significantly enhance EV travel +range while efficiently deploying a optimal number of MEDs. The proposed model +is found to be not only practical in its applicability but also has promises of +real-world effectiveness. The proposed model can help travelers to maximize EV +range and help road agencies or private-sector vendors to manage the deployment +of MEDs efficiently. + +
+
+ comment: Submitted for presentation only at the 2024 Annual Meeting of the + Transportation Research Board +
+
+
+
+
+ + ☆ Ensuring User-side Fairness in Dynamic Recommender Systems + + +
+ User-side group fairness is crucial for modern recommender systems, as it +aims to alleviate performance disparity between groups of users defined by +sensitive attributes such as gender, race, or age. We find that the disparity +tends to persist or even increase over time. This calls for effective ways to +address user-side fairness in a dynamic environment, which has been +infrequently explored in the literature. However, fairness-constrained +re-ranking, a typical method to ensure user-side fairness (i.e., reducing +performance disparity), faces two fundamental challenges in the dynamic +setting: (1) non-differentiability of the ranking-based fairness constraint, +which hinders the end-to-end training paradigm, and (2) time-inefficiency, +which impedes quick adaptation to changes in user preferences. In this paper, +we propose FAir Dynamic rEcommender (FADE), an end-to-end framework with +fine-tuning strategy to dynamically alleviate performance disparity. To tackle +the above challenges, FADE uses a novel fairness loss designed to be +differentiable and lightweight to fine-tune model parameters to ensure both +user-side fairness and high-quality recommendations. Via extensive experiments +on the real-world dataset, we empirically demonstrate that FADE effectively and +efficiently reduces performance disparity, and furthermore, FADE improves +overall recommendation quality over time compared to not using any new data. + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ A General Recipe for Automated Machine Learning in Practice + + +
+ Automated Machine Learning (AutoML) is an area of research that focuses on +developing methods to generate machine learning models automatically. The idea +of being able to build machine learning models with very little human +intervention represents a great opportunity for the practice of applied machine +learning. However, there is very little information on how to design an AutoML +system in practice. Most of the research focuses on the problems facing +optimization algorithms and leaves out the details of how that would be done in +practice. In this paper, we propose a frame of reference for building general +AutoML systems. Through a narrative review of the main approaches in the area, +our main idea is to distill the fundamental concepts in order to support them +in a single design. Finally, we discuss some open problems related to the +application of AutoML for future research. + +
+
+
+
+
+ + ☆ Clustering Without an Eigengap + + +
+ We study graph clustering in the Stochastic Block Model (SBM) in the presence +of both large clusters and small, unrecoverable clusters. Previous approaches +achieving exact recovery do not allow any small clusters of size $o(\sqrt{n})$, +or require a size gap between the smallest recovered cluster and the largest +non-recovered cluster. We provide an algorithm based on semidefinite +programming (SDP) which removes these requirements and provably recovers large +clusters regardless of the remaining cluster sizes. Mid-sized clusters pose +unique challenges to the analysis, since their proximity to the recovery +threshold makes them highly sensitive to small noise perturbations and +precludes a closed-form candidate solution. We develop novel techniques, +including a leave-one-out-style argument which controls the correlation between +SDP solutions and noise vectors even when the removal of one row of noise can +drastically change the SDP solution. We also develop improved eigenvalue +perturbation bounds of potential independent interest. Using our gap-free +clustering procedure, we obtain efficient algorithms for the problem of +clustering with a faulty oracle with superior query complexities, notably +achieving $o(n^2)$ sample complexity even in the presence of a large number of +small clusters. Our gap-free clustering procedure also leads to improved +algorithms for recursive clustering. Our results extend to certain +heterogeneous probability settings that are challenging for alternative +algorithms. + +
+
+ comment: 68 pages, 1 figure +
+
+
+
+
+ + ☆ Identifying Constitutive Parameters for Complex Hyperelastic Solids + using Physics-Informed Neural Networks + + +
+ Identifying constitutive parameters in engineering and biological materials, +particularly those with intricate geometries and mechanical behaviors, remains +a longstanding challenge. The recent advent of Physics-Informed Neural Networks +(PINNs) offers promising solutions, but current frameworks are often limited to +basic constitutive laws and encounter practical constraints when combined with +experimental data. In this paper, we introduce a new PINN-based framework +designed to identify material parameters for soft materials, specifically those +exhibiting complex constitutive behaviors, under large deformation in plane +stress conditions. Distinctively, our model emphasizes training PINNs with +multi-modal time-dependent experimental datasets consisting of full-field +deformation and loading history, ensuring algorithm robustness even amidst +noisy data. Our results reveal that our framework can accurately identify +constitutive parameters of the incompressible Arruda-Boyce model for samples +with intricate geometries, maintaining an error below 5%, even with an +experimental noise level of 5%. We believe our framework sets the stage for a +transformative approach in modulus identification for complex solids, +especially for those with geometrical and constitutive intricate. + +
+
+ comment: 31 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ Hyperbolic Convolutional Neural Networks + + +
+ Deep Learning is mostly responsible for the surge of interest in Artificial +Intelligence in the last decade. So far, deep learning researchers have been +particularly successful in the domain of image processing, where Convolutional +Neural Networks are used. Although excelling at image classification, +Convolutional Neural Networks are quite naive in that no inductive bias is set +on the embedding space for images. Similar flaws are also exhibited by another +type of Convolutional Networks - Graph Convolutional Neural Networks. However, +using non-Euclidean space for embedding data might result in more robust and +explainable models. One example of such a non-Euclidean space is hyperbolic +space. Hyperbolic spaces are particularly useful due to their ability to fit +more data in a low-dimensional space and tree-likeliness properties. These +attractive properties have been previously used in multiple papers which +indicated that they are beneficial for building hierarchical embeddings using +shallow models and, recently, using MLPs and RNNs. + However, no papers have yet suggested a general approach to using Hyperbolic +Convolutional Neural Networks for structured data processing, although these +are the most common examples of data used. Therefore, the goal of this work is +to devise a general recipe for building Hyperbolic Convolutional Neural +Networks. We hypothesize that ability of hyperbolic space to capture hierarchy +in the data would lead to better performance. This ability should be +particularly useful in cases where data has a tree-like structure. Since this +is the case for many existing datasets \citep{wordnet, imagenet, fb15k}, we +argue that such a model would be advantageous both in terms of applications and +future research prospects. + +
+
+
+
+
+ + ☆ RACR-MIL: Weakly Supervised Skin Cancer Grading using Rank-Aware + Contextual Reasoning on Whole Slide Images AAAI + + +
+ Cutaneous squamous cell cancer (cSCC) is the second most common skin cancer +in the US. It is diagnosed by manual multi-class tumor grading using a tissue +whole slide image (WSI), which is subjective and suffers from inter-pathologist +variability. We propose an automated weakly-supervised grading approach for +cSCC WSIs that is trained using WSI-level grade and does not require +fine-grained tumor annotations. The proposed model, RACR-MIL, transforms each +WSI into a bag of tiled patches and leverages attention-based multiple-instance +learning to assign a WSI-level grade. We propose three key innovations to +address general as well as cSCC-specific challenges in tumor grading. First, we +leverage spatial and semantic proximity to define a WSI graph that encodes both +local and non-local dependencies between tumor regions and leverage graph +attention convolution to derive contextual patch features. Second, we introduce +a novel ordinal ranking constraint on the patch attention network to ensure +that higher-grade tumor regions are assigned higher attention. Third, we use +tumor depth as an auxiliary task to improve grade classification in a multitask +learning framework. RACR-MIL achieves 2-9% improvement in grade classification +over existing weakly-supervised approaches on a dataset of 718 cSCC tissue +images and localizes the tumor better. The model achieves 5-20% higher accuracy +in difficult-to-classify high-risk grade classes and is robust to class +imbalance. + +
+
+ comment: 7 pages main text, 2 page references, 3 page appendix; submitted to + AAAI +
+
+
+
+
+ + ☆ Everything Perturbed All at Once: Enabling Differentiable Graph Attacks + + +
+ As powerful tools for representation learning on graphs, graph neural +networks (GNNs) have played an important role in applications including social +networks, recommendation systems, and online web services. However, GNNs have +been shown to be vulnerable to adversarial attacks, which can significantly +degrade their effectiveness. Recent state-of-the-art approaches in adversarial +attacks rely on gradient-based meta-learning to selectively perturb a single +edge with the highest attack score until they reach the budget constraint. +While effective in identifying vulnerable links, these methods are plagued by +high computational costs. By leveraging continuous relaxation and +parameterization of the graph structure, we propose a novel attack method +called Differentiable Graph Attack (DGA) to efficiently generate effective +attacks and meanwhile eliminate the need for costly retraining. Compared to the +state-of-the-art, DGA achieves nearly equivalent attack performance with 6 +times less training time and 11 times smaller GPU memory footprint on different +benchmark datasets. Additionally, we provide extensive experimental analyses of +the transferability of the DGA among different graph models, as well as its +robustness against widely-used defense mechanisms. + +
+
+
+
+
+ + ☆ Mixed Variational Flows for Discrete Variables + + +
+ Variational flows allow practitioners to learn complex continuous +distributions, but approximating discrete distributions remains a challenge. +Current methodologies typically embed the discrete target in a continuous space +- usually via continuous relaxation or dequantization - and then apply a +continuous flow. These approaches involve a surrogate target that may not +capture the original discrete target, might have biased or unstable gradients, +and can create a difficult optimization problem. In this work, we develop a +variational flow family for discrete distributions without any continuous +embedding. First, we develop a measure-preserving and discrete (MAD) invertible +map that leaves the discrete target invariant, and then create a mixed +variational flow (MAD Mix) based on that map. We also develop an extension to +MAD Mix that handles joint discrete and continuous models. Our experiments +suggest that MAD Mix produces more reliable approximations than +continuous-embedding flows while being significantly faster to train. + +
+
+
+
+
+ + ☆ InstaTune: Instantaneous Neural Architecture Search During Fine-Tuning + + +
+ One-Shot Neural Architecture Search (NAS) algorithms often rely on training a +hardware agnostic super-network for a domain specific task. Optimal +sub-networks are then extracted from the trained super-network for different +hardware platforms. However, training super-networks from scratch can be +extremely time consuming and compute intensive especially for large models that +rely on a two-stage training process of pre-training and fine-tuning. State of +the art pre-trained models are available for a wide range of tasks, but their +large sizes significantly limits their applicability on various hardware +platforms. We propose InstaTune, a method that leverages off-the-shelf +pre-trained weights for large models and generates a super-network during the +fine-tuning stage. InstaTune has multiple benefits. Firstly, since the process +happens during fine-tuning, it minimizes the overall time and compute resources +required for NAS. Secondly, the sub-networks extracted are optimized for the +target task, unlike prior work that optimizes on the pre-training objective. +Finally, InstaTune is easy to "plug and play" in existing frameworks. By using +multi-objective evolutionary search algorithms along with lightly trained +predictors, we find Pareto-optimal sub-networks that outperform their +respective baselines across different performance objectives such as accuracy +and MACs. Specifically, we demonstrate that our approach performs well across +both unimodal (ViT and BERT) and multi-modal (BEiT-3) transformer based +architectures. + +
+
+
+
+
+ + ☆ Measurement Tampering Detection Benchmark + + +
+ When training powerful AI systems to perform complex tasks, it may be +challenging to provide training signals which are robust to optimization. One +concern is measurement tampering, where the AI system manipulates multiple +measurements to create the illusion of good results instead of achieving the +desired outcome. In this work, we build four new text-based datasets to +evaluate measurement tampering detection techniques on large language models. +Concretely, given sets of text inputs and measurements aimed at determining if +some outcome occurred, as well as a base model able to accurately predict +measurements, the goal is to determine if examples where all measurements +indicate the outcome actually had the outcome occur, or if this was caused by +measurement tampering. We demonstrate techniques that outperform simple +baselines on most datasets, but don't achieve maximum performance. We believe +there is significant room for improvement for both techniques and datasets, and +we are excited for future work tackling measurement tampering. + +
+
+
+
+
+ + ☆ An Experimental Comparison of Partitioning Strategies for Distributed + Graph Neural Network Training + + +
+ Recently, graph neural networks (GNNs) have gained much attention as a +growing area of deep learning capable of learning on graph-structured data. +However, the computational and memory requirements for training GNNs on +large-scale graphs can exceed the capabilities of single machines or GPUs, +making distributed GNN training a promising direction for large-scale GNN +training. A prerequisite for distributed GNN training is to partition the input +graph into smaller parts that are distributed among multiple machines of a +compute cluster. Although graph partitioning has been extensively studied with +regard to graph analytics and graph databases, its effect on GNN training +performance is largely unexplored. + In this paper, we study the effectiveness of graph partitioning for +distributed GNN training. Our study aims to understand how different factors +such as GNN parameters, mini-batch size, graph type, features size, and +scale-out factor influence the effectiveness of graph partitioning. We conduct +experiments with two different GNN systems using vertex and edge partitioning. +We found that graph partitioning is a crucial pre-processing step that can +heavily reduce the training time and memory footprint. Furthermore, our results +show that invested partitioning time can be amortized by reduced GNN training, +making it a relevant optimization. + +
+
+
+
+
+ + ☆ Can transformers learn the greatest common divisor? + + +
+ I investigate the capability of small transformers to compute the greatest +common divisor (GCD) of two positive integers. When the training distribution +and the representation base are carefully chosen, models achieve 98% accuracy +and correctly predict 91 of the 100 first GCD. Model predictions are +deterministic and fully interpretable. During training, the models learn to +cluster input pairs with the same GCD, and classify them by their divisors. +Basic models, trained from uniform operands encoded on small bases, only +compute a handful of GCD (up to 38 out of 100): the products of divisors of the +base. Longer training and larger bases allow some models to "grok" small prime +GCD. Training from log-uniform operands boosts performance to 73 correct GCD, +and balancing the training distribution of GCD, from inverse square to +log-uniform, to 91 GCD. Training models from a uniform distribution of GCD +breaks the deterministic model behavior. + +
+
+
+
+
+ + ☆ Prototype Fission: Closing Set for Robust Open-set Semi-supervised + Learning + + +
+ Semi-supervised Learning (SSL) has been proven vulnerable to +out-of-distribution (OOD) samples in realistic large-scale unsupervised +datasets due to over-confident pseudo-labeling OODs as in-distribution (ID). A +key underlying problem is class-wise latent space spreading from closed seen +space to open unseen space, and the bias is further magnified in SSL's +self-training loops. To close the ID distribution set so that OODs are better +rejected for safe SSL, we propose Prototype Fission(PF) to divide class-wise +latent spaces into compact sub-spaces by automatic fine-grained latent space +mining, driven by coarse-grained labels only. Specifically, we form multiple +unique learnable sub-class prototypes for each class, optimized towards both +diversity and consistency. The Diversity Modeling term encourages samples to be +clustered by one of the multiple sub-class prototypes, while the Consistency +Modeling term clusters all samples of the same class to a global prototype. +Instead of "opening set", i.e., modeling OOD distribution, Prototype Fission +"closes set" and makes it hard for OOD samples to fit in sub-class latent +space. Therefore, PF is compatible with existing methods for further +performance gains. Extensive experiments validate the effectiveness of our +method in open-set SSL settings in terms of successfully forming sub-classes, +discriminating OODs from IDs and improving overall accuracy. Codes will be +released. + +
+
+
+
+
+ + ☆ Learning Sequential Information in Task-based fMRI for Synthetic Data + Augmentation MICCAI + + +
+ Insufficiency of training data is a persistent issue in medical image +analysis, especially for task-based functional magnetic resonance images (fMRI) +with spatio-temporal imaging data acquired using specific cognitive tasks. In +this paper, we propose an approach for generating synthetic fMRI sequences that +can then be used to create augmented training datasets in downstream learning +tasks. To synthesize high-resolution task-specific fMRI, we adapt the +$\alpha$-GAN structure, leveraging advantages of both GAN and variational +autoencoder models, and propose different alternatives in aggregating temporal +information. The synthetic images are evaluated from multiple perspectives +including visualizations and an autism spectrum disorder (ASD) classification +task. The results show that the synthetic task-based fMRI can provide effective +data augmentation in learning the ASD classification task. + +
+
+ comment: Accepted by Machine Learning in Clinical Neuroimaging 2023 (MICCAI + workshop), preprint version +
+
+
+
+
+ + ☆ Glocal Explanations of Expected Goal Models in Soccer + + +
+ The expected goal models have gained popularity, but their interpretability +is often limited, especially when trained using black-box methods. Explainable +artificial intelligence tools have emerged to enhance model transparency and +extract descriptive knowledge for a single observation or for all observations. +However, explaining black-box models for a specific group of observations may +be more useful in some domains. This paper introduces the glocal explanations +(between local and global levels) of the expected goal models to enable +performance analysis at the team and player levels by proposing the use of +aggregated versions of the SHAP values and partial dependence profiles. This +allows knowledge to be extracted from the expected goal model for a player or +team rather than just a single shot. In addition, we conducted real-data +applications to illustrate the usefulness of aggregated SHAP and aggregated +profiles. The paper concludes with remarks on the potential of these +explanations for performance analysis in soccer analytics. + +
+
+ comment: 26 pages, 8 figures +
+
+
+
+
+ + ☆ Dimensionality Reduction Using pseudo-Boolean polynomials For Cluster + Analysis + + +
+ We introduce usage of a reduction property of penalty-based formulation of +pseudo-Boolean polynomials as a mechanism for invariant dimensionality +reduction in cluster analysis processes. In our experiments, we show that +multidimensional data, like 4-dimensional Iris Flower dataset can be reduced to +2-dimensional space while the 30-dimensional Wisconsin Diagnostic Breast Cancer +(WDBC) dataset can be reduced to 3-dimensional space, and by searching lines or +planes that lie between reduced samples we can extract clusters in a linear and +unbiased manner with competitive accuracies, reproducibility and clear +interpretation. + +
+
+ comment: 14 pages, 4 figures, submitted to the International Conference Data + Analysis, Optimization and Their Applications on the Occasion of Boris + Mirkin's 80th Birthday January 30-31, 2023, Dolgoprudny, Moscow Region, + Moscow Institute of Physics and Technology + https://mipt.ru/education/chairs/dm/conferences/data-analysis-optimization-and-their-applications-2023.php +
+
+
+
+
+ + ☆ Pure Exploration under Mediators' Feedback + + +
+ Stochastic multi-armed bandits are a sequential-decision-making framework, +where, at each interaction step, the learner selects an arm and observes a +stochastic reward. Within the context of best-arm identification (BAI) +problems, the goal of the agent lies in finding the optimal arm, i.e., the one +with highest expected reward, as accurately and efficiently as possible. +Nevertheless, the sequential interaction protocol of classical BAI problems, +where the agent has complete control over the arm being pulled at each round, +does not effectively model several decision-making problems of interest (e.g., +off-policy learning, partially controllable environments, and human feedback). +For this reason, in this work, we propose a novel strict generalization of the +classical BAI problem that we refer to as best-arm identification under +mediators' feedback (BAI-MF). More specifically, we consider the scenario in +which the learner has access to a set of mediators, each of which selects the +arms on the agent's behalf according to a stochastic and possibly unknown +policy. The mediator, then, communicates back to the agent the pulled arm +together with the observed reward. In this setting, the agent's goal lies in +sequentially choosing which mediator to query to identify with high probability +the optimal arm while minimizing the identification time, i.e., the sample +complexity. To this end, we first derive and analyze a statistical lower bound +on the sample complexity specific to our general mediator feedback scenario. +Then, we propose a sequential decision-making strategy for discovering the best +arm under the assumption that the mediators' policies are known to the learner. +As our theory verifies, this algorithm matches the lower bound both almost +surely and in expectation. Finally, we extend these results to cases where the +mediators' policies are unknown to the learner obtaining comparable results. + +
+
+
+
+
+ + ☆ Adversarial Style Transfer for Robust Policy Optimization in Deep + Reinforcement Learning + + +
+ This paper proposes an algorithm that aims to improve generalization for +reinforcement learning agents by removing overfitting to confounding features. +Our approach consists of a max-min game theoretic objective. A generator +transfers the style of observation during reinforcement learning. An additional +goal of the generator is to perturb the observation, which maximizes the +agent's probability of taking a different action. In contrast, a policy network +updates its parameters to minimize the effect of such perturbations, thus +staying robust while maximizing the expected future reward. Based on this +setup, we propose a practical deep reinforcement learning algorithm, +Adversarial Robust Policy Optimization (ARPO), to find a robust policy that +generalizes to unseen environments. We evaluate our approach on Procgen and +Distracting Control Suite for generalization and sample efficiency. +Empirically, ARPO shows improved performance compared to a few baseline +algorithms, including data augmentation. + +
+
+
+
+
+ + ☆ Tuning the perplexity for and computing sampling-based t-SNE embeddings + + +
+ Widely used pipelines for the analysis of high-dimensional data utilize +two-dimensional visualizations. These are created, e.g., via t-distributed +stochastic neighbor embedding (t-SNE). When it comes to large data sets, +applying these visualization techniques creates suboptimal embeddings, as the +hyperparameters are not suitable for large data. Cranking up these parameters +usually does not work as the computations become too expensive for practical +workflows. In this paper, we argue that a sampling-based embedding approach can +circumvent these problems. We show that hyperparameters must be chosen +carefully, depending on the sampling rate and the intended final embedding. +Further, we show how this approach speeds up the computation and increases the +quality of the embeddings. + +
+
+
+
+
+ + ♻ ☆ Empowering Clinicians and Democratizing Data Science: Large Language + Models Automate Machine Learning for Clinical Studies + + +
+ A knowledge gap persists between Machine Learning (ML) developers (e.g., data +scientists) and practitioners (e.g., clinicians), hampering the full +utilization of ML for clinical data analysis. We investigated the potential of +the chatGPT Advanced Data Analysis (ADA), an extension of GPT-4, to bridge this +gap and perform ML analyses efficiently. Real-world clinical datasets and study +details from large trials across various medical specialties were presented to +chatGPT ADA without specific guidance. ChatGPT ADA autonomously developed +state-of-the-art ML models based on the original study's training data to +predict clinical outcomes such as cancer development, cancer progression, +disease complications, or biomarkers such as pathogenic gene sequences. +Strikingly, these ML models matched or outperformed their published +counterparts. We conclude that chatGPT ADA offers a promising avenue to +democratize ML in medicine, making advanced analytics accessible to non-ML +experts and promoting broader applications in medical research and practice. + +
+
+
+
+
+ + ♻ ☆ Fairness-aware Vision Transformer via Debiased Self-Attention + + +
+ Vision Transformer (ViT) has recently gained significant interest in solving +computer vision (CV) problems due to its capability of extracting informative +features and modeling long-range dependencies through the self-attention +mechanism. To fully realize the advantages of ViT in real-world applications, +recent works have explored the trustworthiness of ViT, including its robustness +and explainability. However, another desiderata, fairness has not yet been +adequately addressed in the literature. We establish that the existing +fairness-aware algorithms (primarily designed for CNNs) do not perform well on +ViT. This necessitates the need for developing our novel framework via Debiased +Self-Attention (DSA). DSA is a fairness-through-blindness approach that +enforces ViT to eliminate spurious features correlated with the sensitive +attributes for bias mitigation. Notably, adversarial examples are leveraged to +locate and mask the spurious features in the input image patches. In addition, +DSA utilizes an attention weights alignment regularizer in the training +objective to encourage learning informative features for target prediction. +Importantly, our DSA framework leads to improved fairness guarantees over prior +works on multiple prediction tasks without compromising target prediction +performance. + +
+
+
+
+
+ + ♻ ☆ Mol-Instructions: A Large-Scale Biomolecular Instruction Dataset for + Large Language Models + + +
+ Large Language Models (LLMs), with their remarkable task-handling +capabilities and innovative outputs, have catalyzed significant advancements +across a spectrum of fields. However, their proficiency within specialized +domains such as biomolecular studies remains limited. To address this +challenge, we introduce Mol-Instructions, a meticulously curated, comprehensive +instruction dataset expressly designed for the biomolecular realm. +Mol-Instructions is composed of three pivotal components: molecule-oriented +instructions, protein-oriented instructions, and biomolecular text +instructions, each curated to enhance the understanding and prediction +capabilities of LLMs concerning biomolecular features and behaviors. Through +extensive instruction tuning experiments on the representative LLM, we +underscore the potency of Mol-Instructions to enhance the adaptability and +cognitive acuity of large models within the complex sphere of biomolecular +studies, thereby promoting advancements in the biomolecular research community. +Mol-Instructions is made publicly accessible for future research endeavors and +will be subjected to continual updates for enhanced applicability. + +
+
+ comment: Project homepage: https://github.com/zjunlp/Mol-Instructions. Add + quantitative evaluations +
+
+
+
+
+ + ♻ ☆ An Empirical Investigation of the Role of Pre-training in Lifelong + Learning + + +
+ The lifelong learning paradigm in machine learning is an attractive +alternative to the more prominent isolated learning scheme not only due to its +resemblance to biological learning but also its potential to reduce energy +waste by obviating excessive model re-training. A key challenge to this +paradigm is the phenomenon of catastrophic forgetting. With the increasing +popularity and success of pre-trained models in machine learning, we pose the +question: What role does pre-training play in lifelong learning, specifically +with respect to catastrophic forgetting? We investigate existing methods in the +context of large, pre-trained models and evaluate their performance on a +variety of text and image classification tasks, including a large-scale study +using a novel data set of 15 diverse NLP tasks. Across all settings, we observe +that generic pre-training implicitly alleviates the effects of catastrophic +forgetting when learning multiple tasks sequentially compared to randomly +initialized models. We then further investigate why pre-training alleviates +forgetting in this setting. We study this phenomenon by analyzing the loss +landscape, finding that pre-trained weights appear to ease forgetting by +leading to wider minima. Based on this insight, we propose jointly optimizing +for current task loss and loss basin sharpness to explicitly encourage wider +basins during sequential fine-tuning. We show that this optimization approach +outperforms several state-of-the-art task-sequential continual learning +algorithms across multiple settings, occasionally even without retaining a +memory that scales in size with the number of tasks. + +
+
+
+
+
+ + ♻ ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking +closer to real-world applications, both in the nature of data tested ($X$: +multi-channel, multi-paged, multi-industry; $Y$: class distributions and label +set variety) and in classification tasks considered ($f$: multi-page document, +page stream, and document bundle classification, ...). We identify the lack of +public multi-page document classification datasets, formalize different +classification tasks arising in application scenarios, and motivate the value +of targeting efficient multi-page document representations. An experimental +study on proposed multi-page document classification datasets demonstrates that +current benchmarks have become irrelevant and need to be updated to evaluate +complete documents, as they naturally occur in practice. This reality check +also calls for more mature evaluation methodologies, covering calibration +evaluation, inference complexity (time-memory), and a range of realistic +distribution shifts (e.g., born-digital vs. scanning noise, shifting page +order). Our study ends on a hopeful note by recommending concrete avenues for +future improvements.} + +
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ♻ ☆ inTformer: A Time-Embedded Attention-Based Transformer for Crash + Likelihood Prediction at Intersections Using Connected Vehicle Data + + +
+ The real-time crash likelihood prediction model is an essential component of +the proactive traffic safety management system. Over the years, numerous +studies have attempted to construct a crash likelihood prediction model in +order to enhance traffic safety, but mostly on freeways. In the majority of the +existing studies, researchers have primarily employed a deep learning-based +framework to identify crash potential. Lately, Transformer has emerged as a +potential deep neural network that fundamentally operates through +attention-based mechanisms. Transformer has several functional benefits over +extant deep learning models such as LSTM, CNN, etc. Firstly, Transformer can +readily handle long-term dependencies in a data sequence. Secondly, +Transformers can parallelly process all elements in a data sequence during +training. Finally, a Transformer does not have the vanishing gradient issue. +Realizing the immense possibility of Transformers, this paper proposes +inTersection-Transformer (inTformer), a time-embedded attention-based +Transformer model that can effectively predict intersection crash likelihood in +real-time. The proposed model was evaluated using connected vehicle data +extracted from Signal Analytics Platform. Acknowledging the complex traffic +operation mechanism at intersection, this study developed zone-specific models +by dividing the intersection region into two distinct zones: +within-intersection and approach zone. The best inTformer models in +'within-intersection,' and 'approach' zone achieved a sensitivity of 73%, and +70%, respectively. The zone-level models were also compared to earlier studies +on crash likelihood prediction at intersections and with several established +deep learning models trained on the same connected vehicle dataset. + +
+
+ comment: 29 pages, 10 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ A Bayesian Framework for Digital Twin-Based Control, Monitoring, and + Data Collection in Wireless Systems + + +
+ Commonly adopted in the manufacturing and aerospace sectors, digital twin +(DT) platforms are increasingly seen as a promising paradigm to control, +monitor, and analyze software-based, "open", communication systems. Notably, DT +platforms provide a sandbox in which to test artificial intelligence (AI) +solutions for communication systems, potentially reducing the need to collect +data and test algorithms in the field, i.e., on the physical twin (PT). A key +challenge in the deployment of DT systems is to ensure that virtual control +optimization, monitoring, and analysis at the DT are safe and reliable, +avoiding incorrect decisions caused by "model exploitation". To address this +challenge, this paper presents a general Bayesian framework with the aim of +quantifying and accounting for model uncertainty at the DT that is caused by +limitations in the amount and quality of data available at the DT from the PT. +In the proposed framework, the DT builds a Bayesian model of the communication +system, which is leveraged to enable core DT functionalities such as control +via multi-agent reinforcement learning (MARL), monitoring of the PT for anomaly +detection, prediction, data-collection optimization, and counterfactual +analysis. To exemplify the application of the proposed framework, we +specifically investigate a case-study system encompassing multiple sensing +devices that report to a common receiver. Experimental results validate the +effectiveness of the proposed Bayesian framework as compared to standard +frequentist model-based solutions. + +
+
+ comment: Accepted for publication in IEEE Journal on Selected Areas in + Communications ; Extends and subsumes arXiv:2210.05582 ; Updates: - + 18/01/2023: Updated reference ; - 29/08/2023: Revised manuscript version +
+
+
+
+
+ + ♻ ☆ Investigating Reproducibility at Interspeech Conferences: A Longitudinal + and Comparative Perspective + + +
+ Reproducibility is a key aspect for scientific advancement across +disciplines, and reducing barriers for open science is a focus area for the +theme of Interspeech 2023. Availability of source code is one of the indicators +that facilitates reproducibility. However, less is known about the rates of +reproducibility at Interspeech conferences in comparison to other conferences +in the field. In order to fill this gap, we have surveyed 27,717 papers at +seven conferences across speech and language processing disciplines. We find +that despite having a close number of accepted papers to the other conferences, +Interspeech has up to 40% less source code availability. In addition to +reporting the difficulties we have encountered during our research, we also +provide recommendations and possible directions to increase reproducibility for +further studies. + +
+
+
+
+
+ + ♻ torchgfn: A PyTorch GFlowNet library + + +
+ The growing popularity of generative flow networks (GFlowNets or GFNs) from a +range of researchers with diverse backgrounds and areas of expertise +necessitates a library which facilitates the testing of new features such as +training losses that can be easily compared to standard benchmark +implementations, or on a set of common environments. torchgfn is a PyTorch +library that aims to address this need. It provides users with a simple API for +environments and useful abstractions for samplers and losses. Multiple examples +are provided, replicating and unifying published results. The code is available +in https://github.com/saleml/torchgfn. + +
+
+
+
+
+ + ♻ ☆ Application Performance Modeling via Tensor Completion + + +
+ Performance tuning, software/hardware co-design, and job scheduling are among +the many tasks that rely on models to predict application performance. We +propose and evaluate low-rank tensor decomposition for modeling application +performance. We discretize the input and configuration domains of an +application using regular grids. Application execution times mapped within +grid-cells are averaged and represented by tensor elements. We show that +low-rank canonical-polyadic (CP) tensor decomposition is effective in +approximating these tensors. We further show that this decomposition enables +accurate extrapolation of unobserved regions of an application's parameter +space. We then employ tensor completion to optimize a CP decomposition given a +sparse set of observed execution times. We consider alternative +piecewise/grid-based models and supervised learning models for six applications +and demonstrate that CP decomposition optimized using tensor completion offers +higher prediction accuracy and memory-efficiency for high-dimensional +performance modeling. + +
+
+
+
+
+ + ♻ ☆ Learning Bayesian Networks with Heterogeneous Agronomic Data Sets via + Mixed-Effect Models and Hierarchical Clustering + + +
+ Research involving diverse but related data sets, where associations between +covariates and outcomes may vary, is prevalent in various fields including +agronomic studies. In these scenarios, hierarchical models, also known as +multilevel models, are frequently employed to assimilate information from +different data sets while accommodating their distinct characteristics. +However, their structure extend beyond simple heterogeneity, as variables often +form complex networks of causal relationships. + Bayesian networks (BNs) provide a powerful framework for modelling such +relationships using directed acyclic graphs to illustrate the connections +between variables. This study introduces a novel approach that integrates +random effects into BN learning. Rooted in linear mixed-effects models, this +approach is particularly well-suited for handling hierarchical data. Results +from a real-world agronomic trial suggest that employing this approach enhances +structural learning, leading to the discovery of new connections and the +improvement of improved model specification. Furthermore, we observe a +reduction in prediction errors from 28% to 17%. By extending the applicability +of BNs to complex data set structures, this approach contributes to the +effective utilisation of BNs for hierarchical agronomic data. This, in turn, +enhances their value as decision-support tools in the field. + +
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Quantifying Causes of Arctic Amplification via Deep Learning based + Time-series Causal Inference + + +
+ The warming of the Arctic, also known as Arctic amplification, is led by +several atmospheric and oceanic drivers. However, the details of its underlying +thermodynamic causes are still unknown. Inferring the causal effects of +atmospheric processes on sea ice melt using fixed treatment effect strategies +leads to unrealistic counterfactual estimations. Such models are also prone to +bias due to time-varying confoundedness. Further, the complex non-linearity in +Earth science data makes it infeasible to perform causal inference using +existing marginal structural techniques. In order to tackle these challenges, +we propose TCINet - time-series causal inference model to infer causation under +continuous treatment using recurrent neural networks and a novel probabilistic +balancing technique. Through experiments on synthetic and observational data, +we show how our research can substantially improve the ability to quantify +leading causes of Arctic sea ice melt, further paving paths for causal +inference in observational Earth science. + +
+
+
+
+
+ + ♻ ☆ Bayesian Feature Selection in Joint Quantile Time Series Analysis + + +
+ Quantile feature selection over correlated multivariate time series data has +always been a methodological challenge and is an open problem. In this paper, +we propose a general Bayesian dimension reduction methodology for feature +selection in high-dimensional joint quantile time series analysis, under the +name of the quantile feature selection time series (QFSTS) model. The QFSTS +model is a general structural time series model, where each component yields an +additive contribution to the time series modeling with direct interpretations. +Its flexibility is compound in the sense that users can add/deduct components +for each time series and each time series can have its own specific valued +components of different sizes. Feature selection is conducted in the quantile +regression component, where each time series has its own pool of +contemporaneous external predictors allowing nowcasting. Bayesian methodology +in extending feature selection to the quantile time series research area is +developed using multivariate asymmetric Laplace distribution, spike-and-slab +prior setup, the Metropolis-Hastings algorithm, and the Bayesian model +averaging technique, all implemented consistently in the Bayesian paradigm. The +QFSTS model requires small datasets to train and converges fast. Extensive +examinations confirmed that the QFSTS model has superior performance in feature +selection, parameter estimation, and forecast. + +
+
+ comment: Accepted to the Bayesian Analysis journal +
+
+
+
+
+ + ♻ ☆ Uncertainty-inspired Open Set Learning for Retinal Anomaly + Identification + + +
+ Failure to recognize samples from the classes unseen during training is a +major limitation of artificial intelligence in the real-world implementation +for recognition and classification of retinal anomalies. We established an +uncertainty-inspired open-set (UIOS) model, which was trained with fundus +images of 9 retinal conditions. Besides assessing the probability of each +category, UIOS also calculated an uncertainty score to express its confidence. +Our UIOS model with thresholding strategy achieved an F1 score of 99.55%, +97.01% and 91.91% for the internal testing set, external target categories +(TC)-JSIEC dataset and TC-unseen testing set, respectively, compared to the F1 +score of 92.20%, 80.69% and 64.74% by the standard AI model. Furthermore, UIOS +correctly predicted high uncertainty scores, which would prompt the need for a +manual check in the datasets of non-target categories retinal diseases, +low-quality fundus images, and non-fundus images. UIOS provides a robust method +for real-world screening of retinal anomalies. + +
+
+
+
+
+ + ♻ ☆ Combinatorial Pure Exploration with Full-bandit Feedback and Beyond: + Solving Combinatorial Optimization under Uncertainty with Limited Observation + + +
+ Combinatorial optimization is one of the fundamental research fields that has +been extensively studied in theoretical computer science and operations +research. When developing an algorithm for combinatorial optimization, it is +commonly assumed that parameters such as edge weights are exactly known as +inputs. However, this assumption may not be fulfilled since input parameters +are often uncertain or initially unknown in many applications such as +recommender systems, crowdsourcing, communication networks, and online +advertisement. To resolve such uncertainty, the problem of combinatorial pure +exploration of multi-armed bandits (CPE) and its variants have recieved +increasing attention. Earlier work on CPE has studied the semi-bandit feedback +or assumed that the outcome from each individual edge is always accessible at +all rounds. However, due to practical constraints such as a budget ceiling or +privacy concern, such strong feedback is not always available in recent +applications. In this article, we review recently proposed techniques for +combinatorial pure exploration problems with limited feedback. + +
+
+ comment: Preprint of an Invited Review Article, In Fields Institute +
+
+
+
+
+ + ♻ ☆ Large Language Models are Fixated by Red Herrings: Exploring Creative + Problem Solving and Einstellung Effect using the Only Connect Wall Dataset + + +
+ The quest for human imitative AI has been an enduring topic in AI research +since its inception. The technical evolution and emerging capabilities of the +latest cohort of large language models (LLMs) have reinvigorated the subject +beyond academia to the cultural zeitgeist. While recent NLP evaluation +benchmark tasks test some aspects of human-imitative behaviour (e.g., +BIG-bench's 'human-like behavior' tasks), few, if not none, examine creative +problem solving abilities. Creative problem solving in humans is a well-studied +topic in cognitive neuroscience with standardized tests that predominantly use +the ability to associate (heterogeneous) connections among clue words as a +metric for creativity. Exposure to misleading stimuli - distractors dubbed red +herrings - impede human performance in such tasks via the fixation effect and +Einstellung paradigm. In cognitive neuroscience studies, such fixations are +experimentally induced by pre-exposing participants to orthographically similar +incorrect words to subsequent word-fragments or clues. The popular British quiz +show Only Connect's Connecting Wall segment essentially mimics Mednick's Remote +Associates Test (RAT) formulation with built-in, deliberate red herrings, which +makes it an ideal proxy dataset to explore and study fixation effect and +Einstellung paradigm from cognitive neuroscience in LLMs. In this paper we +present the novel Only Connect Wall (OCW) dataset and report results from our +evaluation of selected pre-trained language models and LLMs on creative problem +solving tasks like grouping clue words by heterogeneous connections, and +identifying correct open knowledge domain connections in respective groups. We +synthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to +further analyze our red-herrings hypothesis in language models. The code and +link to the dataset are available at https://github.com/TaatiTeam/OCW. + +
+
+ comment: V3: Minor cosmetic adjustment from V2. Fixed Fig. 2 caption + overlapping with text in S2.2. V2: with added OCW-Randomized and OCW-WordNet + results in Section 4.3 (added). 22 pages with Appendix +
+
+
+
+
+ + ♻ ☆ Deep Learning Based Residuals in Non-linear Factor Models: Precision + Matrix Estimation of Returns with Low Signal-to-Noise Ratio + + +
+ This paper introduces a consistent estimator and rate of convergence for the +precision matrix of asset returns in large portfolios using a non-linear factor +model within the deep learning framework. Our estimator remains valid even in +low signal-to-noise ratio environments typical for financial markets and is +compatible with weak factors. Our theoretical analysis establishes uniform +bounds on expected estimation risk based on deep neural networks for an +expanding number of assets. Additionally, we provide a new consistent +data-dependent estimator of error covariance in deep neural networks. Our +models demonstrate superior accuracy in extensive simulations and the empirics. + +
+
+
+
+
+ + ♻ ☆ Atlas-Based Interpretable Age Prediction In Whole-Body MR Images + + +
+ Age prediction is an important part of medical assessments and research. It +can aid in detecting diseases as well as abnormal ageing by highlighting the +discrepancy between chronological and biological age. To gain a comprehensive +understanding of age-related changes observed in various body parts, we +investigate them on a larger scale by using whole-body images. We utilise the +Grad-CAM interpretability method to determine the body areas most predictive of +a person's age. We expand our analysis beyond individual subjects by employing +registration techniques to generate population-wide interpretability maps. +Furthermore, we set state-of-the-art whole-body age prediction with a model +that achieves a mean absolute error of 2.76 years. Our findings reveal three +primary areas of interest: the spine, the autochthonous back muscles, and the +cardiac region, which exhibits the highest importance. + +
+
+
+
+
+ + ♻ ☆ Generalized partitioned local depth + + +
+ In this paper we provide a generalization of the concept of cohesion as +introduced recently by Berenhaut, Moore and Melvin [Proceedings of the National +Academy of Sciences, 119 (4) (2022)]. The formulation presented builds on the +technique of partitioned local depth by distilling two key probabilistic +concepts: local relevance and support division. Earlier results are extended +within the new context, and examples of applications to revealing communities +in data with uncertainty are included. The work sheds light on the foundations +of partitioned local depth, and extends the original ideas to enable +probabilistic consideration of uncertain, variable and potentially conflicting +information. + +
+
+ comment: Improved exposition & motivation, references added, 19 pages, 6 + figures +
+
+
+
+
+ + ♻ ☆ An Analysis of Abstracted Model-Based Reinforcement Learning + + +
+ Many methods for Model-based Reinforcement learning (MBRL) in Markov decision +processes (MDPs) provide guarantees for both the accuracy of the model they can +deliver and the learning efficiency. At the same time, state abstraction +techniques allow for a reduction of the size of an MDP while maintaining a +bounded loss with respect to the original problem. Therefore, it may come as a +surprise that no such guarantees are available when combining both techniques, +i.e., where MBRL merely observes abstract states. Our theoretical analysis +shows that abstraction can introduce a dependence between samples collected +online (e.g., in the real world). That means that, without taking this +dependence into account, results for MBRL do not directly extend to this +setting. Our result shows that we can use concentration inequalities for +martingales to overcome this problem. This result makes it possible to extend +the guarantees of existing MBRL algorithms to the setting with abstraction. We +illustrate this by combining R-MAX, a prototypical MBRL algorithm, with +abstraction, thus producing the first performance guarantees for model-based +`RL from Abstracted Observations': model-based reinforcement learning with an +abstract model. + +
+
+ comment: 36 pages, 2 figures, submitted to TMLR +
+
+
+
+
+ + ♻ ☆ Strategic Coalition for Data Pricing in IoT Data Markets + + +
+ This paper considers a market for trading Internet of Things (IoT) data that +is used to train machine learning models. The data, either raw or processed, is +supplied to the market platform through a network and the price of such data is +controlled based on the value it brings to the machine learning model. We +explore the correlation property of data in a game-theoretical setting to +eventually derive a simplified distributed solution for a data trading +mechanism that emphasizes the mutual benefit of devices and the market. The key +proposal is an efficient algorithm for markets that jointly addresses the +challenges of availability and heterogeneity in participation, as well as the +transfer of trust and the economic value of data exchange in IoT networks. The +proposed approach establishes the data market by reinforcing collaboration +opportunities between device with correlated data to avoid information leakage. +Therein, we develop a network-wide optimization problem that maximizes the +social value of coalition among the IoT devices of similar data types; at the +same time, it minimizes the cost due to network externalities, i.e., the impact +of information leakage due to data correlation, as well as the opportunity +costs. Finally, we reveal the structure of the formulated problem as a +distributed coalition game and solve it following the simplified +split-and-merge algorithm. Simulation results show the efficacy of our proposed +mechanism design toward a trusted IoT data market, with up to 32.72% gain in +the average payoff for each seller. + +
+
+ comment: 15 pages. 12 figures. This paper has been accepted for publication in + IEEE Internet of Things Journal. Copyright may change without notice +
+
+
+
+
+ + ♻ ☆ Cyclic and Randomized Stepsizes Invoke Heavier Tails in SGD than + Constant Stepsize + + +
+ Cyclic and randomized stepsizes are widely used in the deep learning practice +and can often outperform standard stepsize choices such as constant stepsize in +SGD. Despite their empirical success, not much is currently known about when +and why they can theoretically improve the generalization performance. We +consider a general class of Markovian stepsizes for learning, which contain +i.i.d. random stepsize, cyclic stepsize as well as the constant stepsize as +special cases, and motivated by the literature which shows that heaviness of +the tails (measured by the so-called "tail-index") in the SGD iterates is +correlated with generalization, we study tail-index and provide a number of +theoretical results that demonstrate how the tail-index varies on the stepsize +scheduling. Our results bring a new understanding of the benefits of cyclic and +randomized stepsizes compared to constant stepsize in terms of the tail +behavior. We illustrate our theory on linear regression experiments and show +through deep learning experiments that Markovian stepsizes can achieve even a +heavier tail and be a viable alternative to cyclic and i.i.d. randomized +stepsize rules. + +
+
+ comment: To Appear +
+
+
+
+
+ + ♻ ☆ Explainable AI Insights for Symbolic Computation: A case study on + selecting the variable ordering for cylindrical algebraic decomposition + + +
+ In recent years there has been increased use of machine learning (ML) +techniques within mathematics, including symbolic computation where it may be +applied safely to optimise or select algorithms. This paper explores whether +using explainable AI (XAI) techniques on such ML models can offer new insight +for symbolic computation, inspiring new implementations within computer algebra +systems that do not directly call upon AI tools. We present a case study on the +use of ML to select the variable ordering for cylindrical algebraic +decomposition. It has already been demonstrated that ML can make the choice +well, but here we show how the SHAP tool for explainability can be used to +inform new heuristics of a size and complexity similar to those human-designed +heuristics currently commonly used in symbolic computation. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Towards an AI-enabled Connected Industry: AGV Communication and Sensor + Measurement Datasets + + +
+ This paper presents two wireless measurement campaigns in industrial +testbeds: industrial Vehicle-to-vehicle (iV2V) and industrial +Vehicle-to-infrastructure plus Sensor (iV2I+), together with detailed +information about the two captured datasets. iV2V covers sidelink communication +scenarios between Automated Guided Vehicles (AGVs), while iV2I+ is conducted at +an industrial setting where an autonomous cleaning robot is connected to a +private cellular network. The combination of different communication +technologies within a common measurement methodology provides insights that can +be exploited by Machine Learning (ML) for tasks such as fingerprinting, +line-of-sight detection, prediction of quality of service or link selection. +Moreover, the datasets are publicly available, labelled and prefiltered for +fast on-boarding and applicability. + +
+
+ comment: 7 pages, 3 figures. Submitted to a magazine. Datasets available at + https://ieee-dataport.org/open-access/ai4mobile-industrial-wireless-datasets-iv2v-and-iv2i +
+
+
+
+
+ + ♻ ☆ Deep Curvilinear Editing: Commutative and Nonlinear Image Manipulation + for Pretrained Deep Generative Model CVPR2023 + + +
+ Semantic editing of images is the fundamental goal of computer vision. +Although deep learning methods, such as generative adversarial networks (GANs), +are capable of producing high-quality images, they often do not have an +inherent way of editing generated images semantically. Recent studies have +investigated a way of manipulating the latent variable to determine the images +to be generated. However, methods that assume linear semantic arithmetic have +certain limitations in terms of the quality of image editing, whereas methods +that discover nonlinear semantic pathways provide non-commutative editing, +which is inconsistent when applied in different orders. This study proposes a +novel method called deep curvilinear editing (DeCurvEd) to determine semantic +commuting vector fields on the latent space. We theoretically demonstrate that +owing to commutativity, the editing of multiple attributes depends only on the +quantities and not on the order. Furthermore, we experimentally demonstrate +that compared to previous methods, the nonlinear and commutative nature of +DeCurvEd facilitates the disentanglement of image attributes and provides +higher-quality editing. + +
+
+ comment: 15 pages. The last update made no changes except for adding the + following link to the CVF repository: + https://openaccess.thecvf.com/content/CVPR2023/html/Aoshima_Deep_Curvilinear_Editing_Commutative_and_Nonlinear_Image_Manipulation_for_Pretrained_CVPR_2023_paper.html. + Here, you can find our code to reproduce our results +
+
+
+
+
+ + ♻ ☆ Recurrent segmentation meets block models in temporal networks + + +
+ A popular approach to model interactions is to represent them as a network +with nodes being the agents and the interactions being the edges. Interactions +are often timestamped, which leads to having timestamped edges. Many real-world +temporal networks have a recurrent or possibly cyclic behaviour. For example, +social network activity may be heightened during certain hours of day. In this +paper, our main interest is to model recurrent activity in such temporal +networks. As a starting point we use stochastic block model, a popular choice +for modelling static networks, where nodes are split into $R$ groups. We extend +this model to temporal networks by modelling the edges with a Poisson process. +We make the parameters of the process dependent on time by segmenting the time +line into $K$ segments. To enforce the recurring activity we require that only +$H < K$ different set of parameters can be used, that is, several, not +necessarily consecutive, segments must share their parameters. We prove that +the searching for optimal blocks and segmentation is an NP-hard problem. +Consequently, we split the problem into 3 subproblems where we optimize blocks, +model parameters, and segmentation in turn while keeping the remaining +structures fixed. We propose an iterative algorithm that requires $O(KHm + Rn + +R^2H)$ time per iteration, where $n$ and $m$ are the number of nodes and edges +in the network. We demonstrate experimentally that the number of required +iterations is typically low, the algorithm is able to discover the ground truth +from synthetic datasets, and show that certain real-world networks exhibit +recurrent behaviour as the likelihood does not deteriorate when $H$ is lowered. + +
+
+
+
+
+ + ♻ ☆ Walking Your LiDOG: A Journey Through Multiple Domains for LiDAR + Semantic Segmentation ICCV 2023 + + +
+ The ability to deploy robots that can operate safely in diverse environments +is crucial for developing embodied intelligent agents. As a community, we have +made tremendous progress in within-domain LiDAR semantic segmentation. However, +do these methods generalize across domains? To answer this question, we design +the first experimental setup for studying domain generalization (DG) for LiDAR +semantic segmentation (DG-LSS). Our results confirm a significant gap between +methods, evaluated in a cross-domain setting: for example, a model trained on +the source dataset (SemanticKITTI) obtains $26.53$ mIoU on the target data, +compared to $48.49$ mIoU obtained by the model trained on the target domain +(nuScenes). To tackle this gap, we propose the first method specifically +designed for DG-LSS, which obtains $34.88$ mIoU on the target domain, +outperforming all baselines. Our method augments a sparse-convolutional +encoder-decoder 3D segmentation network with an additional, dense 2D +convolutional decoder that learns to classify a birds-eye view of the point +cloud. This simple auxiliary task encourages the 3D network to learn features +that are robust to sensor placement shifts and resolution, and are transferable +across domains. With this work, we aim to inspire the community to develop and +evaluate future models in such cross-domain conditions. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Physics-informed neural networks for solving forward and inverse + problems in complex beam systems + + +
+ This paper proposes a new framework using physics-informed neural networks +(PINNs) to simulate complex structural systems that consist of single and +double beams based on Euler-Bernoulli and Timoshenko theory, where the double +beams are connected with a Winkler foundation. In particular, forward and +inverse problems for the Euler-Bernoulli and Timoshenko partial differential +equations (PDEs) are solved using nondimensional equations with the +physics-informed loss function. Higher-order complex beam PDEs are efficiently +solved for forward problems to compute the transverse displacements and +cross-sectional rotations with less than 1e-3 percent error. Furthermore, +inverse problems are robustly solved to determine the unknown dimensionless +model parameters and applied force in the entire space-time domain, even in the +case of noisy data. The results suggest that PINNs are a promising strategy for +solving problems in engineering structures and machines involving beam systems. + +
+
+
+
+
+ + ♻ ☆ A Survey of Imbalanced Learning on Graphs: Problems, Techniques, and + Future Directions + + +
+ Graphs represent interconnected structures prevalent in a myriad of +real-world scenarios. Effective graph analytics, such as graph learning +methods, enables users to gain profound insights from graph data, underpinning +various tasks including node classification and link prediction. However, these +methods often suffer from data imbalance, a common issue in graph data where +certain segments possess abundant data while others are scarce, thereby leading +to biased learning outcomes. This necessitates the emerging field of imbalanced +learning on graphs, which aims to correct these data distribution skews for +more accurate and representative learning outcomes. In this survey, we embark +on a comprehensive review of the literature on imbalanced learning on graphs. +We begin by providing a definitive understanding of the concept and related +terminologies, establishing a strong foundational understanding for readers. +Following this, we propose two comprehensive taxonomies: (1) the problem +taxonomy, which describes the forms of imbalance we consider, the associated +tasks, and potential solutions; (2) the technique taxonomy, which details key +strategies for addressing these imbalances, and aids readers in their method +selection process. Finally, we suggest prospective future directions for both +problems and techniques within the sphere of imbalanced learning on graphs, +fostering further innovation in this critical area. + +
+
+ comment: The collection of awesome literature on imbalanced learning on + graphs: https://github.com/Xtra-Computing/Awesome-Literature-ILoGs +
+
+
+
+
+ + ♻ ☆ Policy Gradient for Reinforcement Learning with General Utilities + + +
+ In Reinforcement Learning (RL), the goal of agents is to discover an optimal +policy that maximizes the expected cumulative rewards. This objective may also +be viewed as finding a policy that optimizes a linear function of its +state-action occupancy measure, hereafter referred as Linear RL. However, many +supervised and unsupervised RL problems are not covered in the Linear RL +framework, such as apprenticeship learning, pure exploration and variational +intrinsic control, where the objectives are non-linear functions of the +occupancy measures. RL with non-linear utilities looks unwieldy, as methods +like Bellman equation, value iteration, policy gradient, dynamic programming +that had tremendous success in Linear RL, fail to trivially generalize. In this +paper, we derive the policy gradient theorem for RL with general utilities. The +policy gradient theorem proves to be a cornerstone in Linear RL due to its +elegance and ease of implementability. Our policy gradient theorem for RL with +general utilities shares the same elegance and ease of implementability. Based +on the policy gradient theorem derived, we also present a simple sample-based +algorithm. We believe our results will be of interest to the community and +offer inspiration to future works in this generalized setting. + +
+
+
+
+
+ + ♻ ☆ Combining Primal and Dual Representations in Deep Restricted Kernel + Machines Classifiers + + +
+ In the context of deep learning with kernel machines, the deep Restricted +Kernel Machine (DRKM) framework allows multiple levels of kernel PCA (KPCA) and +Least-Squares Support Vector Machines (LSSVM) to be combined into a deep +architecture using visible and hidden units. We propose a new method for DRKM +classification coupling the objectives of KPCA and classification levels, with +the hidden feature matrix lying on the Stiefel manifold. The classification +level can be formulated as an LSSVM or as an MLP feature map, combining depth +in terms of levels and layers. The classification level is expressed in its +primal formulation, as the deep KPCA levels, in their dual formulation, can +embed the most informative components of the data in a much lower dimensional +space. The dual setting is independent of the dimension of the inputs and the +primal setting is parametric, which makes the proposed method computationally +efficient for both high-dimensional inputs and large datasets. In the +experiments, we show that our developed algorithm can effectively learn from +small datasets, while using less memory than the convolutional neural network +(CNN) with high-dimensional data. and that models with multiple KPCA levels can +outperform models with a single level. On the tested larger-scale datasets, +DRKM is more energy efficient than CNN while maintaining comparable +performance. + +
+
+
+
+
+ + ♻ ☆ Efficient Representation of Natural Image Patches + + +
+ In the complex domain of neural information processing, discerning +fundamental principles from ancillary details remains a significant challenge. +While there is extensive knowledge about the anatomy and physiology of the +early visual system, a comprehensive computational theory remains elusive. Can +we gain insights into the underlying principles of a biological system by +abstracting away from its detailed implementation and focusing on the +fundamental problems that the system is designed to solve? Utilizing an +abstract model based on minimal yet realistic assumptions, we show how to +achieve the early visual system's two ultimate objectives: efficient +information transmission and sensor probability distribution modeling. We show +that optimizing for information transmission does not yield optimal probability +distribution modeling. We illustrate, using a two-pixel (2D) system and image +patches, that an efficient representation can be realized via nonlinear +population code driven by two types of biologically plausible loss functions +that depend solely on output. After unsupervised learning, our abstract IPU +model bears remarkable resemblances to biological systems, despite not +mimicking many features of real neurons, such as spiking activity. A +preliminary comparison with a contemporary deep learning model suggests that +the IPU model offers a significant efficiency advantage. Our model provides +novel insights into the computational theory of early visual systems as well as +a potential new approach to enhance the efficiency of deep learning models. + +
+
+
+
+
+ + ♻ ☆ Incomplete Multi-View Weak-Label Learning with Noisy Features and + Imbalanced Labels + + +
+ A variety of modern applications exhibit multi-view multi-label learning, +where each sample has multi-view features, and multiple labels are correlated +via common views. Current methods usually fail to directly deal with the +setting where only a subset of features and labels are observed for each +sample, and ignore the presence of noisy views and imbalanced labels in +real-world problems. In this paper, we propose a novel method to overcome the +limitations. It jointly embeds incomplete views and weak labels into a +low-dimensional subspace with adaptive weights, and facilitates the difference +between embedding weight matrices via auto-weighted Hilbert-Schmidt +Independence Criterion (HSIC) to reduce the redundancy. Moreover, it adaptively +learns view-wise importance for embedding to detect noisy views, and mitigates +the label imbalance problem by focal loss. Experimental results on four +real-world multi-view multi-label datasets demonstrate the effectiveness of the +proposed method. + +
+
+ comment: 6 pages, 2 figures, conference +
+
+
+
+
+ + ♻ ☆ Multi-label Node Classification On Graph-Structured Data + + +
+ Graph Neural Networks (GNNs) have shown state-of-the-art improvements in node +classification tasks on graphs. While these improvements have been largely +demonstrated in a multi-class classification scenario, a more general and +realistic scenario in which each node could have multiple labels has so far +received little attention. The first challenge in conducting focused studies on +multi-label node classification is the limited number of publicly available +multi-label graph datasets. Therefore, as our first contribution, we collect +and release three real-world biological datasets and develop a multi-label +graph generator to generate datasets with tunable properties. While high label +similarity (high homophily) is usually attributed to the success of GNNs, we +argue that a multi-label scenario does not follow the usual semantics of +homophily and heterophily so far defined for a multi-class scenario. As our +second contribution, besides defining homophily for the multi-label scenario, +we develop a new approach that dynamically fuses the feature and label +correlation information to learn label-informed representations. Finally, we +perform a large-scale comparative study with $10$ methods and $9$ datasets +which also showcase the effectiveness of our approach. We release our benchmark +at \url{https://anonymous.4open.science/r/LFLF-5D8C/}. + +
+
+
+
+
+ + ♻ ☆ Improving Few-Shot Prompts with Relevant Static Analysis Products + + +
+ Large Language Models (LLM) are a new class of computation engines, +"programmed" via prompt engineering. We are still learning how to best +"program" these LLMs to help developers. We start with the intuition that +developers tend to consciously and unconsciously have a collection of semantics +facts in mind when working on coding tasks. Mostly these are shallow, simple +facts arising from a quick read. For a function, examples of facts might +include parameter and local variable names, return expressions, simple pre- and +post-conditions, and basic control and data flow, etc. + One might assume that the powerful multi-layer architecture of +transformer-style LLMs makes them inherently capable of doing this simple level +of "code analysis" and extracting such information, implicitly, while +processing code: but are they, really? If they aren't, could explicitly adding +this information help? Our goal here is to investigate this question, using the +code summarization task and evaluate whether automatically augmenting an LLM's +prompt with semantic facts explicitly, actually helps. + Prior work shows that LLM performance on code summarization benefits from +few-shot samples drawn either from the same-project or from examples found via +information retrieval methods (such as BM25). While summarization performance +has steadily increased since the early days, there is still room for +improvement: LLM performance on code summarization still lags its performance +on natural-language tasks like translation and text summarization. + We find that adding semantic facts actually does help! This approach improves +performance in several different settings suggested by prior work, including +for two different Large Language Models. In most cases, improvement nears or +exceeds 2 BLEU; for the PHP language in the challenging CodeSearchNet dataset, +this augmentation actually yields performance surpassing 30 BLEU. + +
+
+
+
+
+ + ♻ ☆ AdaTerm: Adaptive T-Distribution Estimated Robust Moments for + Noise-Robust Stochastic Gradient Optimization + + +
+ With the increasing practicality of deep learning applications, practitioners +are inevitably faced with datasets corrupted by noise from various sources such +as measurement errors, mislabeling, and estimated surrogate inputs/outputs that +can adversely impact the optimization results. It is a common practice to +improve the optimization algorithm's robustness to noise, since this algorithm +is ultimately in charge of updating the network parameters. Previous studies +revealed that the first-order moment used in Adam-like stochastic gradient +descent optimizers can be modified based on the Student's t-distribution. While +this modification led to noise-resistant updates, the other associated +statistics remained unchanged, resulting in inconsistencies in the assumed +models. In this paper, we propose AdaTerm, a novel approach that incorporates +the Student's t-distribution to derive not only the first-order moment but also +all the associated statistics. This provides a unified treatment of the +optimization process, offering a comprehensive framework under the statistical +model of the t-distribution for the first time. The proposed approach offers +several advantages over previously proposed approaches, including reduced +hyperparameters and improved robustness and adaptability. This noise-adaptive +behavior contributes to AdaTerm's exceptional learning performance, as +demonstrated through various optimization problems with different and/or +unknown noise ratios. Furthermore, we introduce a new technique for deriving a +theoretical regret bound without relying on AMSGrad, providing a valuable +contribution to the field + +
+
+ comment: 27 pages; Final version accepted by Elsevier Neurocomputing Journal + (2023-08; https://doi.org/10.1016/j.neucom.2023.126692) +
+
+
+
+
+ + ♻ ☆ Ballistocardiogram artifact removal in simultaneous EEG-fMRI using + generative adversarial network + + +
+ Due to its advantages of high temporal and spatial resolution, the technology +of simultaneous electroencephalogram-functional magnetic resonance imaging +(EEG-fMRI) acquisition and analysis has attracted much attention, and has been +widely used in various research fields of brain science. However, during the +fMRI of the brain, ballistocardiogram (BCG) artifacts can seriously contaminate +the EEG. As an unpaired problem, BCG artifact removal now remains a +considerable challenge. Aiming to provide a solution, this paper proposed a +novel modular generative adversarial network (GAN) and corresponding training +strategy to improve the network performance by optimizing the parameters of +each module. In this manner, we hope to improve the local representation +ability of the network model, thereby improving its overall performance and +obtaining a reliable generator for BCG artifact removal. Moreover, the proposed +method does not rely on additional reference signal or complex hardware +equipment. Experimental results show that, compared with multiple methods, the +technique presented in this paper can remove the BCG artifact more effectively +while retaining essential EEG information. + +
+
+
+
+
+ + ♻ ☆ EquiDiff: A Conditional Equivariant Diffusion Model For Trajectory + Prediction + + +
+ Accurate trajectory prediction is crucial for the safe and efficient +operation of autonomous vehicles. The growing popularity of deep learning has +led to the development of numerous methods for trajectory prediction. While +deterministic deep learning models have been widely used, deep generative +models have gained popularity as they learn data distributions from training +data and account for trajectory uncertainties. In this study, we propose +EquiDiff, a deep generative model for predicting future vehicle trajectories. +EquiDiff is based on the conditional diffusion model, which generates future +trajectories by incorporating historical information and random Gaussian noise. +The backbone model of EquiDiff is an SO(2)-equivariant transformer that fully +utilizes the geometric properties of location coordinates. In addition, we +employ Recurrent Neural Networks and Graph Attention Networks to extract social +interactions from historical trajectories. To evaluate the performance of +EquiDiff, we conduct extensive experiments on the NGSIM dataset. Our results +demonstrate that EquiDiff outperforms other baseline models in short-term +prediction, but has slightly higher errors for long-term prediction. +Furthermore, we conduct an ablation study to investigate the contribution of +each component of EquiDiff to the prediction accuracy. Additionally, we present +a visualization of the generation process of our diffusion model, providing +insights into the uncertainty of the prediction. + +
+
+
+
+
+ + ♻ ☆ A Conditional Denoising Diffusion Probabilistic Model for Radio + Interferometric Image Reconstruction ECAI 2023 + + +
+ In radio astronomy, signals from radio telescopes are transformed into images +of observed celestial objects, or sources. However, these images, called dirty +images, contain real sources as well as artifacts due to signal sparsity and +other factors. Therefore, radio interferometric image reconstruction is +performed on dirty images, aiming to produce clean images in which artifacts +are reduced and real sources are recovered. So far, existing methods have +limited success on recovering faint sources, preserving detailed structures, +and eliminating artifacts. In this paper, we present VIC-DDPM, a Visibility and +Image Conditioned Denoising Diffusion Probabilistic Model. Our main idea is to +use both the original visibility data in the spectral domain and dirty images +in the spatial domain to guide the image generation process with DDPM. This +way, we can leverage DDPM to generate fine details and eliminate noise, while +utilizing visibility data to separate signals from noise and retaining spatial +information in dirty images. We have conducted experiments in comparison with +both traditional methods and recent deep learning based approaches. Our results +show that our method significantly improves the resulting images by reducing +artifacts, preserving fine details, and recovering dim sources. This +advancement further facilitates radio astronomical data analysis tasks on +celestial phenomena. + +
+
+ comment: Accepted by ECAI 2023 +
+
+
+
+
+ + ♻ ☆ Risk-optimized Outlier Removal for Robust Point Cloud Classification + + +
+ With the growth of 3D sensing technology, deep learning system for 3D point +clouds has become increasingly important, especially in applications like +autonomous vehicles where safety is a primary concern. However, there are also +growing concerns about the reliability of these systems when they encounter +noisy point clouds, whether occurring naturally or introduced with malicious +intent. This paper highlights the challenges of point cloud classification +posed by various forms of noise, from simple background noise to malicious +backdoor attacks that can intentionally skew model predictions. While there's +an urgent need for optimized point cloud denoising, current point outlier +removal approaches, an essential step for denoising, rely heavily on +handcrafted strategies and are not adapted for higher-level tasks, such as +classification. To address this issue, we introduce an innovative point outlier +cleansing method that harnesses the power of downstream classification models. +By employing gradient-based attribution analysis, we define a novel concept: +point risk. Drawing inspiration from tail risk minimization in finance, we +recast the outlier removal process as an optimization problem, named PointCVaR. +Extensive experiments show that our proposed technique not only robustly +filters diverse point cloud outliers but also consistently and significantly +enhances existing robust methods for point cloud classification. + +
+
+
+
+
+ + ♻ ☆ On the Robustness of ChatGPT: An Adversarial and Out-of-distribution + Perspective ICLR 2023 + + +
+ ChatGPT is a recent chatbot service released by OpenAI and is receiving +increasing attention over the past few months. While evaluations of various +aspects of ChatGPT have been done, its robustness, i.e., the performance to +unexpected inputs, is still unclear to the public. Robustness is of particular +concern in responsible AI, especially for safety-critical applications. In this +paper, we conduct a thorough evaluation of the robustness of ChatGPT from the +adversarial and out-of-distribution (OOD) perspective. To do so, we employ the +AdvGLUE and ANLI benchmarks to assess adversarial robustness and the Flipkart +review and DDXPlus medical diagnosis datasets for OOD evaluation. We select +several popular foundation models as baselines. Results show that ChatGPT shows +consistent advantages on most adversarial and OOD classification and +translation tasks. However, the absolute performance is far from perfection, +which suggests that adversarial and OOD robustness remains a significant threat +to foundation models. Moreover, ChatGPT shows astounding performance in +understanding dialogue-related texts and we find that it tends to provide +informal suggestions for medical tasks instead of definitive answers. Finally, +we present in-depth discussions of possible research directions. + +
+
+ comment: Highlighted paper at ICLR 2023 workshop on Trustworthy and Reliable + Large-Scale Machine Learning Models; code is at: + https://github.com/microsoft/robustlearn; more works: + https://llm-eval.github.io/ +
+
+
+
+
+ + ♻ ☆ Provable Acceleration of Heavy Ball beyond Quadratics for a Class of + Polyak-Łojasiewicz Functions when the Non-Convexity is Averaged-Out ICML 2022 + + +
+ Heavy Ball (HB) nowadays is one of the most popular momentum methods in +non-convex optimization. It has been widely observed that incorporating the +Heavy Ball dynamic in gradient-based methods accelerates the training process +of modern machine learning models. However, the progress on establishing its +theoretical foundation of acceleration is apparently far behind its empirical +success. Existing provable acceleration results are of the quadratic or +close-to-quadratic functions, as the current techniques of showing HB's +acceleration are limited to the case when the Hessian is fixed. In this work, +we develop some new techniques that help show acceleration beyond quadratics, +which is achieved by analyzing how the change of the Hessian at two consecutive +time points affects the convergence speed. Based on our technical results, a +class of Polyak-\L{}ojasiewicz (PL) optimization problems for which provable +acceleration can be achieved via HB is identified. Moreover, our analysis +demonstrates a benefit of adaptively setting the momentum parameter. + (Update: 08/29/2023) Erratum is added in Appendix J. This is an updated +version that fixes an issue in the previous version. An additional condition +needs to be satisfied for the acceleration result of HB beyond quadratics in +this work, which naturally holds when the dimension is one or, more broadly, +when the Hessian is diagonal. We elaborate on the issue in Appendix J. + +
+
+ comment: (ICML 2022) Proceedings of the 39th International Conference on + Machine Learning; +
+
+
+
+
+ + ♻ ☆ Semi-supervised Vector-valued Learning: Improved Bounds and Algorithms + + +
+ Vector-valued learning, where the output space admits a vector-valued +structure, is an important problem that covers a broad family of important +domains, e.g. multi-task learning and transfer learning. Using local Rademacher +complexity and unlabeled data, we derive novel semi-supervised excess risk +bounds for general vector-valued learning from both kernel perspective and +linear perspective. The derived bounds are much sharper than existing ones and +the convergence rates are improved from the square root of labeled sample size +to the square root of total sample size or directly dependent on labeled sample +size. Motivated by our theoretical analysis, we propose a general +semi-supervised algorithm for efficiently learning vector-valued functions, +incorporating both local Rademacher complexity and Laplacian regularization. +Extensive experimental results illustrate the proposed algorithm significantly +outperforms the compared methods, which coincides with our theoretical +findings. + +
+
+ comment: Accepted at Pattern Recognition +
+
+
+
+
+ + ♻ ☆ All-in-SAM: from Weak Annotation to Pixel-wise Nuclei Segmentation with + Prompt-based Finetuning + + +
+ The Segment Anything Model (SAM) is a recently proposed prompt-based +segmentation model in a generic zero-shot segmentation approach. With the +zero-shot segmentation capacity, SAM achieved impressive flexibility and +precision on various segmentation tasks. However, the current pipeline requires +manual prompts during the inference stage, which is still resource intensive +for biomedical image segmentation. In this paper, instead of using prompts +during the inference stage, we introduce a pipeline that utilizes the SAM, +called all-in-SAM, through the entire AI development workflow (from annotation +generation to model finetuning) without requiring manual prompts during the +inference stage. Specifically, SAM is first employed to generate pixel-level +annotations from weak prompts (e.g., points, bounding box). Then, the +pixel-level annotations are used to finetune the SAM segmentation model rather +than training from scratch. Our experimental results reveal two key findings: +1) the proposed pipeline surpasses the state-of-the-art (SOTA) methods in a +nuclei segmentation task on the public Monuseg dataset, and 2) the utilization +of weak and few annotations for SAM finetuning achieves competitive performance +compared to using strong pixel-wise annotated data. + +
+
+
+
+
+ + ♻ ☆ The Wyner Variational Autoencoder for Unsupervised Multi-Layer Wireless + Fingerprinting + + +
+ Wireless fingerprinting refers to a device identification method leveraging +hardware imperfections and wireless channel variations as signatures. Beyond +physical layer characteristics, recent studies demonstrated that user behaviors +could be identified through network traffic, e.g., packet length, without +decryption of the payload. Inspired by these results, we propose a multi-layer +fingerprinting framework that jointly considers the multi-layer signatures for +improved identification performance. In contrast to previous works, by +leveraging the recent multi-view machine learning paradigm, i.e., data with +multiple forms, our method can cluster the device information shared among the +multi-layer features without supervision. Our information-theoretic approach +can be extended to supervised and semi-supervised settings with straightforward +derivations. In solving the formulated problem, we obtain a tight surrogate +bound using variational inference for efficient optimization. In extracting the +shared device information, we develop an algorithm based on the Wyner common +information method, enjoying reduced computation complexity as compared to +existing approaches. The algorithm can be applied to data distributions +belonging to the exponential family class. Empirically, we evaluate the +algorithm in a synthetic dataset with real-world video traffic and simulated +physical layer characteristics. Our empirical results show that the proposed +method outperforms the state-of-the-art baselines in both supervised and +unsupervised settings. + +
+
+
+
+
+ + ♻ ☆ Group Equality in Adaptive Submodular Maximization + + +
+ In this paper, we study the classic submodular maximization problem subject +to a group equality constraint under both non-adaptive and adaptive settings. +It has been shown that the utility function of many machine learning +applications, including data summarization, influence maximization in social +networks, and personalized recommendation, satisfies the property of +submodularity. Hence, maximizing a submodular function subject to various +constraints can be found at the heart of many of those applications. On a high +level, submodular maximization aims to select a group of most representative +items (e.g., data points). However, the design of most existing algorithms does +not incorporate the fairness constraint, leading to under- or +over-representation of some particular groups. This motivates us to study the +submodular maximization problem with group equality, where we aim to select a +group of items to maximize a (possibly non-monotone) submodular utility +function subject to a group equality constraint. To this end, we develop the +first constant-factor approximation algorithm for this problem. The design of +our algorithm is robust enough to be extended to solving the submodular +maximization problem under a more complicated adaptive setting. Moreover, we +further extend our study to incorporating a global cardinality constraint and +other fairness notations. + +
+
+ comment: This paper has been accepted by INFORMS Journal on Computing +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning for Generative AI: A Survey + + +
+ Deep Generative AI has been a long-standing essential topic in the machine +learning community, which can impact a number of application areas like text +generation and computer vision. The major paradigm to train a generative model +is maximum likelihood estimation, which pushes the learner to capture and +approximate the target data distribution by decreasing the divergence between +the model distribution and the target distribution. This formulation +successfully establishes the objective of generative tasks, while it is +incapable of satisfying all the requirements that a user might expect from a +generative model. Reinforcement learning, serving as a competitive option to +inject new training signals by creating new objectives that exploit novel +signals, has demonstrated its power and flexibility to incorporate human +inductive bias from multiple angles, such as adversarial learning, +hand-designed rules and learned reward model to build a performant model. +Thereby, reinforcement learning has become a trending research field and has +stretched the limits of generative AI in both model design and application. It +is reasonable to summarize and conclude advances in recent years with a +comprehensive review. Although there are surveys in different application areas +recently, this survey aims to shed light on a high-level review that spans a +range of application areas. We provide a rigorous taxonomy in this area and +make sufficient coverage on various models and applications. Notably, we also +surveyed the fast-developing large language model area. We conclude this survey +by showing the potential directions that might tackle the limit of current +models and expand the frontiers for generative AI. + +
+
+
+
+
+ + ♻ ☆ Symmetry-Preserving Program Representations for Learning Code Semantics + + +
+ Large Language Models (LLMs) have shown promise in automated program +reasoning, a crucial aspect of many security tasks. However, existing LLM +architectures for code are often borrowed from other domains like natural +language processing, raising concerns about their generalization and robustness +to unseen code. A key generalization challenge is to incorporate the knowledge +of code semantics, including control and data flow, into the LLM architectures. + Drawing inspiration from examples of convolution layers exploiting +translation symmetry, we explore how code symmetries can enhance LLM +architectures for program analysis and modeling. We present a rigorous +group-theoretic framework that formally defines code symmetries as +semantics-preserving transformations and provides techniques for precisely +reasoning about symmetry preservation within LLM architectures. Using this +framework, we introduce a novel variant of self-attention that preserves +program symmetries, demonstrating its effectiveness in generalization and +robustness through detailed experimental evaluations across different binary +and source code analysis tasks. Overall, our code symmetry framework offers +rigorous and powerful reasoning techniques that can guide the future +development of specialized LLMs for code and advance LLM-guided program +reasoning tasks. + +
+
+
+
+
+ + ♻ ☆ Benchmarking Robustness of AI-Enabled Multi-sensor Fusion Systems: + Challenges and Opportunities + + +
+ Multi-Sensor Fusion (MSF) based perception systems have been the foundation +in supporting many industrial applications and domains, such as self-driving +cars, robotic arms, and unmanned aerial vehicles. Over the past few years, the +fast progress in data-driven artificial intelligence (AI) has brought a +fast-increasing trend to empower MSF systems by deep learning techniques to +further improve performance, especially on intelligent systems and their +perception systems. Although quite a few AI-enabled MSF perception systems and +techniques have been proposed, up to the present, limited benchmarks that focus +on MSF perception are publicly available. Given that many intelligent systems +such as self-driving cars are operated in safety-critical contexts where +perception systems play an important role, there comes an urgent need for a +more in-depth understanding of the performance and reliability of these MSF +systems. To bridge this gap, we initiate an early step in this direction and +construct a public benchmark of AI-enabled MSF-based perception systems +including three commonly adopted tasks (i.e., object detection, object +tracking, and depth completion). Based on this, to comprehensively understand +MSF systems' robustness and reliability, we design 14 common and realistic +corruption patterns to synthesize large-scale corrupted datasets. We further +perform a systematic evaluation of these systems through our large-scale +evaluation. Our results reveal the vulnerability of the current AI-enabled MSF +perception systems, calling for researchers and practitioners to take +robustness and reliability into account when designing AI-enabled MSF. + +
+
+ comment: To appear in ESEC/FSE 2023 +
+
+
+
+
+ + ♻ ☆ Block-State Transformer + + +
+ State space models (SSMs) have shown impressive results on tasks that require +modeling long-range dependencies and efficiently scale to long sequences owing +to their subquadratic runtime complexity. Originally designed for continuous +signals, SSMs have shown superior performance on a plethora of tasks, in vision +and audio; however, SSMs still lag Transformer performance in Language Modeling +tasks. In this work, we propose a hybrid layer named Block-State Transformer +(BST), that internally combines an SSM sublayer for long-range +contextualization, and a Block Transformer sublayer for short-term +representation of sequences. We study three different, and completely +parallelizable, variants that integrate SSMs and block-wise attention. We show +that our model outperforms similar Transformer-based architectures on language +modeling perplexity and generalizes to longer sequences. In addition, the +Block-State Transformer demonstrates more than tenfold increase in speed at the +layer level compared to the Block-Recurrent Transformer when model +parallelization is employed. + +
+
+
+
+
+ + ♻ ☆ On Optimal Caching and Model Multiplexing for Large Model Inference + + +
+ Large Language Models (LLMs) and other large foundation models have achieved +noteworthy success, but their size exacerbates existing resource consumption +and latency challenges. In particular, the large-scale deployment of these +models is hindered by the significant resource requirements during inference. +In this paper, we study two approaches for mitigating these challenges: +employing a cache to store previous queries and learning a model multiplexer to +choose from an ensemble of models for query processing. + Theoretically, we provide an optimal algorithm for jointly optimizing both +approaches to reduce the inference cost in both offline and online tabular +settings. By combining a caching algorithm, namely Greedy Dual Size with +Frequency (GDSF) or Least Expected Cost (LEC), with a model multiplexer, we +achieve optimal rates in both offline and online settings. Empirically, +simulations show that the combination of our caching and model multiplexing +algorithms greatly improves over the baselines, with up to $50\times$ +improvement over the baseline when the ratio between the maximum cost and +minimum cost is $100$. Experiments on real datasets show a $4.3\times$ +improvement in FLOPs over the baseline when the ratio for FLOPs is $10$, and a +$1.8\times$ improvement in latency when the ratio for average latency is +$1.85$. + +
+
+
+
+
+ + ♻ ☆ Fix Fairness, Don't Ruin Accuracy: Performance Aware Fairness Repair + using AutoML + + +
+ Machine learning (ML) is increasingly being used in critical decision-making +software, but incidents have raised questions about the fairness of ML +predictions. To address this issue, new tools and methods are needed to +mitigate bias in ML-based software. Previous studies have proposed bias +mitigation algorithms that only work in specific situations and often result in +a loss of accuracy. Our proposed solution is a novel approach that utilizes +automated machine learning (AutoML) techniques to mitigate bias. Our approach +includes two key innovations: a novel optimization function and a +fairness-aware search space. By improving the default optimization function of +AutoML and incorporating fairness objectives, we are able to mitigate bias with +little to no loss of accuracy. Additionally, we propose a fairness-aware search +space pruning method for AutoML to reduce computational cost and repair time. +Our approach, built on the state-of-the-art Auto-Sklearn tool, is designed to +reduce bias in real-world scenarios. In order to demonstrate the effectiveness +of our approach, we evaluated our approach on four fairness problems and 16 +different ML models, and our results show a significant improvement over the +baseline and existing bias mitigation techniques. Our approach, Fair-AutoML, +successfully repaired 60 out of 64 buggy cases, while existing bias mitigation +techniques only repaired up to 44 out of 64 cases. + +
+
+ comment: In Proceedings of The 31st ACM Joint European Software Engineering + Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE + 2023) +
+
+
+
+
+ + ♻ ☆ Variational Inference for Deblending Crowded Starfields + + +
+ In images collected by astronomical surveys, stars and galaxies often overlap +visually. Deblending is the task of distinguishing and characterizing +individual light sources in survey images. We propose StarNet, a Bayesian +method to deblend sources in astronomical images of crowded star fields. +StarNet leverages recent advances in variational inference, including amortized +variational distributions and an optimization objective targeting an +expectation of the forward KL divergence. In our experiments with SDSS images +of the M2 globular cluster, StarNet is substantially more accurate than two +competing methods: Probabilistic Cataloging (PCAT), a method that uses MCMC for +inference, and DAOPHOT, a software pipeline employed by SDSS for deblending. In +addition, the amortized approach to inference gives StarNet the scaling +characteristics necessary to perform Bayesian inference on modern astronomical +surveys. + +
+
+
+
+
+ + ♻ ☆ On the Existence of the Adversarial Bayes Classifier (Extended Version) NeurIPS + + +
+ Adversarial robustness is a critical property in a variety of modern machine +learning applications. While it has been the subject of several recent +theoretical studies, many important questions related to adversarial robustness +are still open. In this work, we study a fundamental question regarding Bayes +optimality for adversarial robustness. We provide general sufficient conditions +under which the existence of a Bayes optimal classifier can be guaranteed for +adversarial robustness. Our results can provide a useful tool for a subsequent +study of surrogate losses in adversarial robustness and their consistency +properties. This manuscript is the extended and corrected version of the paper +\emph{On the Existence of the Adversarial Bayes Classifier} published in +NeurIPS 2021. There were two errors in theorem statements in the original paper +-- one in the definition of pseudo-certifiable robustness and the other in the +measurability of $A^\e$ for arbitrary metric spaces. In this version we correct +the errors. Furthermore, the results of the original paper did not apply to +some non-strictly convex norms and here we extend our results to all possible +norms. + +
+
+ comment: 27 pages, 3 figures. Version 2: Corrects 2 errors in the paper "On + the Existence of the Adversarial Bayes Classifier" published in NeurIPS. + Version 3: Update to acknowledgements +
+
+
+
+
+ + ♻ ☆ Human-Inspired Multi-Agent Navigation using Knowledge Distillation IROS + + +
+ Despite significant advancements in the field of multi-agent navigation, +agents still lack the sophistication and intelligence that humans exhibit in +multi-agent settings. In this paper, we propose a framework for learning a +human-like general collision avoidance policy for agent-agent interactions in +fully decentralized, multi-agent environments. Our approach uses knowledge +distillation with reinforcement learning to shape the reward function based on +expert policies extracted from human trajectory demonstrations through behavior +cloning. We show that agents trained with our approach can take human-like +trajectories in collision avoidance and goal-directed steering tasks not +provided by the demonstrations, outperforming the experts as well as +learning-based agents trained without knowledge distillation. + +
+
+ comment: IEEE/RSJ International Conference on Intelligent Robots and Systems + (IROS), 2021 +
+
+
+
+
+ + ♻ ☆ Preserving Privacy and Security in Federated Learning + + +
+ Federated learning is known to be vulnerable to both security and privacy +issues. Existing research has focused either on preventing poisoning attacks +from users or on concealing the local model updates from the server, but not +both. However, integrating these two lines of research remains a crucial +challenge since they often conflict with one another with respect to the threat +model. In this work, we develop a principle framework that offers both privacy +guarantees for users and detection against poisoning attacks from them. With a +new threat model that includes both an honest-but-curious server and malicious +users, we first propose a secure aggregation protocol using homomorphic +encryption for the server to combine local model updates in a private manner. +Then, a zero-knowledge proof protocol is leveraged to shift the task of +detecting attacks in the local models from the server to the users. The key +observation here is that the server no longer needs access to the local models +for attack detection. Therefore, our framework enables the central server to +identify poisoned model updates without violating the privacy guarantees of +secure aggregation. + +
+
+ comment: Published in IEEE/ACM Transactions on Networking +
+
+
+
+
+ + ♻ ☆ Compressive Fourier collocation methods for high-dimensional diffusion + equations with periodic boundary conditions + + +
+ High-dimensional Partial Differential Equations (PDEs) are a popular +mathematical modelling tool, with applications ranging from finance to +computational chemistry. However, standard numerical techniques for solving +these PDEs are typically affected by the curse of dimensionality. In this work, +we tackle this challenge while focusing on stationary diffusion equations +defined over a high-dimensional domain with periodic boundary conditions. +Inspired by recent progress in sparse function approximation in high +dimensions, we propose a new method called compressive Fourier collocation. +Combining ideas from compressive sensing and spectral collocation, our method +replaces the use of structured collocation grids with Monte Carlo sampling and +employs sparse recovery techniques, such as orthogonal matching pursuit and +$\ell^1$ minimization, to approximate the Fourier coefficients of the PDE +solution. We conduct a rigorous theoretical analysis showing that the +approximation error of the proposed method is comparable with the best $s$-term +approximation (with respect to the Fourier basis) to the solution. Using the +recently introduced framework of random sampling in bounded Riesz systems, our +analysis shows that the compressive Fourier collocation method mitigates the +curse of dimensionality with respect to the number of collocation points under +sufficient conditions on the regularity of the diffusion coefficient. We also +present numerical experiments that illustrate the accuracy and stability of the +method for the approximation of sparse and compressible solutions. + +
+
+ comment: 33 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Trustworthy Representation Learning Across Domains + + +
+ As AI systems have obtained significant performance to be deployed widely in +our daily live and human society, people both enjoy the benefits brought by +these technologies and suffer many social issues induced by these systems. To +make AI systems good enough and trustworthy, plenty of researches have been +done to build guidelines for trustworthy AI systems. Machine learning is one of +the most important parts for AI systems and representation learning is the +fundamental technology in machine learning. How to make the representation +learning trustworthy in real-world application, e.g., cross domain scenarios, +is very valuable and necessary for both machine learning and AI system fields. +Inspired by the concepts in trustworthy AI, we proposed the first trustworthy +representation learning across domains framework which includes four concepts, +i.e, robustness, privacy, fairness, and explainability, to give a comprehensive +literature review on this research direction. Specifically, we first introduce +the details of the proposed trustworthy framework for representation learning +across domains. Second, we provide basic notions and comprehensively summarize +existing methods for the trustworthy framework from four concepts. Finally, we +conclude this survey with insights and discussions on future research +directions. + +
+
+ comment: 38 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Eliciting Latent Predictions from Transformers with the Tuned Lens + + +
+ We analyze transformers from the perspective of iterative inference, seeking +to understand how model predictions are refined layer by layer. To do so, we +train an affine probe for each block in a frozen pretrained model, making it +possible to decode every hidden state into a distribution over the vocabulary. +Our method, the tuned lens, is a refinement of the earlier "logit lens" +technique, which yielded useful insights but is often brittle. + We test our method on various autoregressive language models with up to 20B +parameters, showing it to be more predictive, reliable and unbiased than the +logit lens. With causal experiments, we show the tuned lens uses similar +features to the model itself. We also find the trajectory of latent predictions +can be used to detect malicious inputs with high accuracy. All code needed to +reproduce our results can be found at +https://github.com/AlignmentResearch/tuned-lens. + +
+
+
+
+
+ + ♻ ☆ Contrastive Credibility Propagation for Reliable Semi-Supervised + Learning + + +
+ Producing labels for unlabeled data is error-prone, making semi-supervised +learning (SSL) troublesome. Often, little is known about when and why an +algorithm fails to outperform a supervised baseline. Using benchmark datasets, +we craft five common real-world SSL data scenarios: few-label, open-set, +noisy-label, and class distribution imbalance/misalignment in the labeled and +unlabeled sets. We propose a novel algorithm called Contrastive Credibility +Propagation (CCP) for deep SSL via iterative transductive pseudo-label +refinement. CCP unifies semi-supervised learning and noisy label learning for +the goal of reliably outperforming a supervised baseline in any data scenario. +Compared to prior methods which focus on a subset of scenarios, CCP uniquely +outperforms the supervised baseline in all scenarios, supporting practitioners +when the qualities of labeled or unlabeled data are unknown. + +
+
+
+
+
+ + ♻ ☆ Regression with Label Differential Privacy ICLR '23 + + +
+ We study the task of training regression models with the guarantee of label +differential privacy (DP). Based on a global prior distribution on label +values, which could be obtained privately, we derive a label DP randomization +mechanism that is optimal under a given regression loss function. We prove that +the optimal mechanism takes the form of a "randomized response on bins", and +propose an efficient algorithm for finding the optimal bin values. We carry out +a thorough experimental evaluation on several datasets demonstrating the +efficacy of our algorithm. + +
+
+ comment: Appeared at ICLR '23, 28 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ On Low-rank Trace Regression under General Sampling Distribution + + +
+ In this paper, we study the trace regression when a matrix of parameters B* +is estimated via the convex relaxation of a rank-regularized regression or via +regularized non-convex optimization. It is known that these estimators satisfy +near-optimal error bounds under assumptions on the rank, coherence, and +spikiness of B*. We start by introducing a general notion of spikiness for B* +that provides a generic recipe to prove the restricted strong convexity of the +sampling operator of the trace regression and obtain near-optimal and +non-asymptotic error bounds for the estimation error. Similar to the existing +literature, these results require the regularization parameter to be above a +certain theory-inspired threshold that depends on observation noise that may be +unknown in practice. Next, we extend the error bounds to cases where the +regularization parameter is chosen via cross-validation. This result is +significant in that existing theoretical results on cross-validated estimators +(Kale et al., 2011; Kumar et al., 2013; Abou-Moustafa and Szepesvari, 2017) do +not apply to our setting since the estimators we study are not known to satisfy +their required notion of stability. Finally, using simulations on synthetic and +real data, we show that the cross-validated estimator selects a near-optimal +penalty parameter and outperforms the theory-inspired approach of selecting the +parameter. + +
+
+ comment: 49 pages, 6 figure2 +
+
+
+
+
+ + ♻ ☆ GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content SP + + +
+ The mobile cloud gaming industry has been rapidly growing over the last +decade. When streaming gaming videos are transmitted to customers' client +devices from cloud servers, algorithms that can monitor distorted video quality +without having any reference video available are desirable tools. However, +creating No-Reference Video Quality Assessment (NR VQA) models that can +accurately predict the quality of streaming gaming videos rendered by computer +graphics engines is a challenging problem, since gaming content generally +differs statistically from naturalistic videos, often lacks detail, and +contains many smooth regions. Until recently, the problem has been further +complicated by the lack of adequate subjective quality databases of mobile +gaming content. We have created a new gaming-specific NR VQA model called the +Gaming Video Quality Evaluator (GAMIVAL), which combines and leverages the +advantages of spatial and temporal gaming distorted scene statistics models, a +neural noise model, and deep semantic features. Using a support vector +regression (SVR) as a regressor, GAMIVAL achieves superior performance on the +new LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database. + +
+
+ comment: Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been + made available online: https://github.com/lskdream/GAMIVAL +
+
+
+
+
+ + ♻ ☆ Soft Prompt Tuning for Augmenting Dense Retrieval with Large Language + Models + + +
+ Dense retrieval (DR) converts queries and documents into dense embeddings and +measures the similarity between queries and documents in vector space. One of +the challenges in DR is the lack of domain-specific training data. While DR +models can learn from large-scale public datasets like MS MARCO through +transfer learning, evidence shows that not all DR models and domains can +benefit from transfer learning equally. Recently, some researchers have +resorted to large language models (LLMs) to improve the zero-shot and few-shot +DR models. However, the hard prompts or human-written prompts utilized in these +works cannot guarantee the good quality of generated weak queries. To tackle +this, we propose soft prompt tuning for augmenting DR (SPTAR): For each task, +we leverage soft prompt-tuning to optimize a task-specific soft prompt on +limited ground truth data and then prompt the LLMs to tag unlabeled documents +with weak queries, yielding enough weak document-query pairs to train +task-specific dense retrievers. We design a filter to select high-quality +example document-query pairs in the prompt to further improve the quality of +weak tagged queries. To the best of our knowledge, there is no prior work +utilizing soft prompt tuning to augment DR models. The experiments demonstrate +that SPTAR outperforms the unsupervised baselines BM25 and the recently +proposed LLMs-based augmentation method for DR. + +
+
+ comment: fix typos +
+
+
+
+
+ + ♻ ☆ A Transformer-based Framework For Multi-variate Time Series: A Remaining + Useful Life Prediction Use Case + + +
+ In recent times, Large Language Models (LLMs) have captured a global +spotlight and revolutionized the field of Natural Language Processing. One of +the factors attributed to the effectiveness of LLMs is the model architecture +used for training, transformers. Transformer models excel at capturing +contextual features in sequential data since time series data are sequential, +transformer models can be leveraged for more efficient time series data +prediction. The field of prognostics is vital to system health management and +proper maintenance planning. A reliable estimation of the remaining useful life +(RUL) of machines holds the potential for substantial cost savings. This +includes avoiding abrupt machine failures, maximizing equipment usage, and +serving as a decision support system (DSS). This work proposed an +encoder-transformer architecture-based framework for multivariate time series +prediction for a prognostics use case. We validated the effectiveness of the +proposed framework on all four sets of the C-MAPPS benchmark dataset for the +remaining useful life prediction task. To effectively transfer the knowledge +and application of transformers from the natural language domain to time +series, three model-specific experiments were conducted. Also, to enable the +model awareness of the initial stages of the machine life and its degradation +path, a novel expanding window method was proposed for the first time in this +work, it was compared with the sliding window method, and it led to a large +improvement in the performance of the encoder transformer model. Finally, the +performance of the proposed encoder-transformer model was evaluated on the test +dataset and compared with the results from 13 other state-of-the-art (SOTA) +models in the literature and it outperformed them all with an average +performance increase of 137.65% over the next best model across all the +datasets. + +
+
+
+
+
+ + ♻ ☆ Diversifying AI: Towards Creative Chess with AlphaZero + + +
+ In recent years, Artificial Intelligence (AI) systems have surpassed human +intelligence in a variety of computational tasks. However, AI systems, like +humans, make mistakes, have blind spots, hallucinate, and struggle to +generalize to new situations. This work explores whether AI can benefit from +creative decision-making mechanisms when pushed to the limits of its +computational rationality. In particular, we investigate whether a team of +diverse AI systems can outperform a single AI in challenging tasks by +generating more ideas as a group and then selecting the best ones. We study +this question in the game of chess, the so-called drosophila of AI. We build on +AlphaZero (AZ) and extend it to represent a league of agents via a +latent-conditioned architecture, which we call AZ_db. We train AZ_db to +generate a wider range of ideas using behavioral diversity techniques and +select the most promising ones with sub-additive planning. Our experiments +suggest that AZ_db plays chess in diverse ways, solves more puzzles as a group +and outperforms a more homogeneous team. Notably, AZ_db solves twice as many +challenging puzzles as AZ, including the challenging Penrose positions. When +playing chess from different openings, we notice that players in AZ_db +specialize in different openings, and that selecting a player for each opening +using sub-additive planning results in a 50 Elo improvement over AZ. Our +findings suggest that diversity bonuses emerge in teams of AI agents, just as +they do in teams of humans and that diversity is a valuable asset in solving +computationally hard problems. + +
+
+
+
+
+ + ♻ ☆ RecXplainer: Amortized Attribute-based Personalized Explanations for + Recommender Systems NeurIPS 2022 + + +
+ Recommender systems influence many of our interactions in the digital world +-- impacting how we shop for clothes, sorting what we see when browsing YouTube +or TikTok, and determining which restaurants and hotels we are shown when using +hospitality platforms. Modern recommender systems are large, opaque models +trained on a mixture of proprietary and open-source datasets. Naturally, issues +of trust arise on both the developer and user side: is the system working +correctly, and why did a user receive (or not receive) a particular +recommendation? Providing an explanation alongside a recommendation alleviates +some of these concerns. The status quo for auxiliary recommender system +feedback is either user-specific explanations (e.g., "users who bought item B +also bought item A") or item-specific explanations (e.g., "we are recommending +item A because you watched/bought item B"). However, users bring personalized +context into their search experience, valuing an item as a function of that +item's attributes and their own personal preferences. In this work, we propose +RecXplainer, a novel method for generating fine-grained explanations based on a +user's preferences over the attributes of recommended items. We evaluate +RecXplainer on five real-world and large-scale recommendation datasets using +five different kinds of recommender systems to demonstrate the efficacy of +RecXplainer in capturing users' preferences over item attributes and using them +to explain recommendations. We also compare RecXplainer to five baselines and +show RecXplainer's exceptional performance on ten metrics. + +
+
+ comment: Awarded the Best Student Paper at TEA Workshop at NeurIPS 2022 +
+
+
+
+
+ + ♻ ☆ Variationally Mimetic Operator Networks + + +
+ In recent years operator networks have emerged as promising deep learning +tools for approximating the solution to partial differential equations (PDEs). +These networks map input functions that describe material properties, forcing +functions and boundary data to the solution of a PDE. This work describes a new +architecture for operator networks that mimics the form of the numerical +solution obtained from an approximate variational or weak formulation of the +problem. The application of these ideas to a generic elliptic PDE leads to a +variationally mimetic operator network (VarMiON). Like the conventional Deep +Operator Network (DeepONet) the VarMiON is also composed of a sub-network that +constructs the basis functions for the output and another that constructs the +coefficients for these basis functions. However, in contrast to the DeepONet, +the architecture of these sub-networks in the VarMiON is precisely determined. +An analysis of the error in the VarMiON solution reveals that it contains +contributions from the error in the training data, the training error, the +quadrature error in sampling input and output functions, and a "covering error" +that measures the distance between the test input functions and the nearest +functions in the training dataset. It also depends on the stability constants +for the exact solution operator and its VarMiON approximation. The application +of the VarMiON to a canonical elliptic PDE and a nonlinear PDE reveals that for +approximately the same number of network parameters, on average the VarMiON +incurs smaller errors than a standard DeepONet and a recently proposed +multiple-input operator network (MIONet). Further, its performance is more +robust to variations in input functions, the techniques used to sample the +input and output functions, the techniques used to construct the basis +functions, and the number of input functions. + +
+
+ comment: 49 pages, 18 figures, 1 Appendix +
+
+
+
+
+ + ♻ ☆ EntropyRank: Unsupervised Keyphrase Extraction via Side-Information + Optimization for Language Model-based Text Compression + + +
+ We propose an unsupervised method to extract keywords and keyphrases from +texts based on a pre-trained language model (LM) and Shannon's information +maximization. Specifically, our method extracts phrases having the highest +conditional entropy under the LM. The resulting set of keyphrases turns out to +solve a relevant information-theoretic problem: if provided as side +information, it leads to the expected minimal binary code length in compressing +the text using the LM and an entropy encoder. Alternately, the resulting set is +an approximation via a causal LM to the set of phrases that minimize the +entropy of the text when conditioned upon it. Empirically, the method provides +results comparable to the most commonly used methods in various keyphrase +extraction benchmark challenges. + +
+
+
+
+
+ + ♻ ☆ The Future of Fundamental Science Led by Generative Closed-Loop + Artificial Intelligence + + +
+ Recent advances in machine learning and AI, including Generative AI and LLMs, +are disrupting technological innovation, product development, and society as a +whole. AI's contribution to technology can come from multiple approaches that +require access to large training data sets and clear performance evaluation +criteria, ranging from pattern recognition and classification to generative +models. Yet, AI has contributed less to fundamental science in part because +large data sets of high-quality data for scientific practice and model +discovery are more difficult to access. Generative AI, in general, and Large +Language Models in particular, may represent an opportunity to augment and +accelerate the scientific discovery of fundamental deep science with +quantitative models. Here we explore and investigate aspects of an AI-driven, +automated, closed-loop approach to scientific discovery, including self-driven +hypothesis generation and open-ended autonomous exploration of the hypothesis +space. Integrating AI-driven automation into the practice of science would +mitigate current problems, including the replication of findings, systematic +production of data, and ultimately democratisation of the scientific process. +Realising these possibilities requires a vision for augmented AI coupled with a +diversity of AI approaches able to deal with fundamental aspects of causality +analysis and model discovery while enabling unbiased search across the space of +putative explanations. These advances hold the promise to unleash AI's +potential for searching and discovering the fundamental structure of our world +beyond what human scientists have been able to achieve. Such a vision would +push the boundaries of new fundamental science rather than automatize current +workflows and instead open doors for technological innovation to tackle some of +the greatest challenges facing humanity today. + +
+
+ comment: 35 pages, first draft of the final report from the Alan Turing + Institute on AI for Scientific Discovery +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ On the Steganographic Capacity of Selected Learning Models + + +
+ Machine learning and deep learning models are potential vectors for various +attack scenarios. For example, previous research has shown that malware can be +hidden in deep learning models. Hiding information in a learning model can be +viewed as a form of steganography. In this research, we consider the general +question of the steganographic capacity of learning models. Specifically, for a +wide range of models, we determine the number of low-order bits of the trained +parameters that can be overwritten, without adversely affecting model +performance. For each model considered, we graph the accuracy as a function of +the number of low-order bits that have been overwritten, and for selected +models, we also analyze the steganographic capacity of individual layers. The +models that we test include the classic machine learning techniques of Linear +Regression (LR) and Support Vector Machine (SVM); the popular general deep +learning models of Multilayer Perceptron (MLP) and Convolutional Neural Network +(CNN); the highly-successful Recurrent Neural Network (RNN) architecture of +Long Short-Term Memory (LSTM); the pre-trained transfer learning-based models +VGG16, DenseNet121, InceptionV3, and Xception; and, finally, an Auxiliary +Classifier Generative Adversarial Network (ACGAN). In all cases, we find that a +majority of the bits of each trained parameter can be overwritten before the +accuracy degrades. Of the models tested, the steganographic capacity ranges +from 7.04 KB for our LR experiments, to 44.74 MB for InceptionV3. We discuss +the implications of our results and consider possible avenues for further +research. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2306.17189 +
+
+
+
+
+ + ♻ ☆ GAMIVAL: Video Quality Prediction on Mobile Cloud Gaming Content SP + + +
+ The mobile cloud gaming industry has been rapidly growing over the last +decade. When streaming gaming videos are transmitted to customers' client +devices from cloud servers, algorithms that can monitor distorted video quality +without having any reference video available are desirable tools. However, +creating No-Reference Video Quality Assessment (NR VQA) models that can +accurately predict the quality of streaming gaming videos rendered by computer +graphics engines is a challenging problem, since gaming content generally +differs statistically from naturalistic videos, often lacks detail, and +contains many smooth regions. Until recently, the problem has been further +complicated by the lack of adequate subjective quality databases of mobile +gaming content. We have created a new gaming-specific NR VQA model called the +Gaming Video Quality Evaluator (GAMIVAL), which combines and leverages the +advantages of spatial and temporal gaming distorted scene statistics models, a +neural noise model, and deep semantic features. Using a support vector +regression (SVR) as a regressor, GAMIVAL achieves superior performance on the +new LIVE-Meta Mobile Cloud Gaming (LIVE-Meta MCG) video quality database. + +
+
+ comment: Accepted to IEEE SPL 2023. The implementation of GAMIVAL has been + made available online: https://github.com/lskdream/GAMIVAL +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 59 + +
+
+
+ + ☆ Fine-Tuning Llama 2 Large Language Models for Detecting Online Sexual + Predatory Chats and Abusive Texts + + +
+ Detecting online sexual predatory behaviours and abusive language on social +media platforms has become a critical area of research due to the growing +concerns about online safety, especially for vulnerable populations such as +children and adolescents. Researchers have been exploring various techniques +and approaches to develop effective detection systems that can identify and +mitigate these risks. Recent development of large language models (LLMs) has +opened a new opportunity to address this problem more effectively. This paper +proposes an approach to detection of online sexual predatory chats and abusive +language using the open-source pretrained Llama 2 7B-parameter model, recently +released by Meta GenAI. We fine-tune the LLM using datasets with different +sizes, imbalance degrees, and languages (i.e., English, Roman Urdu and Urdu). +Based on the power of LLMs, our approach is generic and automated without a +manual search for a synergy between feature extraction and classifier design +steps like conventional methods in this domain. Experimental results show a +strong performance of the proposed approach, which performs proficiently and +consistently across three distinct datasets with five sets of experiments. This +study's outcomes indicate that the proposed method can be implemented in +real-world applications (even with non-English languages) for flagging sexual +predators, offensive or toxic content, hate speech, and discriminatory language +in online discussions and comments to maintain respectful internet or digital +communities. Furthermore, it can be employed for solving text classification +problems with other potential applications such as sentiment analysis, spam and +phishing detection, sorting legal documents, fake news detection, language +identification, user intent recognition, text-based product categorization, +medical record analysis, and resume screening. + +
+
+
+
+
+ + ☆ ANER: Arabic and Arabizi Named Entity Recognition using + Transformer-Based Approach + + +
+ One of the main tasks of Natural Language Processing (NLP), is Named Entity +Recognition (NER). It is used in many applications and also can be used as an +intermediate step for other tasks. We present ANER, a web-based named entity +recognizer for the Arabic, and Arabizi languages. The model is built upon BERT, +which is a transformer-based encoder. It can recognize 50 different entity +classes, covering various fields. We trained our model on the WikiFANE\_Gold +dataset which consists of Wikipedia articles. We achieved an F1 score of +88.7\%, which beats CAMeL Tools' F1 score of 83\% on the ANERcorp dataset, +which has only 4 classes. We also got an F1 score of 77.7\% on the +NewsFANE\_Gold dataset which contains out-of-domain data from News articles. +The system is deployed on a user-friendly web interface that accepts users' +inputs in Arabic, or Arabizi. It allows users to explore the entities in the +text by highlighting them. It can also direct users to get information about +entities through Wikipedia directly. We added the ability to do NER using our +model, or CAMeL Tools' model through our website. ANER is publicly accessible +at \url{http://www.aner.online}. We also deployed our model on HuggingFace at +https://huggingface.co/boda/ANER, to allow developers to test and use it. + +
+
+
+
+
+ + ☆ Joint Multiple Intent Detection and Slot Filling with Supervised + Contrastive Learning and Self-Distillation ECAI 2023 + + +
+ Multiple intent detection and slot filling are two fundamental and crucial +tasks in spoken language understanding. Motivated by the fact that the two +tasks are closely related, joint models that can detect intents and extract +slots simultaneously are preferred to individual models that perform each task +independently. The accuracy of a joint model depends heavily on the ability of +the model to transfer information between the two tasks so that the result of +one task can correct the result of the other. In addition, since a joint model +has multiple outputs, how to train the model effectively is also challenging. +In this paper, we present a method for multiple intent detection and slot +filling by addressing these challenges. First, we propose a bidirectional joint +model that explicitly employs intent information to recognize slots and slot +features to detect intents. Second, we introduce a novel method for training +the proposed joint model using supervised contrastive learning and +self-distillation. Experimental results on two benchmark datasets MixATIS and +MixSNIPS show that our method outperforms state-of-the-art models in both +tasks. The results also demonstrate the contributions of both bidirectional +design and the training method to the accuracy improvement. Our source code is +available at https://github.com/anhtunguyen98/BiSLU + +
+
+ comment: Accepted at ECAI 2023 +
+
+
+
+
+ + ☆ Challenges of GPT-3-based Conversational Agents for Healthca + + +
+ The potential to provide patients with faster information access while +allowing medical specialists to concentrate on critical tasks makes medical +domain dialog agents appealing. However, the integration of large-language +models (LLMs) into these agents presents certain limitations that may result in +serious consequences. This paper investigates the challenges and risks of using +GPT-3-based models for medical question-answering (MedQA). We perform several +evaluations contextualized in terms of standard medical principles. We provide +a procedure for manually designing patient queries to stress-test high-risk +limitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to +respond adequately to these queries, generating erroneous medical information, +unsafe recommendations, and content that may be considered offensive. + +
+
+ comment: 12 pages, 9 Tables, accepted to RANLP 2023 +
+
+
+
+
+ + ☆ Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance IJCAI-2023 + + +
+ We propose the use of conversational GPT models for easy and quick few-shot +text classification in the financial domain using the Banking77 dataset. Our +approach involves in-context learning with GPT-3.5 and GPT-4, which minimizes +the technical expertise required and eliminates the need for expensive GPU +computing while yielding quick and accurate results. Additionally, we fine-tune +other pre-trained, masked language models with SetFit, a recent contrastive +learning technique, to achieve state-of-the-art results both in full-data and +few-shot settings. Our findings show that querying GPT-3.5 and GPT-4 can +outperform fine-tuned, non-generative models even with fewer examples. However, +subscription fees associated with these solutions may be considered costly for +small organizations. Lastly, we find that generative models perform better on +the given task when shown representative samples selected by a human expert +rather than when shown random ones. We conclude that a) our proposed methods +offer a practical solution for few-shot tasks in datasets with limited label +availability, and b) our state-of-the-art results can inspire future work in +the area. + +
+
+ comment: Early pre-print; Accepted at the 5th FinNLP workshop @ IJCAI-2023 +
+
+
+
+
+ + ☆ AI in the Gray: Exploring Moderation Policies in Dialogic Large Language + Models vs. Human Answers in Controversial Topics + + +
+ The introduction of ChatGPT and the subsequent improvement of Large Language +Models (LLMs) have prompted more and more individuals to turn to the use of +ChatBots, both for information and assistance with decision-making. However, +the information the user is after is often not formulated by these ChatBots +objectively enough to be provided with a definite, globally accepted answer. + Controversial topics, such as "religion", "gender identity", "freedom of +speech", and "equality", among others, can be a source of conflict as partisan +or biased answers can reinforce preconceived notions or promote disinformation. +By exposing ChatGPT to such debatable questions, we aim to understand its level +of awareness and if existing models are subject to socio-political and/or +economic biases. We also aim to explore how AI-generated answers compare to +human ones. For exploring this, we use a dataset of a social media platform +created for the purpose of debating human-generated claims on polemic subjects +among users, dubbed Kialo. + Our results show that while previous versions of ChatGPT have had important +issues with controversial topics, more recent versions of ChatGPT +(gpt-3.5-turbo) are no longer manifesting significant explicit biases in +several knowledge areas. In particular, it is well-moderated regarding economic +aspects. However, it still maintains degrees of implicit libertarian leaning +toward right-winged ideals which suggest the need for increased moderation from +the socio-political point of view. In terms of domain knowledge on +controversial topics, with the exception of the "Philosophical" category, +ChatGPT is performing well in keeping up with the collective human level of +knowledge. Finally, we see that sources of Bing AI have slightly more tendency +to the center when compared to human answers. All the analyses we make are +generalizable to other types of biases and domains. + +
+
+
+
+
+ + ☆ Spoken Language Intelligence of Large Language Models for Language + Learning + + +
+ People have long hoped for a conversational system that can assist in +real-life situations, and recent progress on large language models (LLMs) is +bringing this idea closer to reality. While LLMs are often impressive in +performance, their efficacy in real-world scenarios that demand expert +knowledge remains unclear. LLMs are believed to hold the most potential and +value in education, especially in the development of Artificial intelligence +(AI) based virtual teachers capable of facilitating language learning. Our +focus is centered on evaluating the efficacy of LLMs in the realm of education, +specifically in the areas of spoken language learning which encompass +phonetics, phonology, and second language acquisition. We introduce a new +multiple-choice question dataset to evaluate the effectiveness of LLMs in the +aforementioned scenarios, including understanding and application of spoken +language knowledge. In addition, we investigate the influence of various +prompting techniques such as zero- and few-shot method (prepending the question +with question-answer exemplars), chain-of-thought (CoT, think step-by-step), +in-domain exampler and external tools (Google, Wikipedia). We conducted +large-scale evaluation on popular LLMs (20 distinct models) using these +methods. We achieved significant performance improvements compared to the +zero-shot baseline in the practical questions reasoning (GPT-3.5, 49.1% -> +63.1%; LLaMA2-70B-Chat, 42.2% -> 48.6%). We found that models of different +sizes have good understanding of concepts in phonetics, phonology, and second +language acquisition, but show limitations in reasoning for real-world +problems. Additionally, we also explore preliminary findings on conversational +communication. + +
+
+ comment: 28 pages, 7 figures, Preprint +
+
+
+
+
+ + ☆ A Multi-Task Semantic Decomposition Framework with Task-specific + Pre-training for Few-Shot NER CIKM 2023 + + +
+ The objective of few-shot named entity recognition is to identify named +entities with limited labeled instances. Previous works have primarily focused +on optimizing the traditional token-wise classification framework, while +neglecting the exploration of information based on NER data characteristics. To +address this issue, we propose a Multi-Task Semantic Decomposition Framework +via Joint Task-specific Pre-training (MSDP) for few-shot NER. Drawing +inspiration from demonstration-based and contrastive learning, we introduce two +novel pre-training tasks: Demonstration-based Masked Language Modeling (MLM) +and Class Contrastive Discrimination. These tasks effectively incorporate +entity boundary information and enhance entity representation in Pre-trained +Language Models (PLMs). In the downstream main task, we introduce a multi-task +joint optimization framework with the semantic decomposing method, which +facilitates the model to integrate two different semantic information for +entity classification. Experimental results of two few-shot NER benchmarks +demonstrate that MSDP consistently outperforms strong baselines by a large +margin. Extensive analyses validate the effectiveness and generalization of +MSDP. + +
+
+ comment: Accepted by CIKM 2023 (Oral Presentation) +
+
+
+
+
+ + ☆ LongBench: A Bilingual, Multitask Benchmark for Long Context + Understanding + + +
+ Although large language models (LLMs) demonstrate impressive performance for +many language tasks, most of them can only handle texts a few thousand tokens +long, limiting their applications on longer sequence inputs, such as books, +reports, and codebases. Recent works have proposed methods to improve LLMs' +long context capabilities by extending context windows and more sophisticated +memory mechanisms. However, comprehensive benchmarks tailored for evaluating +long context understanding are lacking. In this paper, we introduce LongBench, +the first bilingual, multi-task benchmark for long context understanding, +enabling a more rigorous evaluation of long context understanding. LongBench +comprises 21 datasets across 6 task categories in both English and Chinese, +with an average length of 6,711 words (English) and 13,386 characters +(Chinese). These tasks cover key long-text application areas including +single-doc QA, multi-doc QA, summarization, few-shot learning, synthetic tasks, +and code completion. All datasets in LongBench are standardized into a unified +format, allowing for effortless automatic evaluation of LLMs. Upon +comprehensive evaluation of 8 LLMs on LongBench, we find that: (1) Commercial +model (GPT-3.5-Turbo-16k) outperforms other open-sourced models, but still +struggles on longer contexts. (2) Scaled position embedding and fine-tuning on +longer sequences lead to substantial improvement on long context understanding. +(3) Context compression technique such as retrieval brings improvement for +model with weak ability on long contexts, but the performance still lags behind +models that have strong long context understanding capability. The code and +datasets are available at https://github.com/THUDM/LongBench. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Multimodal Detection of Social Spambots in Twitter using Transformers + + +
+ Although not all bots are malicious, the vast majority of them are +responsible for spreading misinformation and manipulating the public opinion +about several issues, i.e., elections and many more. Therefore, the early +detection of social spambots is crucial. Although there have been proposed +methods for detecting bots in social media, there are still substantial +limitations. For instance, existing research initiatives still extract a large +number of features and train traditional machine learning algorithms or use +GloVe embeddings and train LSTMs. However, feature extraction is a tedious +procedure demanding domain expertise. Also, language models based on +transformers have been proved to be better than LSTMs. Other approaches create +large graphs and train graph neural networks requiring in this way many hours +for training and access to computational resources. To tackle these +limitations, this is the first study employing only the user description field +and images of three channels denoting the type and content of tweets posted by +the users. Firstly, we create digital DNA sequences, transform them to 3d +images, and apply pretrained models of the vision domain, including +EfficientNet, AlexNet, VGG16, etc. Next, we propose a multimodal approach, +where we use TwHIN-BERT for getting the textual representation of the user +description field and employ VGG16 for acquiring the visual representation for +the image modality. We propose three different fusion methods, namely +concatenation, gated multimodal unit, and crossmodal attention, for fusing the +different modalities and compare their performances. Extensive experiments +conducted on the Cresci '17 dataset demonstrate valuable advantages of our +introduced approaches over state-of-the-art ones reaching Accuracy up to +99.98%. + +
+
+
+
+
+ + ☆ An Empirical Study of Consistency Regularization for End-to-End + Speech-to-Text Translation + + +
+ Consistency regularization methods, such as R-Drop (Liang et al., 2021) and +CrossConST (Gao et al., 2023), have achieved impressive supervised and +zero-shot performance in the neural machine translation (NMT) field. Can we +also boost end-to-end (E2E) speech-to-text translation (ST) by leveraging +consistency regularization? In this paper, we conduct empirical studies on +intra-modal and cross-modal consistency and propose two training strategies, +SimRegCR and SimZeroCR, for E2E ST in regular and zero-shot scenarios. +Experiments on the MuST-C benchmark show that our approaches achieve +state-of-the-art (SOTA) performance in most translation directions. The +analyses prove that regularization brought by the intra-modal consistency, +instead of modality gap, is crucial for the regular E2E ST, and the cross-modal +consistency could close the modality gap and boost the zero-shot E2E ST +performance. + +
+
+
+
+
+ + ☆ Bridging the KB-Text Gap: Leveraging Structured Knowledge-aware + Pre-training for KBQA CIKM 2023 + + +
+ Knowledge Base Question Answering (KBQA) aims to answer natural language +questions with factual information such as entities and relations in KBs. +However, traditional Pre-trained Language Models (PLMs) are directly +pre-trained on large-scale natural language corpus, which poses challenges for +them in understanding and representing complex subgraphs in structured KBs. To +bridge the gap between texts and structured KBs, we propose a Structured +Knowledge-aware Pre-training method (SKP). In the pre-training stage, we +introduce two novel structured knowledge-aware tasks, guiding the model to +effectively learn the implicit relationship and better representations of +complex subgraphs. In downstream KBQA task, we further design an efficient +linearization strategy and an interval attention mechanism, which assist the +model to better encode complex subgraphs and shield the interference of +irrelevant subgraphs during reasoning respectively. Detailed experiments and +analyses on WebQSP verify the effectiveness of SKP, especially the significant +improvement in subgraph retrieval (+4.08% H@10). + +
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Biomedical Entity Linking with Triple-aware Pre-Training + + +
+ Linking biomedical entities is an essential aspect in biomedical natural +language processing tasks, such as text mining and question answering. However, +a difficulty of linking the biomedical entities using current large language +models (LLM) trained on a general corpus is that biomedical entities are +scarcely distributed in texts and therefore have been rarely seen during +training by the LLM. At the same time, those LLMs are not aware of high level +semantic connection between different biomedical entities, which are useful in +identifying similar concepts in different textual contexts. To cope with +aforementioned problems, some recent works focused on injecting knowledge graph +information into LLMs. However, former methods either ignore the relational +knowledge of the entities or lead to catastrophic forgetting. Therefore, we +propose a novel framework to pre-train the powerful generative LLM by a corpus +synthesized from a KG. In the evaluations we are unable to confirm the benefit +of including synonym, description or relational information. + +
+
+
+
+
+ + ☆ GADePo: Graph-Assisted Declarative Pooling Transformers for + Document-Level Relation Extraction + + +
+ Document-level relation extraction aims to identify relationships between +entities within a document. Current methods rely on text-based encoders and +employ various hand-coded pooling heuristics to aggregate information from +entity mentions and associated contexts. In this paper, we replace these rigid +pooling functions with explicit graph relations by leveraging the intrinsic +graph processing capabilities of the Transformer model. We propose a joint +text-graph Transformer model, and a graph-assisted declarative pooling (GADePo) +specification of the input which provides explicit and high-level instructions +for information aggregation. This allows the pooling process to be guided by +domain-specific knowledge or desired outcomes but still learned by the +Transformer, leading to more flexible and customizable pooling strategies. We +extensively evaluate our method across diverse datasets and models, and show +that our approach yields promising results that are comparable to those +achieved by the hand-coded pooling functions. + +
+
+
+
+
+ + ☆ FIRE: Food Image to REcipe generation + + +
+ Food computing has emerged as a prominent multidisciplinary field of research +in recent years. An ambitious goal of food computing is to develop end-to-end +intelligent systems capable of autonomously producing recipe information for a +food image. Current image-to-recipe methods are retrieval-based and their +success depends heavily on the dataset size and diversity, as well as the +quality of learned embeddings. Meanwhile, the emergence of powerful +attention-based vision and language models presents a promising avenue for +accurate and generalizable recipe generation, which has yet to be extensively +explored. This paper proposes FIRE, a novel multimodal methodology tailored to +recipe generation in the food computing domain, which generates the food title, +ingredients, and cooking instructions based on input food images. FIRE +leverages the BLIP model to generate titles, utilizes a Vision Transformer with +a decoder for ingredient extraction, and employs the T5 model to generate +recipes incorporating titles and ingredients as inputs. We showcase two +practical applications that can benefit from integrating FIRE with large +language model prompting: recipe customization to fit recipes to user +preferences and recipe-to-code transformation to enable automated cooking +processes. Our experimental findings validate the efficacy of our proposed +approach, underscoring its potential for future advancements and widespread +adoption in food computing. + +
+
+ comment: 5 figures, 4 tables +
+
+
+
+
+ + ☆ Effect of Attention and Self-Supervised Speech Embeddings on + Non-Semantic Speech Tasks + + +
+ Human emotion understanding is pivotal in making conversational technology +mainstream. We view speech emotion understanding as a perception task which is +a more realistic setting. With varying contexts (languages, demographics, etc.) +different share of people perceive the same speech segment as a non-unanimous +emotion. As part of the ACM Multimedia 2023 Computational Paralinguistics +ChallengE (ComParE) in the EMotion Share track, we leverage their rich dataset +of multilingual speakers and multi-label regression target of 'emotion share' +or perception of that emotion. We demonstrate that the training scheme of +different foundation models dictates their effectiveness for tasks beyond +speech recognition, especially for non-semantic speech tasks like emotion + understanding. This is a very complex task due to multilingual speakers, +variability in the target labels, and inherent imbalance in the regression +dataset. Our results show that HuBERT-Large with a self-attention-based +light-weight sequence model provides 4.6% improvement over the reported +baseline. + +
+
+ comment: Accepted to appear at ACM Multimedia 2023 Multimedia Grand Challenges + Track +
+
+
+
+
+ + ☆ ZhuJiu: A Multi-dimensional, Multi-faceted Chinese Benchmark for Large + Language Models + + +
+ The unprecedented performance of large language models (LLMs) requires +comprehensive and accurate evaluation. We argue that for LLMs evaluation, +benchmarks need to be comprehensive and systematic. To this end, we propose the +ZhuJiu benchmark, which has the following strengths: (1) Multi-dimensional +ability coverage: We comprehensively evaluate LLMs across 7 ability dimensions +covering 51 tasks. Especially, we also propose a new benchmark that focuses on +knowledge ability of LLMs. (2) Multi-faceted evaluation methods collaboration: +We use 3 different yet complementary evaluation methods to comprehensively +evaluate LLMs, which can ensure the authority and accuracy of the evaluation +results. (3) Comprehensive Chinese benchmark: ZhuJiu is the pioneering +benchmark that fully assesses LLMs in Chinese, while also providing equally +robust evaluation abilities in English. (4) Avoiding potential data leakage: To +avoid data leakage, we construct evaluation data specifically for 37 tasks. We +evaluate 10 current mainstream LLMs and conduct an in-depth discussion and +analysis of their results. The ZhuJiu benchmark and open-participation +leaderboard are publicly released at http://www.zhujiu-benchmark.com/ and we +also provide a demo video at https://youtu.be/qypkJ89L1Ic. + +
+
+
+
+
+ + ☆ EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models + + +
+ Large Language Models (LLMs) such as GPTs and LLaMa have ushered in a +revolution in machine intelligence, owing to their exceptional capabilities in +a wide range of machine learning tasks. However, the transition of LLMs from +data centers to edge devices presents a set of challenges and opportunities. +While this shift can enhance privacy and availability, it is hampered by the +enormous parameter sizes of these models, leading to impractical runtime costs. +In light of these considerations, we introduce EdgeMoE, the first on-device +inference engine tailored for mixture-of-expert (MoE) LLMs, a popular variant +of sparse LLMs that exhibit nearly constant computational complexity as their +parameter size scales. EdgeMoE achieves both memory and computational +efficiency by strategically partitioning the model across the storage +hierarchy. Specifically, non-expert weights are stored in the device's memory, +while expert weights are kept in external storage and are fetched into memory +only when they are activated. This design is underpinned by a crucial insight +that expert weights, though voluminous, are infrequently accessed due to sparse +activation patterns. To further mitigate the overhead associated with expert +I/O swapping, EdgeMoE incorporates two innovative techniques: (1) Expert-wise +bitwidth adaptation: This method reduces the size of expert weights with an +acceptable level of accuracy loss. (2) Expert management: It predicts the +experts that will be activated in advance and preloads them into the +compute-I/O pipeline, thus further optimizing the process. In empirical +evaluations conducted on well-established MoE LLMs and various edge devices, +EdgeMoE demonstrates substantial memory savings and performance improvements +when compared to competitive baseline solutions. + +
+
+
+
+
+ + ☆ DISC-MedLLM: Bridging General Large Language Models and Real-World + Medical Consultation + + +
+ We propose DISC-MedLLM, a comprehensive solution that leverages Large +Language Models (LLMs) to provide accurate and truthful medical response in +end-to-end conversational healthcare services. To construct high-quality +Supervised Fine-Tuning (SFT) datasets, we employ three strategies: utilizing +medical knowledge-graphs, reconstructing real-world dialogues, and +incorporating human-guided preference rephrasing. These datasets are +instrumental in training DISC-MedLLM, surpassing existing medical LLMs in both +single-turn and multi-turn consultation scenarios. Extensive experimental +results demonstrate the effectiveness of the proposed model in bridging the gap +between general language models and real-world medical consultation. +Additionally, we release the constructed dataset and model weights to further +contribute to research and development. Further details and resources can be +found at https://github.com/FudanDISC/DISC-MedLLM + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Cognitive Effects in Large Language Models ECAI + + +
+ Large Language Models (LLMs) such as ChatGPT have received enormous attention +over the past year and are now used by hundreds of millions of people every +day. The rapid adoption of this technology naturally raises questions about the +possible biases such models might exhibit. In this work, we tested one of these +models (GPT-3) on a range of cognitive effects, which are systematic patterns +that are usually found in human cognitive tasks. We found that LLMs are indeed +prone to several human cognitive effects. Specifically, we show that the +priming, distance, SNARC, and size congruity effects were presented with GPT-3, +while the anchoring effect is absent. We describe our methodology, and +specifically the way we converted real-world experiments to text-based +experiments. Finally, we speculate on the possible reasons why GPT-3 exhibits +these effects and discuss whether they are imitated or reinvented. + +
+
+ comment: Accepted and will be published in the ECAI conference +
+
+
+
+
+ + ☆ Leveraging A Medical Knowledge Graph into Large Language Models for + Diagnosis Prediction + + +
+ Electronic Health Records (EHRs) and routine documentation practices play a +vital role in patients' daily care, providing a holistic record of health, +diagnoses, and treatment. However, complex and verbose EHR narratives overload +healthcare providers, risking diagnostic inaccuracies. While Large Language +Models (LLMs) have showcased their potential in diverse language tasks, their +application in the healthcare arena needs to ensure the minimization of +diagnostic errors and the prevention of patient harm. In this paper, we outline +an innovative approach for augmenting the proficiency of LLMs in the realm of +automated diagnosis generation, achieved through the incorporation of a medical +knowledge graph (KG) and a novel graph model: Dr.Knows, inspired by the +clinical diagnostic reasoning process. We derive the KG from the National +Library of Medicine's Unified Medical Language System (UMLS), a robust +repository of biomedical knowledge. Our method negates the need for +pre-training and instead leverages the KG as an auxiliary instrument aiding in +the interpretation and summarization of complex medical concepts. Using +real-world hospital datasets, our experimental results demonstrate that the +proposed approach of combining LLMs with KG has the potential to improve the +accuracy of automated diagnosis generation. More importantly, our approach +offers an explainable diagnostic pathway, edging us closer to the realization +of AI-augmented diagnostic decision support systems. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Evaluating the Robustness to Instructions of Large Language Models + + +
+ Recently, Instruction fine-tuning has risen to prominence as a potential +method for enhancing the zero-shot capabilities of Large Language Models (LLMs) +on novel tasks. This technique has shown an exceptional ability to boost the +performance of moderately sized LLMs, sometimes even reaching performance +levels comparable to those of much larger model variants. The focus is on the +robustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an +exploration of six models including Alpaca, Vicuna, WizardLM, and Traditional +Task-oriented Models(Flan-T5-XL/XXL, T0++) using real-world relation extraction +datasets as case studies. We carried out a comprehensive evaluation of these +instruction-following LLMs which have been tuned based on open-domain +instructions and task-oriented instructions. The main discussion is their +performance and robustness towards instructions. We have observed that in most +cases, the model's performance in dealing with unfamiliar instructions tends to +worsen significantly, and the robustness of the model for RE instructions +deteriorates compared to QA. Further, we discovered that up until a certain +parameter size threshold (3B), the performance of the FLAN-T5 model improves as +the parameter count increases. The robustness of different scales of FLAN-T5 +models to RE instruction is worse than the robustness to QA instruction. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ FonMTL: Towards Multitask Learning for the Fon Language EMNLP 2023 + + +
+ The Fon language, spoken by an average 2 million of people, is a truly +low-resourced African language, with a limited online presence, and existing +datasets (just to name but a few). Multitask learning is a learning paradigm +that aims to improve the generalization capacity of a model by sharing +knowledge across different but related tasks: this could be prevalent in very +data-scarce scenarios. In this paper, we present the first explorative approach +to multitask learning, for model capabilities enhancement in Natural Language +Processing for the Fon language. Specifically, we explore the tasks of Named +Entity Recognition (NER) and Part of Speech Tagging (POS) for Fon. We leverage +two language model heads as encoders to build shared representations for the +inputs, and we use linear layers blocks for classification relative to each +task. Our results on the NER and POS tasks for Fon, show competitive (or +better) performances compared to several multilingual pretrained language +models finetuned on single tasks. Additionally, we perform a few ablation +studies to leverage the efficiency of two different loss combination strategies +and find out that the equal loss weighting approach works best in our case. Our +code is open-sourced at https://github.com/bonaventuredossou/multitask_fon. + +
+
+ comment: Accepted at WiNLP workshop, co-located at EMNLP 2023 +
+
+
+
+
+ + ☆ Goodhart's Law Applies to NLP's Explanation Benchmarks + + +
+ Despite the rising popularity of saliency-based explanations, the research +community remains at an impasse, facing doubts concerning their purpose, +efficacy, and tendency to contradict each other. Seeking to unite the +community's efforts around common goals, several recent works have proposed +evaluation metrics. In this paper, we critically examine two sets of metrics: +the ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics, +focusing our inquiry on natural language processing. First, we show that we can +inflate a model's comprehensiveness and sufficiency scores dramatically without +altering its predictions or explanations on in-distribution test inputs. Our +strategy exploits the tendency for extracted explanations and their complements +to be "out-of-support" relative to each other and in-distribution inputs. Next, +we demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple +method that encodes the label, even though EVAL-X is precisely motivated to +address such exploits. Our results raise doubts about the ability of current +metrics to guide explainability research, underscoring the need for a broader +reassessment of what precisely these metrics are intended to capture. + +
+
+
+
+
+ + ☆ SalesBot 2.0: A Human-Like Intent-Guided Chit-Chat Dataset + + +
+ In recent research on dialogue systems and corpora, there has been a +significant focus on two distinct categories: task-oriented (TOD) and +open-domain (chit-chat) dialogues. TOD systems aim to satisfy specific user +goals, such as finding a movie to watch, whereas open-domain systems primarily +focus on generating engaging conversations. A recent study by Chiu et al. +(2022) introduced SalesBot, which provides simulators and a dataset with +one-turn transition from chit-chat to task-oriented dialogues. However, the +previously generated data solely relied on BlenderBot, which raised concerns +about its long-turn naturalness and consistency during a conversation. To +address this issue, this paper aims to build SalesBot 2.0, a revised version of +the published data, by leveraging the commonsense knowledge of large language +models (LLMs) through proper prompting. The objective is to gradually bridge +the gap between chit-chat and TOD towards better naturalness and consistency. +The newly released large-scale dataset with detailed annotations exhibits +smoother transitions between topics and is more human-like in terms of +naturalness and consistency. It can serve as a valuable resource for both +academic research and commercial applications. Furthermore, our proposed +framework can be applied to generate numerous dialogues with various target +intents. + +
+
+
+
+
+ + ☆ The Cultural Psychology of Large Language Models: Is ChatGPT a Holistic + or Analytic Thinker? + + +
+ The prevalent use of Large Language Models (LLMs) has necessitated studying +their mental models, yielding noteworthy theoretical and practical +implications. Current research has demonstrated that state-of-the-art LLMs, +such as ChatGPT, exhibit certain theory of mind capabilities and possess +relatively stable Big Five and/or MBTI personality traits. In addition, +cognitive process features form an essential component of these mental models. +Research in cultural psychology indicated significant differences in the +cognitive processes of Eastern and Western people when processing information +and making judgments. While Westerners predominantly exhibit analytical +thinking that isolates things from their environment to analyze their nature +independently, Easterners often showcase holistic thinking, emphasizing +relationships and adopting a global viewpoint. In our research, we probed the +cultural cognitive traits of ChatGPT. We employed two scales that directly +measure the cognitive process: the Analysis-Holism Scale (AHS) and the Triadic +Categorization Task (TCT). Additionally, we used two scales that investigate +the value differences shaped by cultural thinking: the Dialectical Self Scale +(DSS) and the Self-construal Scale (SCS). In cognitive process tests (AHS/TCT), +ChatGPT consistently tends towards Eastern holistic thinking, but regarding +value judgments (DSS/SCS), ChatGPT does not significantly lean towards the East +or the West. We suggest that the result could be attributed to both the +training paradigm and the training data in LLM development. We discuss the +potential value of this finding for AI research and directions for future +research. + +
+
+
+
+
+ + ☆ Gender bias and stereotypes in Large Language Models + + +
+ Large Language Models (LLMs) have made substantial progress in the past +several months, shattering state-of-the-art benchmarks in many domains. This +paper investigates LLMs' behavior with respect to gender stereotypes, a known +issue for prior models. We use a simple paradigm to test the presence of gender +bias, building on but differing from WinoBias, a commonly used gender bias +dataset, which is likely to be included in the training data of current LLMs. +We test four recently published LLMs and demonstrate that they express biased +assumptions about men and women's occupations. Our contributions in this paper +are as follows: (a) LLMs are 3-6 times more likely to choose an occupation that +stereotypically aligns with a person's gender; (b) these choices align with +people's perceptions better than with the ground truth as reflected in official +job statistics; (c) LLMs in fact amplify the bias beyond what is reflected in +perceptions or the ground truth; (d) LLMs ignore crucial ambiguities in +sentence structure 95% of the time in our study items, but when explicitly +prompted, they recognize the ambiguity; (e) LLMs provide explanations for their +choices that are factually inaccurate and likely obscure the true reason behind +their predictions. That is, they provide rationalizations of their biased +behavior. This highlights a key property of these models: LLMs are trained on +imbalanced datasets; as such, even with the recent successes of reinforcement +learning with human feedback, they tend to reflect those imbalances back at us. +As with other types of societal biases, we suggest that LLMs must be carefully +tested to ensure that they treat minoritized individuals and communities +equitably. + +
+
+ comment: ACM Collective Intelligence +
+
+
+
+
+ + ☆ Neural approaches to spoken content embedding + + +
+ Comparing spoken segments is a central operation to speech processing. +Traditional approaches in this area have favored frame-level dynamic +programming algorithms, such as dynamic time warping, because they require no +supervision, but they are limited in performance and efficiency. As an +alternative, acoustic word embeddings -- fixed-dimensional vector +representations of variable-length spoken word segments -- have begun to be +considered for such tasks as well. However, the current space of such +discriminative embedding models, training approaches, and their application to +real-world downstream tasks is limited. We start by considering ``single-view" +training losses where the goal is to learn an acoustic word embedding model +that separates same-word and different-word spoken segment pairs. Then, we +consider ``multi-view" contrastive losses. In this setting, acoustic word +embeddings are learned jointly with embeddings of character sequences to +generate acoustically grounded embeddings of written words, or acoustically +grounded word embeddings. + In this thesis, we contribute new discriminative acoustic word embedding +(AWE) and acoustically grounded word embedding (AGWE) approaches based on +recurrent neural networks (RNNs). We improve model training in terms of both +efficiency and performance. We take these developments beyond English to +several low-resource languages and show that multilingual training improves +performance when labeled data is limited. We apply our embedding models, both +monolingual and multilingual, to the downstream tasks of query-by-example +speech search and automatic speech recognition. Finally, we show how our +embedding approaches compare with and complement more recent self-supervised +speech models. + +
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ MEMORY-VQ: Compression for Tractable Internet-Scale Memory + + +
+ Retrieval augmentation is a powerful but expensive method to make language +models more knowledgeable about the world. Memory-based methods like LUMEN +pre-compute token representations for retrieved passages to drastically speed +up inference. However, memory also leads to much greater storage requirements +from storing pre-computed representations. + We propose MEMORY-VQ, a new method to reduce storage requirements of +memory-augmented models without sacrificing performance. Our method uses a +vector quantization variational autoencoder (VQ-VAE) to compress token +representations. We apply MEMORY-VQ to the LUMEN model to obtain LUMEN-VQ, a +memory model that achieves a 16x compression rate with comparable performance +on the KILT benchmark. LUMEN-VQ enables practical retrieval augmentation even +for extremely large retrieval corpora. + +
+
+
+
+
+ + ☆ Multiscale Contextual Learning for Speech Emotion Recognition in + Emergency Call Center Conversations + + +
+ Emotion recognition in conversations is essential for ensuring advanced +human-machine interactions. However, creating robust and accurate emotion +recognition systems in real life is challenging, mainly due to the scarcity of +emotion datasets collected in the wild and the inability to take into account +the dialogue context. The CEMO dataset, composed of conversations between +agents and patients during emergency calls to a French call center, fills this +gap. The nature of these interactions highlights the role of the emotional flow +of the conversation in predicting patient emotions, as context can often make a +difference in understanding actual feelings. This paper presents a multi-scale +conversational context learning approach for speech emotion recognition, which +takes advantage of this hypothesis. We investigated this approach on both +speech transcriptions and acoustic segments. Experimentally, our method uses +the previous or next information of the targeted segment. In the text domain, +we tested the context window using a wide range of tokens (from 10 to 100) and +at the speech turns level, considering inputs from both the same and opposing +speakers. According to our tests, the context derived from previous tokens has +a more significant influence on accurate prediction than the following tokens. +Furthermore, taking the last speech turn of the same speaker in the +conversation seems useful. In the acoustic domain, we conducted an in-depth +analysis of the impact of the surrounding emotions on the prediction. While +multi-scale conversational context learning using Transformers can enhance +performance in the textual modality for emergency call recordings, +incorporating acoustic context is more challenging. + +
+
+
+
+
+ + ☆ CommunityFish: A Poisson-based Document Scaling With Hierarchical + Clustering + + +
+ Document scaling has been a key component in text-as-data applications for +social scientists and a major field of interest for political researchers, who +aim at uncovering differences between speakers or parties with the help of +different probabilistic and non-probabilistic approaches. Yet, most of these +techniques are either built upon the agnostically bag-of-word hypothesis or use +prior information borrowed from external sources that might embed the results +with a significant bias. If the corpus has long been considered as a collection +of documents, it can also be seen as a dense network of connected words whose +structure could be clustered to differentiate independent groups of words, +based on their co-occurrences in documents, known as communities. This paper +introduces CommunityFish as an augmented version of Wordfish based on a +hierarchical clustering, namely the Louvain algorithm, on the word space to +yield communities as semantic and independent n-grams emerging from the corpus +and use them as an input to Wordfish method, instead of considering the word +space. This strategy emphasizes the interpretability of the results, since +communities have a non-overlapping structure, hence a crucial informative power +in discriminating parties or speakers, in addition to allowing a faster +execution of the Poisson scaling model. Aside from yielding communities, +assumed to be subtopic proxies, the application of this technique outperforms +the classic Wordfish model by highlighting historical developments in the U.S. +State of the Union addresses and was found to replicate the prevailing +political stance in Germany when using the corpus of parties' legislative +manifestos. + +
+
+
+
+
+ + ☆ Attention Visualizer Package: Revealing Word Importance for Deeper + Insight into Encoder-Only Transformer Models + + +
+ This report introduces the Attention Visualizer package, which is crafted to +visually illustrate the significance of individual words in encoder-only +transformer-based models. In contrast to other methods that center on tokens +and self-attention scores, our approach will examine the words and their impact +on the final embedding representation. Libraries like this play a crucial role +in enhancing the interpretability and explainability of neural networks. They +offer the opportunity to illuminate their internal mechanisms, providing a +better understanding of how they operate and can be enhanced. You can access +the code and review examples on the following GitHub repository: +https://github.com/AlaFalaki/AttentionVisualizer. + +
+
+ comment: 12 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Parameter-Efficient Finetuning for Robust Continual Multilingual + Learning ACL + + +
+ We introduce and study the problem of Continual Multilingual Learning (CML) +where a previously trained multilingual model is periodically updated using new +data arriving in stages. If the new data is present only in a subset of +languages, we find that the resulting model shows improved performance only on +the languages included in the latest update (and a few closely related +languages) while its performance on all the remaining languages degrade +significantly. We address this challenge by proposing LAFT-URIEL, a +parameter-efficient finetuning strategy which aims to increase the number of +languages on which the model improves after an update, while reducing the +magnitude of loss in performance for the remaining languages. LAFT-URIEL uses +linguistic knowledge to balance overfitting and knowledge sharing across +languages, allowing for an additional 25% of task languages to see an +improvement in performance after an update, while also reducing the average +magnitude of losses on the remaining languages by 78% relative. + +
+
+ comment: Published at ACL Findings 2023 +
+
+
+
+
+ + ♻ ☆ Training and Meta-Evaluating Machine Translation Evaluation Metrics at + the Paragraph Level + + +
+ As research on machine translation moves to translating text beyond the +sentence level, it remains unclear how effective automatic evaluation metrics +are at scoring longer translations. In this work, we first propose a method for +creating paragraph-level data for training and meta-evaluating metrics from +existing sentence-level data. Then, we use these new datasets to benchmark +existing sentence-level metrics as well as train learned metrics at the +paragraph level. Interestingly, our experimental results demonstrate that using +sentence-level metrics to score entire paragraphs is equally as effective as +using a metric designed to work at the paragraph level. We speculate this +result can be attributed to properties of the task of reference-based +evaluation as well as limitations of our datasets with respect to capturing all +types of phenomena that occur in paragraph-level translations. + +
+
+ comment: Removing extra "and" from author list +
+
+
+
+
+ + ♻ ☆ Evaluating Open-QA Evaluation + + +
+ This study focuses on the evaluation of the Open Question Answering (Open-QA) +task, which can directly estimate the factuality of large language models +(LLMs). Current automatic evaluation methods have shown limitations, indicating +that human evaluation still remains the most reliable approach. We introduce a +new task, Evaluating QA Evaluation (QA-Eval) and the corresponding dataset +EVOUNA, designed to assess the accuracy of AI-generated answers in relation to +standard answers within Open-QA. Our evaluation of these methods utilizes +human-annotated results to measure their performance. Specifically, the work +investigates methods that show high correlation with human evaluations, deeming +them more reliable. We also discuss the pitfalls of current methods and methods +to improve LLM-based evaluators. We believe this new QA-Eval task and +corresponding dataset EVOUNA will facilitate the development of more effective +automatic evaluation tools and prove valuable for future research in this area. +All resources are available at \url{https://github.com/wangcunxiang/QA-Eval} +and it is under the Apache-2.0 License. + +
+
+
+
+
+ + ♻ ☆ Large Language Models are Fixated by Red Herrings: Exploring Creative + Problem Solving and Einstellung Effect using the Only Connect Wall Dataset + + +
+ The quest for human imitative AI has been an enduring topic in AI research +since its inception. The technical evolution and emerging capabilities of the +latest cohort of large language models (LLMs) have reinvigorated the subject +beyond academia to the cultural zeitgeist. While recent NLP evaluation +benchmark tasks test some aspects of human-imitative behaviour (e.g., +BIG-bench's 'human-like behavior' tasks), few, if not none, examine creative +problem solving abilities. Creative problem solving in humans is a well-studied +topic in cognitive neuroscience with standardized tests that predominantly use +the ability to associate (heterogeneous) connections among clue words as a +metric for creativity. Exposure to misleading stimuli - distractors dubbed red +herrings - impede human performance in such tasks via the fixation effect and +Einstellung paradigm. In cognitive neuroscience studies, such fixations are +experimentally induced by pre-exposing participants to orthographically similar +incorrect words to subsequent word-fragments or clues. The popular British quiz +show Only Connect's Connecting Wall segment essentially mimics Mednick's Remote +Associates Test (RAT) formulation with built-in, deliberate red herrings, which +makes it an ideal proxy dataset to explore and study fixation effect and +Einstellung paradigm from cognitive neuroscience in LLMs. In this paper we +present the novel Only Connect Wall (OCW) dataset and report results from our +evaluation of selected pre-trained language models and LLMs on creative problem +solving tasks like grouping clue words by heterogeneous connections, and +identifying correct open knowledge domain connections in respective groups. We +synthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to +further analyze our red-herrings hypothesis in language models. The code and +link to the dataset are available at https://github.com/TaatiTeam/OCW. + +
+
+ comment: V2: with added OCW-Randomized and OCW-WordNet results in Section 4.3 + (added). 22 pages with Appendix +
+
+
+
+
+ + ♻ ☆ Quantum Circuit Compiler for a Shuttling-Based Trapped-Ion Quantum + Computer + + +
+ The increasing capabilities of quantum computing hardware and the challenge +of realizing deep quantum circuits require fully automated and efficient tools +for compiling quantum circuits. To express arbitrary circuits in a sequence of +native gates specific to the quantum computer architecture, it is necessary to +make algorithms portable across the landscape of quantum hardware providers. In +this work, we present a compiler capable of transforming and optimizing a +quantum circuit targeting a shuttling-based trapped-ion quantum processor. It +consists of custom algorithms set on top of the quantum circuit framework +Pytket. The performance was evaluated for a wide range of quantum circuits and +the results show that the gate counts can be reduced by factors up to 5.1 +compared to standard Pytket and up to 2.2 compared to standard Qiskit +compilation. + +
+
+ comment: 35 pages, 25 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Large Language Models Vote: Prompting for Rare Disease Identification + + +
+ The emergence of generative Large Language Models (LLMs) emphasizes the need +for accurate and efficient prompting approaches. LLMs are often applied in +Few-Shot Learning (FSL) contexts, where tasks are executed with minimal +training data. FSL has become popular in many Artificial Intelligence (AI) +subdomains, including AI for health. Rare diseases affect a small fraction of +the population. Rare disease identification from clinical notes inherently +requires FSL techniques due to limited data availability. Manual data +collection and annotation is both expensive and time-consuming. In this paper, +we propose Models-Vote Prompting (MVP), a flexible prompting approach for +improving the performance of LLM queries in FSL settings. MVP works by +prompting numerous LLMs to perform the same tasks and then conducting a +majority vote on the resulting outputs. This method achieves improved results +to any one model in the ensemble on one-shot rare disease identification and +classification tasks. We also release a novel rare disease dataset for FSL, +available to those who signed the MIMIC-IV Data Use Agreement (DUA). +Furthermore, in using MVP, each model is prompted multiple times, substantially +increasing the time needed for manual annotation, and to address this, we +assess the feasibility of using JSON for automating generative LLM evaluation. + +
+
+
+
+
+ + ♻ ☆ Bridging the Gap: Deciphering Tabular Data Using Large Language Model + + +
+ In the realm of natural language processing, the understanding of tabular +data has perpetually stood as a focal point of scholarly inquiry. The emergence +of expansive language models, exemplified by the likes of ChatGPT, has ushered +in a wave of endeavors wherein researchers aim to harness these models for +tasks related to table-based question answering. Central to our investigative +pursuits is the elucidation of methodologies that amplify the aptitude of such +large language models in discerning both the structural intricacies and +inherent content of tables, ultimately facilitating their capacity to provide +informed responses to pertinent queries. To this end, we have architected a +distinctive module dedicated to the serialization of tables for seamless +integration with expansive language models. Additionally, we've instituted a +corrective mechanism within the model to rectify potential inaccuracies. +Experimental results indicate that, although our proposed method trails the +SOTA by approximately 11.7% in overall metrics, it surpasses the SOTA by about +1.2% in tests on specific datasets. This research marks the first application +of large language models to table-based question answering tasks, enhancing the +model's comprehension of both table structures and content. + +
+
+
+
+
+ + ♻ ☆ Making first order linear logic a generating grammar + + +
+ It is known that different categorial grammars have surface representation in +a fragment of first order multiplicative linear logic (MLL1). We show that the +fragment of interest is equivalent to the recently introduced extended tensor +type calculus (ETTC). ETTC is a calculus of specific typed terms, which +represent tuples of strings, more precisely bipartite graphs decorated with +strings. Types are derived from linear logic formulas, and rules correspond to +concrete operations on these string-labeled graphs, so that they can be +conveniently visualized. This provides the above mentioned fragment of MLL1 +that is relevant for language modeling not only with some alternative syntax +and intuitive geometric representation, but also with an intrinsic deductive +system, which has been absent. + In this work we consider a non-trivial notationally enriched variation of the +previously introduced {\bf ETTC}, which allows more concise and transparent +computations. We present both a cut-free sequent calculus and a natural +deduction formalism. + +
+
+ comment: Revised and extended version with detailed proofs. arXiv admin note: + substantial text overlap with arXiv:2112.15253 +
+
+
+
+
+ + ♻ ☆ Towards Versatile and Efficient Visual Knowledge Integration into + Pre-trained Language Models with Cross-Modal Adapters + + +
+ Humans learn language via multi-modal knowledge. However, due to the +text-only pre-training scheme, most existing pre-trained language models (PLMs) +are hindered from the multi-modal information. + To inject visual knowledge into PLMs, existing methods incorporate either the +text or image encoder of vision-language models (VLMs) to encode the visual +information and update all the original parameters of PLMs for knowledge +fusion. + In this paper, we propose a new plug-and-play module, X-adapter, to flexibly +leverage the aligned visual and textual knowledge learned in pre-trained VLMs +and efficiently inject them into PLMs. + Specifically, we insert X-adapters into PLMs, and only the added parameters +are updated during adaptation. + To fully exploit the potential in VLMs, X-adapters consist of two +sub-modules, V-expert and T-expert, to fuse VLMs' image and text +representations, respectively. + We can opt for activating different sub-modules depending on the downstream +tasks. + Experimental results show that our method can significantly improve the +performance on object-color reasoning and natural language understanding (NLU) +tasks compared with PLM baselines. + +
+
+
+
+
+ + ♻ ☆ Enhancing Self-Disclosure In Neural Dialog Models By Candidate + Re-ranking + + +
+ Neural language modelling has progressed the state-of-the-art in different +downstream Natural Language Processing (NLP) tasks. One such area is of +open-domain dialog modelling, neural dialog models based on GPT-2 such as +DialoGPT have shown promising performance in single-turn conversation. However, +such (neural) dialog models have been criticized for generating responses which +although may have relevance to the previous human response, tend to quickly +dissipate human interest and descend into trivial conversation. One reason for +such performance is the lack of explicit conversation strategy being employed +in human-machine conversation. Humans employ a range of conversation strategies +while engaging in a conversation, one such key social strategies is +Self-disclosure(SD). A phenomenon of revealing information about one-self to +others. Social penetration theory (SPT) proposes that communication between two +people moves from shallow to deeper levels as the relationship progresses +primarily through self-disclosure. Disclosure helps in creating rapport among +the participants engaged in a conversation. In this paper, Self-disclosure +enhancement architecture (SDEA) is introduced utilizing Self-disclosure Topic +Model (SDTM) during inference stage of a neural dialog model to re-rank +response candidates to enhance self-disclosure in single-turn responses from +from the model. + +
+
+ comment: 10 pages, 3 figures, 2 table +
+
+
+
+
+ + ♻ ☆ Out of the Cage: How Stochastic Parrots Win in Cyber Security + Environments + + +
+ Large Language Models (LLMs) have gained widespread popularity across diverse +domains involving text generation, summarization, and various natural language +processing tasks. Despite their inherent limitations, LLM-based designs have +shown promising capabilities in planning and navigating open-world scenarios. +This paper introduces a novel application of pre-trained LLMs as agents within +cybersecurity network environments, focusing on their utility for sequential +decision-making processes. + We present an approach wherein pre-trained LLMs are leveraged as attacking +agents in two reinforcement learning environments. Our proposed agents +demonstrate similar or better performance against state-of-the-art agents +trained for thousands of episodes in most scenarios and configurations. In +addition, the best LLM agents perform similarly to human testers of the +environment without any additional training process. This design highlights the +potential of LLMs to efficiently address complex decision-making tasks within +cybersecurity. + Furthermore, we introduce a new network security environment named +NetSecGame. The environment is designed to eventually support complex +multi-agent scenarios within the network security domain. The proposed +environment mimics real network attacks and is designed to be highly modular +and adaptable for various scenarios. + +
+
+ comment: Under review. 10 pages plus appendices, 7 figures, 4 tables. Edit: + fix e-mails and code repository +
+
+
+
+
+ + ♻ ☆ Comparing Abstractive Summaries Generated by ChatGPT to Real Summaries + Through Blinded Reviewers and Text Classification Algorithms + + +
+ Large Language Models (LLMs) have gathered significant attention due to their +impressive performance on a variety of tasks. ChatGPT, developed by OpenAI, is +a recent addition to the family of language models and is being called a +disruptive technology by a few, owing to its human-like text-generation +capabilities. Although, many anecdotal examples across the internet have +evaluated ChatGPT's strength and weakness, only a few systematic research +studies exist. To contribute to the body of literature of systematic research +on ChatGPT, we evaluate the performance of ChatGPT on Abstractive Summarization +by the means of automated metrics and blinded human reviewers. We also build +automatic text classifiers to detect ChatGPT generated summaries. We found that +while text classification algorithms can distinguish between real and generated +summaries, humans are unable to distinguish between real summaries and those +produced by ChatGPT. + +
+
+
+
+
+ + ♻ ☆ The Effects of Political Martyrdom on Election Results: The + Assassination of Abe + + +
+ In developed nations assassinations are rare and thus the impact of such acts +on the electoral and political landscape is understudied. In this paper, we +focus on Twitter data to examine the effects of Japan's former Primer Minister +Abe's assassination on the Japanese House of Councillors elections in 2022. We +utilize sentiment analysis and emotion detection together with topic modeling +on over 2 million tweets and compare them against tweets during previous +election cycles. Our findings indicate that Twitter sentiments were negatively +impacted by the event in the short term and that social media attention span +has shortened. We also discuss how "necropolitics" affected the outcome of the +elections in favor of the deceased's party meaning that there seems to have +been an effect of Abe's death on the election outcome though the findings +warrant further investigation for conclusive results. + +
+
+
+
+
+ + ♻ ☆ EmotionIC: Emotional Inertia and Contagion-Driven Dependency Modeling + for Emotion Recognition in Conversation + + +
+ Emotion Recognition in Conversation (ERC) has attracted growing attention in +recent years as a result of the advancement and implementation of +human-computer interface technologies. In this paper, we propose a novel +approach to dependency modeling driven by Emotional Inertia and Contagion +(EmotionIC) for ERC task. Our EmotionIC consists of three main components, +i.e., Identity Masked Multi-Head Attention (IMMHA), Dialogue-based Gated +Recurrent Unit (DiaGRU), and Skip-chain Conditional Random Field (SkipCRF). +Compared to previous ERC models, EmotionIC can model a conversation more +thoroughly at both the feature-extraction and classification levels. The +proposed model attempts to integrate the advantages of attention- and +recurrence-based methods at the feature-extraction level. Specifically, IMMHA +is applied to capture identity-based global contextual dependencies, while +DiaGRU is utilized to extract speaker- and temporal-aware local contextual +information. At the classification level, SkipCRF can explicitly mine complex +emotional flows from higher-order neighboring utterances in the conversation. +Experimental results show that our method can significantly outperform the +state-of-the-art models on four benchmark datasets. The ablation studies +confirm that our modules can effectively model emotional inertia and contagion. + +
+
+ comment: 19 pages,10 figures +
+
+
+
+
+ + ♻ ☆ Communicative Agents for Software Development + + +
+ Software engineering is a domain characterized by intricate decision-making +processes, often relying on nuanced intuition and consultation. Recent +advancements in deep learning have started to revolutionize software +engineering practices through elaborate designs implemented at various stages +of software development. In this paper, we present an innovative paradigm that +leverages large language models (LLMs) throughout the entire software +development process, streamlining and unifying key processes through natural +language communication, thereby eliminating the need for specialized models at +each phase. At the core of this paradigm lies ChatDev, a virtual chat-powered +software development company that mirrors the established waterfall model, +meticulously dividing the development process into four distinct chronological +stages: designing, coding, testing, and documenting. Each stage engages a team +of agents, such as programmers, code reviewers, and test engineers, fostering +collaborative dialogue and facilitating a seamless workflow. The chat chain +acts as a facilitator, breaking down each stage into atomic subtasks. This +enables dual roles, allowing for proposing and validating solutions through +context-aware communication, leading to efficient resolution of specific +subtasks. The instrumental analysis of ChatDev highlights its remarkable +efficacy in software generation, enabling the completion of the entire software +development process in under seven minutes at a cost of less than one dollar. +It not only identifies and alleviates potential vulnerabilities but also +rectifies potential hallucinations while maintaining commendable efficiency and +cost-effectiveness. The potential of ChatDev unveils fresh possibilities for +integrating LLMs into the realm of software development. + +
+
+ comment: https://github.com/OpenBMB/ChatDev +
+
+
+
+
+ + ♻ ☆ Latent Jailbreak: A Benchmark for Evaluating Text Safety and Output + Robustness of Large Language Models + + +
+ Considerable research efforts have been devoted to ensuring that large +language models (LLMs) align with human values and generate safe text. However, +an excessive focus on sensitivity to certain topics can compromise the model's +robustness in following instructions, thereby impacting its overall performance +in completing tasks. Previous benchmarks for jailbreaking LLMs have primarily +focused on evaluating the safety of the models without considering their +robustness. In this paper, we propose a benchmark that assesses both the safety +and robustness of LLMs, emphasizing the need for a balanced approach. To +comprehensively study text safety and output robustness, we introduce a latent +jailbreak prompt dataset, each involving malicious instruction embedding. +Specifically, we instruct the model to complete a regular task, such as +translation, with the text to be translated containing malicious instructions. +To further analyze safety and robustness, we design a hierarchical annotation +framework. We present a systematic analysis of the safety and robustness of +LLMs regarding the position of explicit normal instructions, word replacements +(verbs in explicit normal instructions, target groups in malicious +instructions, cue words for explicit normal instructions), and instruction +replacements (different explicit normal instructions). Our results demonstrate +that current LLMs not only prioritize certain instruction verbs but also +exhibit varying jailbreak rates for different instruction verbs in explicit +normal instructions. Code and data are available at +https://github.com/qiuhuachuan/latent-jailbreak. + +
+
+ comment: Code and data are available at + https://github.com/qiuhuachuan/latent-jailbreak +
+
+
+
+
+ + ♻ ☆ Efficient Domain Adaptation of Sentence Embeddings Using Adapters + + +
+ Sentence embeddings enable us to capture the semantic similarity of short +texts. Most sentence embedding models are trained for general semantic textual +similarity tasks. Therefore, to use sentence embeddings in a particular domain, +the model must be adapted to it in order to achieve good results. Usually, this +is done by fine-tuning the entire sentence embedding model for the domain of +interest. While this approach yields state-of-the-art results, all of the +model's weights are updated during fine-tuning, making this method +resource-intensive. Therefore, instead of fine-tuning entire sentence embedding +models for each target domain individually, we propose to train lightweight +adapters. These domain-specific adapters do not require fine-tuning all +underlying sentence embedding model parameters. Instead, we only train a small +number of additional parameters while keeping the weights of the underlying +sentence embedding model fixed. Training domain-specific adapters allows always +using the same base model and only exchanging the domain-specific adapters to +adapt sentence embeddings to a specific domain. We show that using adapters for +parameter-efficient domain adaptation of sentence embeddings yields competitive +performance within 1% of a domain-adapted, entirely fine-tuned sentence +embedding model while only training approximately 3.6% of the parameters. + +
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ The Re-Label Method For Data-Centric Machine Learning + + +
+ In industry deep learning application, our manually labeled data has a +certain number of noisy data. To solve this problem and achieve more than 90 +score in dev dataset, we present a simple method to find the noisy data and +re-label the noisy data by human, given the model predictions as references in +human labeling. In this paper, we illustrate our idea for a broad set of deep +learning tasks, includes classification, sequence tagging, object detection, +sequence generation, click-through rate prediction. The experimental results +and human evaluation results verify our idea. + +
+
+
+
+
+ + ♻ ☆ MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large + Language Models + + +
+ LLMs usually exhibit limitations in their ability to incorporate new +knowledge, the generation of hallucinations, and the transparency of their +decision-making process. In this paper, we explore how to prompt LLMs with +knowledge graphs (KG), working as a remedy to engage LLMs with up-to-date +knowledge and elicit the reasoning pathways from LLMs. Specifically, we build a +prompting pipeline that endows LLMs with the capability of comprehending KG +inputs and inferring with a combined implicit knowledge and the retrieved +external knowledge. In addition, we investigate eliciting the mind map on which +LLMs perform the reasoning and generate the answers. It is identified that the +produced mind map exhibits the reasoning pathways of LLMs grounded on the +ontology of knowledge, hence bringing the prospects of probing and gauging LLM +inference in production. The experiments on three question & answering datasets +also show that MindMap prompting leads to a striking empirical gain. For +instance, prompting a GPT-3.5 with MindMap yields an overwhelming performance +over GPT-4 consistently. We also demonstrate that with structured facts +retrieved from KG, MindMap can outperform a series of +prompting-with-document-retrieval methods, benefiting from more accurate, +concise, and comprehensive knowledge from KGs. + +
+
+ comment: 7 pages, 8 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Region-Aware Pretraining for Open-Vocabulary Object Detection with + Vision Transformers CVPR 2023 + + +
+ We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a +contrastive image-text pretraining recipe to bridge the gap between image-level +pretraining and open-vocabulary object detection. At the pretraining phase, we +propose to randomly crop and resize regions of positional embeddings instead of +using the whole image positional embeddings. This better matches the use of +positional embeddings at region-level in the detection finetuning phase. In +addition, we replace the common softmax cross entropy loss in contrastive +learning with focal loss to better learn the informative yet difficult +examples. Finally, we leverage recent advances in novel object proposals to +improve open-vocabulary detection finetuning. We evaluate our full model on the +LVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer. +RO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best +existing approach by +7.8 points in addition to competitive zero-shot transfer +detection. Surprisingly, RO-ViT improves the image-level representation as well +and achieves the state of the art on 9 out of 12 metrics on COCO and Flickr +image-text retrieval benchmarks, outperforming competitive approaches with +larger models. + +
+
+ comment: CVPR 2023 Highlight - https://github.com/mcahny/rovit ; adds LAION-2B + result +
+
+
+
+
+ + ♻ ☆ Explaining Machine Learning Models in Natural Conversations: Towards a + Conversational XAI Agent + + +
+ The goal of Explainable AI (XAI) is to design methods to provide insights +into the reasoning process of black-box models, such as deep neural networks, +in order to explain them to humans. Social science research states that such +explanations should be conversational, similar to human-to-human explanations. +In this work, we show how to incorporate XAI in a conversational agent, using a +standard design for the agent comprising natural language understanding and +generation components. We build upon an XAI question bank which we extend by +quality-controlled paraphrases to understand the user's information needs. We +further systematically survey the literature for suitable explanation methods +that provide the information to answer those questions, and present a +comprehensive list of suggestions. Our work is the first step towards truly +natural conversations about machine learning models with an explanation agent. +The comprehensive list of XAI questions and the corresponding explanation +methods may support other researchers in providing the necessary information to +address users' demands. + +
+
+ comment: Accepted at The World Conference on eXplainable Artificial + Intelligence 2023 (XAI-2023) +
+
+
+
+
+ + ♻ ☆ A Survey on Evaluation of Large Language Models + + +
+ Large language models (LLMs) are gaining increasing popularity in both +academia and industry, owing to their unprecedented performance in various +applications. As LLMs continue to play a vital role in both research and daily +use, their evaluation becomes increasingly critical, not only at the task +level, but also at the society level for better understanding of their +potential risks. Over the past years, significant efforts have been made to +examine LLMs from various perspectives. This paper presents a comprehensive +review of these evaluation methods for LLMs, focusing on three key dimensions: +what to evaluate, where to evaluate, and how to evaluate. Firstly, we provide +an overview from the perspective of evaluation tasks, encompassing general +natural language processing tasks, reasoning, medical usage, ethics, +educations, natural and social sciences, agent applications, and other areas. +Secondly, we answer the `where' and `how' questions by diving into the +evaluation methods and benchmarks, which serve as crucial components in +assessing performance of LLMs. Then, we summarize the success and failure cases +of LLMs in different tasks. Finally, we shed light on several future challenges +that lie ahead in LLMs evaluation. Our aim is to offer invaluable insights to +researchers in the realm of LLMs evaluation, thereby aiding the development of +more proficient LLMs. Our key point is that evaluation should be treated as an +essential discipline to better assist the development of LLMs. We consistently +maintain the related open-source materials at: +https://github.com/MLGroupJLU/LLM-eval-survey. + +
+
+ comment: 26 pages; a major update to include more recent works; + https://llm-eval.github.io/ +
+
+
+
+
+ + ♻ ☆ EcomGPT: Instruction-tuning Large Language Models with Chain-of-Task + Tasks for E-commerce + + +
+ Recently, instruction-following Large Language Models (LLMs) , represented by +ChatGPT, have exhibited exceptional performance in general Natural Language +Processing (NLP) tasks. However, the unique characteristics of E-commerce data +pose significant challenges to general LLMs. An LLM tailored specifically for +E-commerce scenarios, possessing robust cross-dataset/task generalization +capabilities, is a pressing necessity. To solve this issue, in this work, we +proposed the first e-commerce instruction dataset EcomInstruct, with a total of +2.5 million instruction data. EcomInstruct scales up the data size and task +diversity by constructing atomic tasks with E-commerce basic data types, such +as product information, user reviews. Atomic tasks are defined as intermediate +tasks implicitly involved in solving a final task, which we also call +Chain-of-Task tasks. We developed EcomGPT with different parameter scales by +training the backbone model BLOOMZ with the EcomInstruct. Benefiting from the +fundamental semantic understanding capabilities acquired from the Chain-of-Task +tasks, EcomGPT exhibits excellent zero-shot generalization capabilities. +Extensive experiments and human evaluations demonstrate that EcomGPT +outperforms ChatGPT in term of cross-dataset/task generalization on E-commerce +tasks. + +
+
+ comment: Initial version of EcomGPT +
+
+
+
+
+ + ♻ ☆ Does Human Collaboration Enhance the Accuracy of Identifying + LLM-Generated Deepfake Texts? AAAI + + +
+ Advances in Large Language Models (e.g., GPT-4, LLaMA) have improved the +generation of coherent sentences resembling human writing on a large scale, +resulting in the creation of so-called deepfake texts. However, this progress +poses security and privacy concerns, necessitating effective solutions for +distinguishing deepfake texts from human-written ones. Although prior works +studied humans' ability to detect deepfake texts, none has examined whether +"collaboration" among humans improves the detection of deepfake texts. In this +study, to address this gap of understanding on deepfake texts, we conducted +experiments with two groups: (1) nonexpert individuals from the AMT platform +and (2) writing experts from the Upwork platform. The results demonstrate that +collaboration among humans can potentially improve the detection of deepfake +texts for both groups, increasing detection accuracies by 6.36% for non-experts +and 12.76% for experts, respectively, compared to individuals' detection +accuracies. We further analyze the explanations that humans used for detecting +a piece of text as deepfake text, and find that the strongest indicator of +deepfake texts is their lack of coherence and consistency. Our study provides +useful insights for future tools and framework designs to facilitate the +collaborative human detection of deepfake texts. The experiment datasets and +AMT implementations are available at: +https://github.com/huashen218/llm-deepfake-human-study.git + +
+
+ comment: Accepted at The 11th AAAI Conference on Human Computation and + Crowdsourcing (HCOMP 2023) +
+
+
+
+
+ + ♻ ☆ Scissorhands: Exploiting the Persistence of Importance Hypothesis for + LLM KV Cache Compression at Test Time + + +
+ Large language models(LLMs) have sparked a new wave of exciting AI +applications. Hosting these models at scale requires significant memory +resources. One crucial memory bottleneck for the deployment stems from the +context window. It is commonly recognized that model weights are memory hungry; +however, the size of key-value embedding stored during the generation process +(KV cache) can easily surpass the model size. The enormous size of the KV cache +puts constraints on the inference batch size, which is crucial for high +throughput inference workload. Inspired by an interesting observation of the +attention scores, we hypothesize the persistence of importance: only pivotal +tokens, which had a substantial influence at one step, will significantly +influence future generations. Based on our empirical verification and +theoretical analysis around this hypothesis, we propose Scissorhands, a system +that maintains the memory usage of the KV cache at a fixed budget without +finetuning the model. In essence, Scissorhands manages the KV cache by storing +the pivotal tokens with a higher probability. We validate that Scissorhands +reduces the inference memory usage of the KV cache by up to 5X without +compromising model quality. We further demonstrate that Scissorhands can be +combined with 4-bit quantization, traditionally used to compress model weights, +to achieve up to 20X compression. + +
+
+
+
+
+ + ♻ ☆ When Do Annotator Demographics Matter? Measuring the Influence of + Annotator Demographics with the POPQUORN Dataset + + +
+ Annotators are not fungible. Their demographics, life experiences, and +backgrounds all contribute to how they label data. However, NLP has only +recently considered how annotator identity might influence their decisions. +Here, we present POPQUORN (the POtato-Prolific dataset for QUestion-Answering, +Offensiveness, text Rewriting, and politeness rating with demographic Nuance). +POPQUORN contains 45,000 annotations from 1,484 annotators, drawn from a +representative sample regarding sex, age, and race as the US population. +Through a series of analyses, we show that annotators' background plays a +significant role in their judgments. Further, our work shows that backgrounds +not previously considered in NLP (e.g., education), are meaningful and should +be considered. Our study suggests that understanding the background of +annotators and collecting labels from a demographically balanced pool of crowd +workers is important to reduce the bias of datasets. The dataset, annotator +background, and annotation interface are available at +https://github.com/Jiaxin-Pei/potato-prolific-dataset . + +
+
+
+
+
+ + ♻ ☆ Blockwise Parallel Transformer for Large Context Models + + +
+ Transformers have emerged as the cornerstone of state-of-the-art natural +language processing models, showcasing exceptional performance across a wide +range of AI applications. However, the memory demands posed by the +self-attention mechanism and the large feedforward network in Transformers +limit their ability to handle long sequences, thereby creating challenges for +tasks involving multiple long sequences or long-term dependencies. We present a +distinct approach, Blockwise Parallel Transformer (BPT), that leverages +blockwise computation of self-attention and feedforward network fusion to +minimize memory costs. By processing longer input sequences while maintaining +memory efficiency, BPT enables training sequences 32 times longer than vanilla +Transformers and up to 4 times longer than previous memory-efficient methods. +Extensive experiments on language modeling and reinforcement learning tasks +demonstrate the effectiveness of BPT in reducing memory requirements and +improving performance. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 120 + +
+
+
+ + ☆ Efficient Discovery and Effective Evaluation of Visual Perceptual + Similarity: A Benchmark and Beyond ICCV 2023 + + +
+ Visual similarities discovery (VSD) is an important task with broad +e-commerce applications. Given an image of a certain object, the goal of VSD is +to retrieve images of different objects with high perceptual visual similarity. +Although being a highly addressed problem, the evaluation of proposed methods +for VSD is often based on a proxy of an identification-retrieval task, +evaluating the ability of a model to retrieve different images of the same +object. We posit that evaluating VSD methods based on identification tasks is +limited, and faithful evaluation must rely on expert annotations. In this +paper, we introduce the first large-scale fashion visual similarity benchmark +dataset, consisting of more than 110K expert-annotated image pairs. Besides +this major contribution, we share insight from the challenges we faced while +curating this dataset. Based on these insights, we propose a novel and +efficient labeling procedure that can be applied to any dataset. Our analysis +examines its limitations and inductive biases, and based on these findings, we +propose metrics to mitigate those limitations. Though our primary focus lies on +visual similarity, the methodologies we present have broader applications for +discovering and evaluating perceptual similarity across various domains. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ MagicEdit: High-Fidelity and Temporally Coherent Video Editing + + +
+ In this report, we present MagicEdit, a surprisingly simple yet effective +solution to the text-guided video editing task. We found that high-fidelity and +temporally coherent video-to-video translation can be achieved by explicitly +disentangling the learning of content, structure and motion signals during +training. This is in contradict to most existing methods which attempt to +jointly model both the appearance and temporal representation within a single +framework, which we argue, would lead to degradation in per-frame quality. +Despite its simplicity, we show that MagicEdit supports various downstream +video editing tasks, including video stylization, local editing, video-MagicMix +and video outpainting. + +
+
+ comment: Project page: https://magic-edit.github.io/ +
+
+
+
+
+ + ☆ MagicAvatar: Multimodal Avatar Generation and Animation + + +
+ This report presents MagicAvatar, a framework for multimodal video generation +and animation of human avatars. Unlike most existing methods that generate +avatar-centric videos directly from multimodal inputs (e.g., text prompts), +MagicAvatar explicitly disentangles avatar video generation into two stages: +(1) multimodal-to-motion and (2) motion-to-video generation. The first stage +translates the multimodal inputs into motion/ control signals (e.g., human +pose, depth, DensePose); while the second stage generates avatar-centric video +guided by these motion signals. Additionally, MagicAvatar supports avatar +animation by simply providing a few images of the target person. This +capability enables the animation of the provided human identity according to +the specific motion derived from the first stage. We demonstrate the +flexibility of MagicAvatar through various applications, including text-guided +and video-guided avatar generation, as well as multimodal avatar animation. + +
+
+ comment: Project page: https://magic-avatar.github.io/ +
+
+
+
+
+ + ☆ CoVR: Learning Composed Video Retrieval from Web Video Captions + + +
+ Composed Image Retrieval (CoIR) has recently gained popularity as a task that +considers both text and image queries together, to search for relevant images +in a database. Most CoIR approaches require manually annotated datasets, +comprising image-text-image triplets, where the text describes a modification +from the query image to the target image. However, manual curation of CoIR +triplets is expensive and prevents scalability. In this work, we instead +propose a scalable automatic dataset creation methodology that generates +triplets given video-caption pairs, while also expanding the scope of the task +to include composed video retrieval (CoVR). To this end, we mine paired videos +with a similar caption from a large database, and leverage a large language +model to generate the corresponding modification text. Applying this +methodology to the extensive WebVid2M collection, we automatically construct +our WebVid-CoVR dataset, resulting in 1.6 million triplets. Moreover, we +introduce a new benchmark for CoVR with a manually annotated evaluation set, +along with baseline results. Our experiments further demonstrate that training +a CoVR model on our dataset effectively transfers to CoIR, leading to improved +state-of-the-art performance in the zero-shot setup on both the CIRR and +FashionIQ benchmarks. Our code, datasets, and models are publicly available at +https://imagine.enpc.fr/~ventural/covr. + +
+
+
+
+
+ + ☆ Total Selfie: Generating Full-Body Selfies + + +
+ We present a method to generate full-body selfies -- photos that you take of +yourself, but capturing your whole body as if someone else took the photo of +you from a few feet away. Our approach takes as input a pre-captured video of +your body, a target pose photo, and a selfie + background pair for each +location. We introduce a novel diffusion-based approach to combine all of this +information into high quality, well-composed photos of you with the desired +pose and background. + +
+
+ comment: Project page: + https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/ +
+
+
+
+
+ + ☆ Flexible Techniques for Differentiable Rendering with 3D Gaussians + + +
+ Fast, reliable shape reconstruction is an essential ingredient in many +computer vision applications. Neural Radiance Fields demonstrated that +photorealistic novel view synthesis is within reach, but was gated by +performance requirements for fast reconstruction of real scenes and objects. +Several recent approaches have built on alternative shape representations, in +particular, 3D Gaussians. We develop extensions to these renderers, such as +integrating differentiable optical flow, exporting watertight meshes and +rendering per-ray normals. Additionally, we show how two of the recent methods +are interoperable with each other. These reconstructions are quick, robust, and +easily performed on GPU or CPU. For code and visual examples, see +https://leonidk.github.io/fmb-plus + +
+
+
+
+
+ + ☆ PanoSwin: a Pano-style Swin Transformer for Panorama Understanding CVPR 2023 + + +
+ In panorama understanding, the widely used equirectangular projection (ERP) +entails boundary discontinuity and spatial distortion. It severely deteriorates +the conventional CNNs and vision Transformers on panoramas. In this paper, we +propose a simple yet effective architecture named PanoSwin to learn panorama +representations with ERP. To deal with the challenges brought by +equirectangular projection, we explore a pano-style shift windowing scheme and +novel pitch attention to address the boundary discontinuity and the spatial +distortion, respectively. Besides, based on spherical distance and Cartesian +coordinates, we adapt absolute positional embeddings and relative positional +biases for panoramas to enhance panoramic geometry information. Realizing that +planar image understanding might share some common knowledge with panorama +understanding, we devise a novel two-stage learning framework to facilitate +knowledge transfer from the planar images to panoramas. We conduct experiments +against the state-of-the-art on various panoramic tasks, i.e., panoramic object +detection, panoramic classification, and panoramic layout estimation. The +experimental results demonstrate the effectiveness of PanoSwin in panorama +understanding. + +
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ☆ R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras ICCV 2023 + + +
+ Dense 3D reconstruction and ego-motion estimation are key challenges in +autonomous driving and robotics. Compared to the complex, multi-modal systems +deployed today, multi-camera systems provide a simpler, low-cost alternative. +However, camera-based 3D reconstruction of complex dynamic scenes has proven +extremely difficult, as existing solutions often produce incomplete or +incoherent results. We propose R3D3, a multi-camera system for dense 3D +reconstruction and ego-motion estimation. Our approach iterates between +geometric estimation that exploits spatial-temporal information from multiple +cameras, and monocular depth refinement. We integrate multi-camera feature +correlation and dense bundle adjustment operators that yield robust geometric +depth and pose estimates. To improve reconstruction where geometric depth is +unreliable, e.g. for moving objects or low-textured regions, we introduce +learnable scene priors via a depth refinement network. We show that this design +enables a dense, consistent 3D reconstruction of challenging, dynamic outdoor +environments. Consequently, we achieve state-of-the-art dense depth prediction +on the DDAD and NuScenes benchmarks. + +
+
+ comment: Accepted to ICCV 2023. Project page is available at + https://www.vis.xyz/pub/r3d3/ +
+
+
+
+
+ + ☆ VideoCutLER: Surprisingly Simple Unsupervised Video Instance + Segmentation + + +
+ Existing approaches to unsupervised video instance segmentation typically +rely on motion estimates and experience difficulties tracking small or +divergent motions. We present VideoCutLER, a simple method for unsupervised +multi-instance video segmentation without using motion-based learning signals +like optical flow or training on natural videos. Our key insight is that using +high-quality pseudo masks and a simple video synthesis method for model +training is surprisingly sufficient to enable the resulting video model to +effectively segment and track multiple instances across video frames. We show +the first competitive unsupervised learning results on the challenging +YouTubeVIS-2019 benchmark, achieving 50.7% APvideo^50 , surpassing the previous +state-of-the-art by a large margin. VideoCutLER can also serve as a strong +pretrained model for supervised video instance segmentation tasks, exceeding +DINO by 15.9% on YouTubeVIS-2019 in terms of APvideo. + +
+
+ comment: Preprint. Code: https://github.com/facebookresearch/CutLER +
+
+
+
+
+ + ☆ 360-Degree Panorama Generation from Few Unregistered NFoV Images + + +
+ 360$^\circ$ panoramas are extensively utilized as environmental light sources +in computer graphics. However, capturing a 360$^\circ$ $\times$ 180$^\circ$ +panorama poses challenges due to the necessity of specialized and costly +equipment, and additional human resources. Prior studies develop various +learning-based generative methods to synthesize panoramas from a single Narrow +Field-of-View (NFoV) image, but they are limited in alterable input patterns, +generation quality, and controllability. To address these issues, we propose a +novel pipeline called PanoDiff, which efficiently generates complete +360$^\circ$ panoramas using one or more unregistered NFoV images captured from +arbitrary angles. Our approach has two primary components to overcome the +limitations. Firstly, a two-stage angle prediction module to handle various +numbers of NFoV inputs. Secondly, a novel latent diffusion-based panorama +generation model uses incomplete panorama and text prompts as control signals +and utilizes several geometric augmentation schemes to ensure geometric +properties in generated panoramas. Experiments show that PanoDiff achieves +state-of-the-art panoramic generation quality and high controllability, making +it suitable for applications such as content editing. + +
+
+ comment: Accepted to ACM Multimedia 2023 (MM' 23). Code is available: + https://github.com/shanemankiw/Panodiff +
+
+
+
+
+ + ☆ Video-Based Hand Pose Estimation for Remote Assessment of Bradykinesia + in Parkinson's Disease + + +
+ There is a growing interest in using pose estimation algorithms for +video-based assessment of Bradykinesia in Parkinson's Disease (PD) to +facilitate remote disease assessment and monitoring. However, the accuracy of +pose estimation algorithms in videos from video streaming services during +Telehealth appointments has not been studied. In this study, we used seven +off-the-shelf hand pose estimation models to estimate the movement of the thumb +and index fingers in videos of the finger-tapping (FT) test recorded from +Healthy Controls (HC) and participants with PD and under two different +conditions: streaming (videos recorded during a live Zoom meeting) and +on-device (videos recorded locally with high-quality cameras). The accuracy and +reliability of the models were estimated by comparing the models' output with +manual results. Three of the seven models demonstrated good accuracy for +on-device recordings, and the accuracy decreased significantly for streaming +recordings. We observed a negative correlation between movement speed and the +model's accuracy for the streaming recordings. Additionally, we evaluated the +reliability of ten movement features related to bradykinesia extracted from +video recordings of PD patients performing the FT test. While most of the +features demonstrated excellent reliability for on-device recordings, most of +the features demonstrated poor to moderate reliability for streaming +recordings. Our findings highlight the limitations of pose estimation +algorithms when applied to video recordings obtained during Telehealth visits, +and demonstrate that on-device recordings can be used for automatic +video-assessment of bradykinesia in PD. + +
+
+ comment: 12 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Neural Network-Based Histologic Remission Prediction In Ulcerative + Colitis + + +
+ BACKGROUND & AIMS: Histological remission (HR) is advocated and considered as +a new therapeutic target in ulcerative colitis (UC). Diagnosis of histologic +remission currently relies on biopsy; during this process, patients are at risk +for bleeding, infection, and post-biopsy fibrosis. In addition, histologic +response scoring is complex and time-consuming, and there is heterogeneity +among pathologists. Endocytoscopy (EC) is a novel ultra-high magnification +endoscopic technique that can provide excellent in vivo assessment of glands. +Based on the EC technique, we propose a neural network model that can assess +histological disease activity in UC using EC images to address the above +issues. The experiment results demonstrate that the proposed method can assist +patients in precise treatment and prognostic assessment. + METHODS: We construct a neural network model for UC evaluation. A total of +5105 images of 154 intestinal segments from 87 patients undergoing EC treatment +at a center in China between March 2022 and March 2023 are scored according to +the Geboes score. Subsequently, 103 intestinal segments are used as the +training set, 16 intestinal segments are used as the validation set for neural +network training, and the remaining 35 intestinal segments are used as the test +set to measure the model performance together with the validation set. + RESULTS: By treating HR as a negative category and histologic activity as a +positive category, the proposed neural network model can achieve an accuracy of +0.9, a specificity of 0.95, a sensitivity of 0.75, and an area under the curve +(AUC) of 0.81. + CONCLUSION: We develop a specific neural network model that can distinguish +histologic remission/activity in EC images of UC, which helps to accelerate +clinical histological diagnosis. + keywords: ulcerative colitis; Endocytoscopy; Geboes score; neural network. + +
+
+
+
+
+ + ☆ Comparison of automated crater catalogs for Mars from Benedix et al. + (2020) and Lee and Hogan (2021) + + +
+ Crater mapping using neural networks and other automated methods has +increased recently with automated Crater Detection Algorithms (CDAs) applied to +planetary bodies throughout the solar system. A recent publication by Benedix +et al. (2020) showed high performance at small scales compared to similar +automated CDAs but with a net positive diameter bias in many crater candidates. +I compare the publicly available catalogs from Benedix et al. (2020) and Lee & +Hogan (2021) and show that the reported performance is sensitive to the metrics +used to test the catalogs. I show how the more permissive comparison methods +indicate a higher CDA performance by allowing worse candidate craters to match +ground-truth craters. I show that the Benedix et al. (2020) catalog has a +substantial performance loss with increasing latitude and identify an image +projection issue that might cause this loss. Finally, I suggest future +applications of neural networks in generating large scientific datasets be +validated using secondary networks with independent data sources or training +methods. + +
+
+ comment: 14 pages, 6 figures. Accepted August 13th 2023 +
+
+
+
+
+ + ☆ VesselShot: Few-shot learning for cerebral blood vessel segmentation + + +
+ Angiography is widely used to detect, diagnose, and treat cerebrovascular +diseases. While numerous techniques have been proposed to segment the vascular +network from different imaging modalities, deep learning (DL) has emerged as a +promising approach. However, existing DL methods often depend on proprietary +datasets and extensive manual annotation. Moreover, the availability of +pre-trained networks specifically for medical domains and 3D volumes is +limited. To overcome these challenges, we propose a few-shot learning approach +called VesselShot for cerebrovascular segmentation. VesselShot leverages +knowledge from a few annotated support images and mitigates the scarcity of +labeled data and the need for extensive annotation in cerebral blood vessel +segmentation. We evaluated the performance of VesselShot using the publicly +available TubeTK dataset for the segmentation task, achieving a mean Dice +coefficient (DC) of 0.62(0.03). + +
+
+
+
+
+ + ☆ Compositional Semantic Mix for Domain Adaptation in Point Cloud + Segmentation + + +
+ Deep-learning models for 3D point cloud semantic segmentation exhibit limited +generalization capabilities when trained and tested on data captured with +different sensors or in varying environments due to domain shift. Domain +adaptation methods can be employed to mitigate this domain shift, for instance, +by simulating sensor noise, developing domain-agnostic generators, or training +point cloud completion networks. Often, these methods are tailored for range +view maps or necessitate multi-modal input. In contrast, domain adaptation in +the image domain can be executed through sample mixing, which emphasizes input +data manipulation rather than employing distinct adaptation modules. In this +study, we introduce compositional semantic mixing for point cloud domain +adaptation, representing the first unsupervised domain adaptation technique for +point cloud segmentation based on semantic and geometric sample mixing. We +present a two-branch symmetric network architecture capable of concurrently +processing point clouds from a source domain (e.g. synthetic) and point clouds +from a target domain (e.g. real-world). Each branch operates within one domain +by integrating selected data fragments from the other domain and utilizing +semantic information derived from source labels and target (pseudo) labels. +Additionally, our method can leverage a limited number of human point-level +annotations (semi-supervised) to further enhance performance. We assess our +approach in both synthetic-to-real and real-to-real scenarios using LiDAR +datasets and demonstrate that it significantly outperforms state-of-the-art +methods in both unsupervised and semi-supervised settings. + +
+
+ comment: TPAMI. arXiv admin note: text overlap with arXiv:2207.09778 +
+
+
+
+
+ + ☆ VoroMesh: Learning Watertight Surface Meshes with Voronoi Diagrams + + +
+ In stark contrast to the case of images, finding a concise, learnable +discrete representation of 3D surfaces remains a challenge. In particular, +while polygon meshes are arguably the most common surface representation used +in geometry processing, their irregular and combinatorial structure often make +them unsuitable for learning-based applications. In this work, we present +VoroMesh, a novel and differentiable Voronoi-based representation of watertight +3D shape surfaces. From a set of 3D points (called generators) and their +associated occupancy, we define our boundary representation through the Voronoi +diagram of the generators as the subset of Voronoi faces whose two associated +(equidistant) generators are of opposite occupancy: the resulting polygon mesh +forms a watertight approximation of the target shape's boundary. To learn the +position of the generators, we propose a novel loss function, dubbed VoroLoss, +that minimizes the distance from ground truth surface samples to the closest +faces of the Voronoi diagram which does not require an explicit construction of +the entire Voronoi diagram. A direct optimization of the Voroloss to obtain +generators on the Thingi32 dataset demonstrates the geometric efficiency of our +representation compared to axiomatic meshing algorithms and recent +learning-based mesh representations. We further use VoroMesh in a +learning-based mesh prediction task from input SDF grids on the ABC dataset, +and show comparable performance to state-of-the-art methods while guaranteeing +closed output surfaces free of self-intersections. + +
+
+
+
+
+ + ☆ MS-Net: A Multi-modal Self-supervised Network for Fine-Grained + Classification of Aircraft in SAR Images + + +
+ Synthetic aperture radar (SAR) imaging technology is commonly used to provide +24-hour all-weather earth observation. However, it still has some drawbacks in +SAR target classification, especially in fine-grained classification of +aircraft: aircrafts in SAR images have large intra-class diversity and +inter-class similarity; the number of effective samples is insufficient and +it's hard to annotate. To address these issues, this article proposes a novel +multi-modal self-supervised network (MS-Net) for fine-grained classification of +aircraft. Firstly, in order to entirely exploit the potential of multi-modal +information, a two-sided path feature extraction network (TSFE-N) is +constructed to enhance the image feature of the target and obtain the domain +knowledge feature of text mode. Secondly, a contrastive self-supervised +learning (CSSL) framework is employed to effectively learn useful +label-independent feature from unbalanced data, a similarity per-ception loss +(SPloss) is proposed to avoid network overfitting. Finally, TSFE-N is used as +the encoder of CSSL to obtain the classification results. Through a large +number of experiments, our MS-Net can effectively reduce the difficulty of +classifying similar types of aircrafts. In the case of no label, the proposed +algorithm achieves an accuracy of 88.46% for 17 types of air-craft +classification task, which has pioneering significance in the field of +fine-grained classification of aircraft in SAR images. + +
+
+
+
+
+ + ☆ A Transformer-Conditioned Neural Fields Pipeline with Polar Coordinate + Representation for Astronomical Radio Interferometric Data Reconstruction + + +
+ In radio astronomy, visibility data, which are measurements of wave signals +from radio telescopes, are transformed into images for observation of distant +celestial objects. However, these resultant images usually contain both real +sources and artifacts, due to signal sparsity and other factors. One way to +obtain cleaner images is to reconstruct samples into dense forms before +imaging. Unfortunately, existing visibility reconstruction methods may miss +some components of the frequency data, so blurred object edges and persistent +artifacts remain in the images. Furthermore, the computation overhead is high +on irregular visibility samples due to the data skew. To address these +problems, we propose PolarRec, a reconstruction method for interferometric +visibility data, which consists of a transformer-conditioned neural fields +pipeline with a polar coordinate representation. This representation matches +the way in which telescopes observe a celestial area as the Earth rotates. We +further propose Radial Frequency Loss function, using radial coordinates in the +polar coordinate system to correlate with the frequency information, to help +reconstruct complete visibility. We also group visibility sample points by +angular coordinates in the polar coordinate system, and use groups as the +granularity for subsequent encoding with a Transformer encoder. Consequently, +our method can capture the inherent characteristics of visibility data +effectively and efficiently. Our experiments demonstrate that PolarRec markedly +improves imaging results by faithfully reconstructing all frequency components +in the visibility domain while significantly reducing the computation cost. + +
+
+
+
+
+ + ☆ A Generalization of Continuous Relaxation in Structured Pruning + + +
+ Deep learning harnesses massive parallel floating-point processing to train +and evaluate large neural networks. Trends indicate that deeper and larger +neural networks with an increasing number of parameters achieve higher accuracy +than smaller neural networks. This performance improvement, which often +requires heavy compute for both training and evaluation, eventually needs to +translate well to resource-constrained hardware for practical value. Structured +pruning asserts that while large networks enable us to find solutions to +complex computer vision problems, a smaller, computationally efficient +sub-network can be derived from the large neural network that retains model +accuracy but significantly improves computational efficiency. + We generalize structured pruning with algorithms for network augmentation, +pruning, sub-network collapse and removal. In addition, we demonstrate +efficient and stable convergence up to 93% sparsity and 95% FLOPs reduction +without loss of inference accuracy using with continuous relaxation matching or +exceeding the state of the art for all structured pruning methods. The +resulting CNN executes efficiently on GPU hardware without computationally +expensive sparse matrix operations. We achieve this with routine automatable +operations on classification and segmentation problems using CIFAR-10, +ImageNet, and CityScapes datasets with the ResNet and U-NET network +architectures. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ SAM-PARSER: Fine-tuning SAM Efficiently by Parameter Space + Reconstruction + + +
+ Segment Anything Model (SAM) has received remarkable attention as it offers a +powerful and versatile solution for object segmentation in images. However, +fine-tuning SAM for downstream segmentation tasks under different scenarios +remains a challenge, as the varied characteristics of different scenarios +naturally requires diverse model parameter spaces. Most existing fine-tuning +methods attempt to bridge the gaps among different scenarios by introducing a +set of new parameters to modify SAM's original parameter space. Unlike these +works, in this paper, we propose fine-tuning SAM efficiently by parameter space +reconstruction (SAM-PARSER), which introduce nearly zero trainable parameters +during fine-tuning. In SAM-PARSER, we assume that SAM's original parameter +space is relatively complete, so that its bases are able to reconstruct the +parameter space of a new scenario. We obtain the bases by matrix decomposition, +and fine-tuning the coefficients to reconstruct the parameter space tailored to +the new scenario by an optimal linear combination of the bases. Experimental +results show that SAM-PARSER exhibits superior segmentation performance across +various scenarios, while reducing the number of trainable parameters by +$\approx 290$ times compared with current parameter-efficient fine-tuning +methods. + +
+
+
+
+
+ + ☆ S-TREK: Sequential Translation and Rotation Equivariant Keypoints for + local feature extraction ICCV 2023 + + +
+ In this work we introduce S-TREK, a novel local feature extractor that +combines a deep keypoint detector, which is both translation and rotation +equivariant by design, with a lightweight deep descriptor extractor. We train +the S-TREK keypoint detector within a framework inspired by reinforcement +learning, where we leverage a sequential procedure to maximize a reward +directly related to keypoint repeatability. Our descriptor network is trained +following a "detect, then describe" approach, where the descriptor loss is +evaluated only at those locations where keypoints have been selected by the +already trained detector. Extensive experiments on multiple benchmarks confirm +the effectiveness of our proposed method, with S-TREK often outperforming other +state-of-the-art methods in terms of repeatability and quality of the recovered +poses, especially when dealing with in-plane rotations. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Adversarial Attacks on Foundational Vision Models + + +
+ Rapid progress is being made in developing large, pretrained, task-agnostic +foundational vision models such as CLIP, ALIGN, DINOv2, etc. In fact, we are +approaching the point where these models do not have to be finetuned +downstream, and can simply be used in zero-shot or with a lightweight probing +head. Critically, given the complexity of working at this scale, there is a +bottleneck where relatively few organizations in the world are executing the +training then sharing the models on centralized platforms such as HuggingFace +and torch.hub. The goal of this work is to identify several key adversarial +vulnerabilities of these models in an effort to make future designs more +robust. Intuitively, our attacks manipulate deep feature representations to +fool an out-of-distribution (OOD) detector which will be required when using +these open-world-aware models to solve closed-set downstream tasks. Our methods +reliably make in-distribution (ID) images (w.r.t. a downstream task) be +predicted as OOD and vice versa while existing in extremely +low-knowledge-assumption threat models. We show our attacks to be potent in +whitebox and blackbox settings, as well as when transferred across foundational +model types (e.g., attack DINOv2 with CLIP)! This work is only just the +beginning of a long journey towards adversarially robust foundational vision +models. + +
+
+
+
+
+ + ☆ LatentDR: Improving Model Generalization Through Sample-Aware Latent + Degradation and Restoration + + +
+ Despite significant advances in deep learning, models often struggle to +generalize well to new, unseen domains, especially when training data is +limited. To address this challenge, we propose a novel approach for +distribution-aware latent augmentation that leverages the relationships across +samples to guide the augmentation procedure. Our approach first degrades the +samples stochastically in the latent space, mapping them to augmented labels, +and then restores the samples from their corrupted versions during training. +This process confuses the classifier in the degradation step and restores the +overall class distribution of the original samples, promoting diverse +intra-class/cross-domain variability. We extensively evaluate our approach on a +diverse set of datasets and tasks, including domain generalization benchmarks +and medical imaging datasets with strong domain shift, where we show our +approach achieves significant improvements over existing methods for latent +space augmentation. We further show that our method can be flexibly adapted to +long-tail recognition tasks, demonstrating its versatility in building more +generalizable models. Code is available at +https://github.com/nerdslab/LatentDR. + +
+
+
+
+
+ + ☆ Neural Network Training Strategy to Enhance Anomaly Detection + Performance: A Perspective on Reconstruction Loss Amplification + + +
+ Unsupervised anomaly detection (UAD) is a widely adopted approach in industry +due to rare anomaly occurrences and data imbalance. A desirable characteristic +of an UAD model is contained generalization ability which excels in the +reconstruction of seen normal patterns but struggles with unseen anomalies. +Recent studies have pursued to contain the generalization capability of their +UAD models in reconstruction from different perspectives, such as design of +neural network (NN) structure and training strategy. In contrast, we note that +containing of generalization ability in reconstruction can also be obtained +simply from steep-shaped loss landscape. Motivated by this, we propose a loss +landscape sharpening method by amplifying the reconstruction loss, dubbed Loss +AMPlification (LAMP). LAMP deforms the loss landscape into a steep shape so the +reconstruction error on unseen anomalies becomes greater. Accordingly, the +anomaly detection performance is improved without any change of the NN +architecture. Our findings suggest that LAMP can be easily applied to any +reconstruction error metrics in UAD settings where the reconstruction model is +trained with anomaly-free samples only. + +
+
+ comment: 5 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Learning to Read Analog Gauges from Synthetic Data + + +
+ Manually reading and logging gauge data is time inefficient, and the effort +increases according to the number of gauges available. We present a computer +vision pipeline that automates the reading of analog gauges. We propose a +two-stage CNN pipeline that identifies the key structural components of an +analog gauge and outputs an angular reading. To facilitate the training of our +approach, a synthetic dataset is generated thus obtaining a set of realistic +analog gauges with their corresponding annotation. To validate our proposal, an +additional real-world dataset was collected with 4.813 manually curated images. +When compared against state-of-the-art methodologies, our method shows a +significant improvement of 4.55 in the average error, which is a 52% relative +improvement. The resources for this project will be made available at: +https://github.com/fuankarion/automatic-gauge-reading. + +
+
+
+
+
+ + ☆ Referring Image Segmentation Using Text Supervision ICCV 2023 + + +
+ Existing Referring Image Segmentation (RIS) methods typically require +expensive pixel-level or box-level annotations for supervision. In this paper, +we observe that the referring texts used in RIS already provide sufficient +information to localize the target object. Hence, we propose a novel +weakly-supervised RIS framework to formulate the target localization problem as +a classification process to differentiate between positive and negative text +expressions. While the referring text expressions for an image are used as +positive expressions, the referring text expressions from other images can be +used as negative expressions for this image. Our framework has three main +novelties. First, we propose a bilateral prompt method to facilitate the +classification process, by harmonizing the domain discrepancy between visual +and linguistic features. Second, we propose a calibration method to reduce +noisy background information and improve the correctness of the response maps +for target object localization. Third, we propose a positive response map +selection strategy to generate high-quality pseudo-labels from the enhanced +response maps, for training a segmentation network for RIS inference. For +evaluation, we propose a new metric to measure localization accuracy. +Experiments on four benchmarks show that our framework achieves promising +performances to existing fully-supervised RIS methods while outperforming +state-of-the-art weakly-supervised methods adapted from related areas. Code is +available at https://github.com/fawnliu/TRIS. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ SAAN: Similarity-aware attention flow network for change detection with + VHR remote sensing images + + +
+ Change detection (CD) is a fundamental and important task for monitoring the +land surface dynamics in the earth observation field. Existing deep +learning-based CD methods typically extract bi-temporal image features using a +weight-sharing Siamese encoder network and identify change regions using a +decoder network. These CD methods, however, still perform far from +satisfactorily as we observe that 1) deep encoder layers focus on irrelevant +background regions and 2) the models' confidence in the change regions is +inconsistent at different decoder stages. The first problem is because deep +encoder layers cannot effectively learn from imbalanced change categories using +the sole output supervision, while the second problem is attributed to the lack +of explicit semantic consistency preservation. To address these issues, we +design a novel similarity-aware attention flow network (SAAN). SAAN +incorporates a similarity-guided attention flow module with deeply supervised +similarity optimization to achieve effective change detection. Specifically, we +counter the first issue by explicitly guiding deep encoder layers to discover +semantic relations from bi-temporal input images using deeply supervised +similarity optimization. The extracted features are optimized to be +semantically similar in the unchanged regions and dissimilar in the changing +regions. The second drawback can be alleviated by the proposed +similarity-guided attention flow module, which incorporates similarity-guided +attention modules and attention flow mechanisms to guide the model to focus on +discriminative channels and regions. We evaluated the effectiveness and +generalization ability of the proposed method by conducting experiments on a +wide range of CD tasks. The experimental results demonstrate that our method +achieves excellent performance on several CD tasks, with discriminative +features and semantic consistency preserved. + +
+
+ comment: 15 pages,13 figures +
+
+
+
+
+ + ☆ Face Presentation Attack Detection by Excavating Causal Clues and + Adapting Embedding Statistics WACV 2024 + + +
+ Recent face presentation attack detection (PAD) leverages domain adaptation +(DA) and domain generalization (DG) techniques to address performance +degradation on unknown domains. However, DA-based PAD methods require access to +unlabeled target data, while most DG-based PAD solutions rely on a priori, +i.e., known domain labels. Moreover, most DA-/DG-based methods are +computationally intensive, demanding complex model architectures and/or +multi-stage training processes. This paper proposes to model face PAD as a +compound DG task from a causal perspective, linking it to model optimization. +We excavate the causal factors hidden in the high-level representation via +counterfactual intervention. Moreover, we introduce a class-guided MixStyle to +enrich feature-level data distribution within classes instead of focusing on +domain information. Both class-guided MixStyle and counterfactual intervention +components introduce no extra trainable parameters and negligible computational +resources. Extensive cross-dataset and analytic experiments demonstrate the +effectiveness and efficiency of our method compared to state-of-the-art PADs. +The implementation and the trained weights are publicly available. + +
+
+ comment: Accepted at WACV 2024 +
+
+
+
+
+ + ☆ Semi-Supervised Learning for Visual Bird's Eye View Semantic + Segmentation + + +
+ Visual bird's eye view (BEV) semantic segmentation helps autonomous vehicles +understand the surrounding environment only from images, including static +elements (e.g., roads) and dynamic elements (e.g., vehicles, pedestrians). +However, the high cost of annotation procedures of full-supervised methods +limits the capability of the visual BEV semantic segmentation, which usually +needs HD maps, 3D object bounding boxes, and camera extrinsic matrixes. In this +paper, we present a novel semi-supervised framework for visual BEV semantic +segmentation to boost performance by exploiting unlabeled images during the +training. A consistency loss that makes full use of unlabeled data is then +proposed to constrain the model on not only semantic prediction but also the +BEV feature. Furthermore, we propose a novel and effective data augmentation +method named conjoint rotation which reasonably augments the dataset while +maintaining the geometric relationship between the front-view images and the +BEV semantic segmentation. Extensive experiments on the nuScenes and Argoverse +datasets show that our semi-supervised framework can effectively improve +prediction accuracy. To the best of our knowledge, this is the first work that +explores improving visual BEV semantic segmentation performance using unlabeled +data. The code will be publicly available. + +
+
+
+
+
+ + ☆ LAC -- Latent Action Composition for Skeleton-based Action Segmentation ICCV 2023 + + +
+ Skeleton-based action segmentation requires recognizing composable actions in +untrimmed videos. Current approaches decouple this problem by first extracting +local visual features from skeleton sequences and then processing them by a +temporal model to classify frame-wise actions. However, their performances +remain limited as the visual features cannot sufficiently express composable +actions. In this context, we propose Latent Action Composition (LAC), a novel +self-supervised framework aiming at learning from synthesized composable +motions for skeleton-based action segmentation. LAC is composed of a novel +generation module towards synthesizing new sequences. Specifically, we design a +linear latent space in the generator to represent primitive motion. New +composed motions can be synthesized by simply performing arithmetic operations +on latent representations of multiple input skeleton sequences. LAC leverages +such synthesized sequences, which have large diversity and complexity, for +learning visual representations of skeletons in both sequence and frame spaces +via contrastive learning. The resulting visual encoder has a high expressive +power and can be effectively transferred onto action segmentation tasks by +end-to-end fine-tuning without the need for additional temporal models. We +conduct a study focusing on transfer-learning and we show that representations +learned from pre-trained LAC outperform the state-of-the-art by a large margin +on TSU, Charades, PKU-MMD datasets. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ PointHPS: Cascaded 3D Human Pose and Shape Estimation from Point Clouds + + +
+ Human pose and shape estimation (HPS) has attracted increasing attention in +recent years. While most existing studies focus on HPS from 2D images or videos +with inherent depth ambiguity, there are surging need to investigate HPS from +3D point clouds as depth sensors have been frequently employed in commercial +devices. However, real-world sensory 3D points are usually noisy and +incomplete, and also human bodies could have different poses of high diversity. +To tackle these challenges, we propose a principled framework, PointHPS, for +accurate 3D HPS from point clouds captured in real-world settings, which +iteratively refines point features through a cascaded architecture. +Specifically, each stage of PointHPS performs a series of downsampling and +upsampling operations to extract and collate both local and global cues, which +are further enhanced by two novel modules: 1) Cross-stage Feature Fusion (CFF) +for multi-scale feature propagation that allows information to flow effectively +through the stages, and 2) Intermediate Feature Enhancement (IFE) for +body-aware feature aggregation that improves feature quality after each stage. +To facilitate a comprehensive study under various scenarios, we conduct our +experiments on two large-scale benchmarks, comprising i) a dataset that +features diverse subjects and actions captured by real commercial sensors in a +laboratory environment, and ii) controlled synthetic data generated with +realistic considerations such as clothed humans in crowded outdoor scenes. +Extensive experiments demonstrate that PointHPS, with its powerful point +feature extraction and processing scheme, outperforms State-of-the-Art methods +by significant margins across the board. Homepage: +https://caizhongang.github.io/projects/PointHPS/. + +
+
+
+
+
+ + ☆ Group Regression for Query Based Object Detection and Tracking SC 2023 + + +
+ Group regression is commonly used in 3D object detection to predict box +parameters of similar classes in a joint head, aiming to benefit from +similarities while separating highly dissimilar classes. For query-based +perception methods, this has, so far, not been feasible. We close this gap and +present a method to incorporate multi-class group regression, especially +designed for the 3D domain in the context of autonomous driving, into existing +attention and query-based perception approaches. We enhance a transformer based +joint object detection and tracking model with this approach, and thoroughly +evaluate its behavior and performance. For group regression, the classes of the +nuScenes dataset are divided into six groups of similar shape and prevalence, +each being regressed by a dedicated head. We show that the proposed method is +applicable to many existing transformer based perception approaches and can +bring potential benefits. The behavior of query group regression is thoroughly +analyzed in comparison to a unified regression head, e.g. in terms of +class-switching behavior and distribution of the output parameters. The +proposed method offers many possibilities for further research, such as in the +direction of deep multi-hypotheses tracking. + +
+
+ comment: Accepted for publication at the 2023 26th IEEE International + Conference on Intelligent Transportation Systems (ITSC 2023), Sep 24-28, + 2023, in Bilbao, Spain +
+
+
+
+
+ + ☆ Priority-Centric Human Motion Generation in Discrete Latent Space ICCV2023 + + +
+ Text-to-motion generation is a formidable task, aiming to produce human +motions that align with the input text while also adhering to human +capabilities and physical laws. While there have been advancements in diffusion +models, their application in discrete spaces remains underexplored. Current +methods often overlook the varying significance of different motions, treating +them uniformly. It is essential to recognize that not all motions hold the same +relevance to a particular textual description. Some motions, being more salient +and informative, should be given precedence during generation. In response, we +introduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which +utilizes a Transformer-based VQ-VAE to derive a concise, discrete motion +representation, incorporating a global self-attention mechanism and a +regularization term to counteract code collapse. We also present a motion +discrete diffusion model that employs an innovative noise schedule, determined +by the significance of each motion token within the entire motion sequence. +This approach retains the most salient motions during the reverse diffusion +process, leading to more semantically rich and varied motions. Additionally, we +formulate two strategies to gauge the importance of motion tokens, drawing from +both textual and visual indicators. Comprehensive experiments on the HumanML3D +and KIT-ML datasets confirm that our model surpasses existing techniques in +fidelity and diversity, particularly for intricate textual descriptions. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Medical needle tip tracking based on Optical Imaging and AI + + +
+ Deep needle insertion to a target often poses a huge challenge, requiring a +combination of specialized skills, assistive technology, and extensive +training. One of the frequently encountered medical scenarios demanding such +expertise includes the needle insertion into a femoral vessel in the groin. +After the access to the femoral vessel, various medical procedures, such as +cardiac catheterization and extracorporeal membrane oxygenation (ECMO) can be +performed. However, even with the aid of Ultrasound imaging, achieving +successful insertion can necessitate multiple attempts due to the complexities +of anatomy and tissue deformation. To address this challenge, this paper +presents an innovative technology for needle tip real-time tracking, aiming for +enhanced needle insertion guidance. Specifically, our approach revolves around +the creation of scattering imaging using an optical fiber-equipped needle, and +uses Convolutional Neural Network (CNN) based algorithms to enable real-time +estimation of the needle tip's position and orientation during insertion +procedures. The efficacy of the proposed technology was rigorously evaluated +through three experiments. The first two experiments involved rubber and bacon +phantoms to simulate groin anatomy. The positional errors averaging 2.3+1.5mm +and 2.0+1.2mm, and the orientation errors averaging 0.2+0.11rad and +0.16+0.1rad. Furthermore, the system's capabilities were validated through +experiments conducted on fresh porcine phantom mimicking more complex +anatomical structures, yielding positional accuracy results of 3.2+3.1mm and +orientational accuracy of 0.19+0.1rad. Given the average femoral arterial +radius of 4 to 5mm, the proposed system is demonstrated with a great potential +for precise needle guidance in femoral artery insertion procedures. In +addition, the findings highlight the broader potential applications of the +system in the medical field. + +
+
+
+
+
+ + ☆ Pixel-Aware Stable Diffusion for Realistic Image Super-resolution and + Personalized Stylization + + +
+ Realistic image super-resolution (Real-ISR) aims to reproduce perceptually +realistic image details from a low-quality input. The commonly used adversarial +training based Real-ISR methods often introduce unnatural visual artifacts and +fail to generate realistic textures for natural scene images. The recently +developed generative stable diffusion models provide a potential solution to +Real-ISR with pre-learned strong image priors. However, the existing methods +along this line either fail to keep faithful pixel-wise image structures or +resort to extra skipped connections to reproduce details, which requires +additional training in image space and limits their extension to other related +tasks in latent space such as image stylization. In this work, we propose a +pixel-aware stable diffusion (PASD) network to achieve robust Real-ISR as well +as personalized stylization. In specific, a pixel-aware cross attention module +is introduced to enable diffusion models perceiving image local structures in +pixel-wise level, while a degradation removal module is used to extract +degradation insensitive features to guide the diffusion process together with +image high level information. By simply replacing the base diffusion model with +a personalized one, our method can generate diverse stylized images without the +need to collect pairwise training data. PASD can be easily integrated into +existing diffusion models such as Stable Diffusion. Experiments on Real-ISR and +personalized stylization demonstrate the effectiveness of our proposed +approach. The source code and models can be found at +\url{https://github.com/yangxy/PASD}. + +
+
+
+
+
+ + ☆ Improving the performance of object detection by preserving label + distribution + + +
+ Object detection is a task that performs position identification and label +classification of objects in images or videos. The information obtained through +this process plays an essential role in various tasks in the field of computer +vision. In object detection, the data utilized for training and validation +typically originate from public datasets that are well-balanced in terms of the +number of objects ascribed to each class in an image. However, in real-world +scenarios, handling datasets with much greater class imbalance, i.e., very +different numbers of objects for each class , is much more common, and this +imbalance may reduce the performance of object detection when predicting unseen +test images. In our study, thus, we propose a method that evenly distributes +the classes in an image for training and validation, solving the class +imbalance problem in object detection. Our proposed method aims to maintain a +uniform class distribution through multi-label stratification. We tested our +proposed method not only on public datasets that typically exhibit balanced +class distribution but also on custom datasets that may have imbalanced class +distribution. We found that our proposed method was more effective on datasets +containing severe imbalance and less data. Our findings indicate that the +proposed method can be effectively used on datasets with substantially +imbalanced class distribution. + +
+
+ comment: Code is available at + https://github.com/leeheewon-01/YOLOstratifiedKFold/tree/main +
+
+
+
+
+ + ☆ Spatio-Temporal Analysis of Patient-Derived Organoid Videos Using Deep + Learning for the Prediction of Drug Efficacy + + +
+ Over the last ten years, Patient-Derived Organoids (PDOs) emerged as the most +reliable technology to generate ex-vivo tumor avatars. PDOs retain the main +characteristics of their original tumor, making them a system of choice for +pre-clinical and clinical studies. In particular, PDOs are attracting interest +in the field of Functional Precision Medicine (FPM), which is based upon an +ex-vivo drug test in which living tumor cells (such as PDOs) from a specific +patient are exposed to a panel of anti-cancer drugs. Currently, the Adenosine +Triphosphate (ATP) based cell viability assay is the gold standard test to +assess the sensitivity of PDOs to drugs. The readout is measured at the end of +the assay from a global PDO population and therefore does not capture single +PDO responses and does not provide time resolution of drug effect. To this end, +in this study, we explore for the first time the use of powerful large +foundation models for the automatic processing of PDO data. In particular, we +propose a novel imaging-based high-throughput screening method to assess +real-time drug efficacy from a time-lapse microscopy video of PDOs. The +recently proposed SAM algorithm for segmentation and DINOv2 model are adapted +in a comprehensive pipeline for processing PDO microscopy frames. Moreover, an +attention mechanism is proposed for fusing temporal and spatial features in a +multiple instance learning setting to predict ATP. We report better results +than other non-time-resolved methods, indicating that the temporality of data +is an important factor for the prediction of ATP. Extensive ablations shed +light on optimizing the experimental setting and automating the prediction both +in real-time and for forecasting. + +
+
+
+
+
+ + ☆ ExpCLIP: Bridging Text and Facial Expressions via Semantic Alignment + + +
+ The objective of stylized speech-driven facial animation is to create +animations that encapsulate specific emotional expressions. Existing methods +often depend on pre-established emotional labels or facial expression +templates, which may limit the necessary flexibility for accurately conveying +user intent. In this research, we introduce a technique that enables the +control of arbitrary styles by leveraging natural language as emotion prompts. +This technique presents benefits in terms of both flexibility and +user-friendliness. To realize this objective, we initially construct a +Text-Expression Alignment Dataset (TEAD), wherein each facial expression is +paired with several prompt-like descriptions.We propose an innovative automatic +annotation method, supported by Large Language Models (LLMs), to expedite the +dataset construction, thereby eliminating the substantial expense of manual +annotation. Following this, we utilize TEAD to train a CLIP-based model, termed +ExpCLIP, which encodes text and facial expressions into semantically aligned +style embeddings. The embeddings are subsequently integrated into the facial +animation generator to yield expressive and controllable facial animations. +Given the limited diversity of facial emotions in existing speech-driven facial +animation training data, we further introduce an effective Expression Prompt +Augmentation (EPA) mechanism to enable the animation generator to support +unprecedented richness in style control. Comprehensive experiments illustrate +that our method accomplishes expressive facial animation generation and offers +enhanced flexibility in effectively conveying the desired style. + +
+
+
+
+
+ + ☆ Data-iterative Optimization Score Model for Stable Ultra-Sparse-View CT + Reconstruction + + +
+ Score-based generative models (SGMs) have gained prominence in sparse-view CT +reconstruction for their precise sampling of complex distributions. In +SGM-based reconstruction, data consistency in the score-based diffusion model +ensures close adherence of generated samples to observed data distribution, +crucial for improving image quality. Shortcomings in data consistency +characterization manifest in three aspects. Firstly, data from the optimization +process can lead to artifacts in reconstructed images. Secondly, it often +neglects that the generation model and original data constraints are +independently completed, fragmenting unity. Thirdly, it predominantly focuses +on constraining intermediate results in the inverse sampling process, rather +than ideal real images. Thus, we propose an iterative optimization data scoring +model. This paper introduces the data-iterative optimization score-based model +(DOSM), integrating innovative data consistency into the Stochastic +Differential Equation, a valuable constraint for ultra-sparse-view CT +reconstruction. The novelty of this data consistency element lies in its sole +reliance on original measurement data to confine generation outcomes, +effectively balancing measurement data and generative model constraints. +Additionally, we pioneer an inference strategy that traces back from current +iteration results to ideal truth, enhancing reconstruction stability. We +leverage conventional iteration techniques to optimize DOSM updates. +Quantitative and qualitative results from 23 views of numerical and clinical +cardiac datasets demonstrate DOSM's superiority over other methods. Remarkably, +even with 10 views, our method achieves excellent performance. + +
+
+ comment: 11 pages, 12 figures +
+
+
+
+
+ + ☆ Graph-based Asynchronous Event Processing for Rapid Object Recognition ICCV 2021 + + +
+ Different from traditional video cameras, event cameras capture asynchronous +events stream in which each event encodes pixel location, trigger time, and the +polarity of the brightness changes. In this paper, we introduce a novel +graph-based framework for event cameras, namely SlideGCN. Unlike some recent +graph-based methods that use groups of events as input, our approach can +efficiently process data event-by-event, unlock the low latency nature of +events data while still maintaining the graph's structure internally. For fast +graph construction, we develop a radius search algorithm, which better exploits +the partial regular structure of event cloud against k-d tree based generic +methods. Experiments show that our method reduces the computational complexity +up to 100 times with respect to current graph-based methods while keeping +state-of-the-art performance on object recognition. Moreover, we verify the +superiority of event-wise processing with our method. When the state becomes +stable, we can give a prediction with high confidence, thus making an early +recognition. Project page: \url{https://zju3dv.github.io/slide_gcn/}. + +
+
+ comment: Accepted to ICCV 2021. Project Page: + https://zju3dv.github.io/slide_gcn/ +
+
+
+
+
+ + ☆ Multi-Scale and Multi-Layer Contrastive Learning for Domain + Generalization + + +
+ During the past decade, deep neural networks have led to fast-paced progress +and significant achievements in computer vision problems, for both academia and +industry. Yet despite their success, state-of-the-art image classification +approaches fail to generalize well in previously unseen visual contexts, as +required by many real-world applications. In this paper, we focus on this +domain generalization (DG) problem and argue that the generalization ability of +deep convolutional neural networks can be improved by taking advantage of +multi-layer and multi-scaled representations of the network. We introduce a +framework that aims at improving domain generalization of image classifiers by +combining both low-level and high-level features at multiple scales, enabling +the network to implicitly disentangle representations in its latent space and +learn domain-invariant attributes of the depicted objects. Additionally, to +further facilitate robust representation learning, we propose a novel objective +function, inspired by contrastive learning, which aims at constraining the +extracted representations to remain invariant under distribution shifts. We +demonstrate the effectiveness of our method by evaluating on the domain +generalization datasets of PACS, VLCS, Office-Home and NICO. Through extensive +experimentation, we show that our model is able to surpass the performance of +previous DG methods and consistently produce competitive and state-of-the-art +results in all datasets. + +
+
+ comment: Manuscript under review at: IEEE Transactions on Artificial + Intelligence +
+
+
+
+
+ + ☆ INF: Implicit Neural Fusion for LiDAR and Camera IROS 2023 + + +
+ Sensor fusion has become a popular topic in robotics. However, conventional +fusion methods encounter many difficulties, such as data representation +differences, sensor variations, and extrinsic calibration. For example, the +calibration methods used for LiDAR-camera fusion often require manual operation +and auxiliary calibration targets. Implicit neural representations (INRs) have +been developed for 3D scenes, and the volume density distribution involved in +an INR unifies the scene information obtained by different types of sensors. +Therefore, we propose implicit neural fusion (INF) for LiDAR and camera. INF +first trains a neural density field of the target scene using LiDAR frames. +Then, a separate neural color field is trained using camera images and the +trained neural density field. Along with the training process, INF both +estimates LiDAR poses and optimizes extrinsic parameters. Our experiments +demonstrate the high accuracy and stable performance of the proposed method. + +
+
+ comment: Accepted to IROS 2023. (project page: + https://ShuyiZhou495.github.io/inf-project-page/) +
+
+
+
+
+ + ☆ Steerable Conditional Diffusion for Out-of-Distribution Adaptation in + Imaging Inverse Problems + + +
+ Denoising diffusion models have emerged as the go-to framework for solving +inverse problems in imaging. A critical concern regarding these models is their +performance on out-of-distribution (OOD) tasks, which remains an under-explored +challenge. Realistic reconstructions inconsistent with the measured data can be +generated, hallucinating image features that are uniquely present in the +training dataset. To simultaneously enforce data-consistency and leverage +data-driven priors, we introduce a novel sampling framework called Steerable +Conditional Diffusion. This framework adapts the denoising network specifically +to the available measured data. Utilising our proposed method, we achieve +substantial enhancements in OOD performance across diverse imaging modalities, +advancing the robust deployment of denoising diffusion models in real-world +applications. + +
+
+
+
+
+ + ☆ Semi-Supervised Semantic Depth Estimation using Symbiotic Transformer + and NearFarMix Augmentation WACV 2024 + + +
+ In computer vision, depth estimation is crucial for domains like robotics, +autonomous vehicles, augmented reality, and virtual reality. Integrating +semantics with depth enhances scene understanding through reciprocal +information sharing. However, the scarcity of semantic information in datasets +poses challenges. Existing convolutional approaches with limited local +receptive fields hinder the full utilization of the symbiotic potential between +depth and semantics. This paper introduces a dataset-invariant semi-supervised +strategy to address the scarcity of semantic information. It proposes the Depth +Semantics Symbiosis module, leveraging the Symbiotic Transformer for achieving +comprehensive mutual awareness by information exchange within both local and +global contexts. Additionally, a novel augmentation, NearFarMix is introduced +to combat overfitting and compensate both depth-semantic tasks by strategically +merging regions from two images, generating diverse and structurally consistent +samples with enhanced control. Extensive experiments on NYU-Depth-V2 and KITTI +datasets demonstrate the superiority of our proposed techniques in indoor and +outdoor environments. + +
+
+ comment: Accepted at WACV 2024 +
+
+
+
+
+ + ☆ Ensemble of Anchor-Free Models for Robust Bangla Document Layout + Segmentation + + +
+ In this research paper, we present an innovative system designed for the +purpose of segmenting the layout of Bangla documents. Our methodology involves +utilizing a sophisticated collection of YOLOv8 models, meticulously adapted for +the DL Sprint 2.0 - BUET CSE Fest 2023 Competition that centers around Bangla +document layout segmentation. Our primary focus lies in elevating various +elements of the task, including techniques like image augmentation, model +architecture, and the use of model ensembles. We intentionally lower the +quality of a subset of document images to enhance the resilience of model +training, consequently leading to an improvement in our cross-validation score. +Employing Bayesian optimization, we determine the optimal confidence and IoU +thresholds for our model ensemble. Through our approach, we successfully +showcase the effectiveness of amalgamating anchor-free models to achieve robust +layout segmentation in Bangla documents. + +
+
+ comment: 4 pages, 5 figures, 6 Tables +
+
+
+
+
+ + ☆ UMMAFormer: A Universal Multimodal-adaptive Transformer Framework for + Temporal Forgery Localization ACM MM 2023 + + +
+ The emergence of artificial intelligence-generated content (AIGC) has raised +concerns about the authenticity of multimedia content in various fields. +However, existing research for forgery content detection has focused mainly on +binary classification tasks of complete videos, which has limited applicability +in industrial settings. To address this gap, we propose UMMAFormer, a novel +universal transformer framework for temporal forgery localization (TFL) that +predicts forgery segments with multimodal adaptation. Our approach introduces a +Temporal Feature Abnormal Attention (TFAA) module based on temporal feature +reconstruction to enhance the detection of temporal differences. We also design +a Parallel Cross-Attention Feature Pyramid Network (PCA-FPN) to optimize the +Feature Pyramid Network (FPN) for subtle feature enhancement. To evaluate the +proposed method, we contribute a novel Temporal Video Inpainting Localization +(TVIL) dataset specifically tailored for video inpainting scenes. Our +experiments show that our approach achieves state-of-the-art performance on +benchmark datasets, including Lav-DF, TVIL, and Psynd, significantly +outperforming previous methods. The code and data are available at +https://github.com/ymhzyj/UMMAFormer/. + +
+
+ comment: 11 pages, 8 figures, 66 references. This paper has been accepted for + ACM MM 2023 +
+
+
+
+
+ + ☆ 1st Place Solution for the 5th LSVOS Challenge: Video Instance + Segmentation + + +
+ Video instance segmentation is a challenging task that serves as the +cornerstone of numerous downstream applications, including video editing and +autonomous driving. In this report, we present further improvements to the SOTA +VIS method, DVIS. First, we introduce a denoising training strategy for the +trainable tracker, allowing it to achieve more stable and accurate object +tracking in complex and long videos. Additionally, we explore the role of +visual foundation models in video instance segmentation. By utilizing a frozen +VIT-L model pre-trained by DINO v2, DVIS demonstrates remarkable performance +improvements. With these enhancements, our method achieves 57.9 AP and 56.0 AP +in the development and test phases, respectively, and ultimately ranked 1st in +the VIS track of the 5th LSVOS Challenge. The code will be available at +https://github.com/zhang-tao-whu/DVIS. + +
+
+
+
+
+ + ☆ FIRE: Food Image to REcipe generation + + +
+ Food computing has emerged as a prominent multidisciplinary field of research +in recent years. An ambitious goal of food computing is to develop end-to-end +intelligent systems capable of autonomously producing recipe information for a +food image. Current image-to-recipe methods are retrieval-based and their +success depends heavily on the dataset size and diversity, as well as the +quality of learned embeddings. Meanwhile, the emergence of powerful +attention-based vision and language models presents a promising avenue for +accurate and generalizable recipe generation, which has yet to be extensively +explored. This paper proposes FIRE, a novel multimodal methodology tailored to +recipe generation in the food computing domain, which generates the food title, +ingredients, and cooking instructions based on input food images. FIRE +leverages the BLIP model to generate titles, utilizes a Vision Transformer with +a decoder for ingredient extraction, and employs the T5 model to generate +recipes incorporating titles and ingredients as inputs. We showcase two +practical applications that can benefit from integrating FIRE with large +language model prompting: recipe customization to fit recipes to user +preferences and recipe-to-code transformation to enable automated cooking +processes. Our experimental findings validate the efficacy of our proposed +approach, underscoring its potential for future advancements and widespread +adoption in food computing. + +
+
+ comment: 5 figures, 4 tables +
+
+
+
+
+ + ☆ Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a + Light-Weight ToF Sensor ICCV 2023 + + +
+ Light-weight time-of-flight (ToF) depth sensors are compact and +cost-efficient, and thus widely used on mobile devices for tasks such as +autofocus and obstacle detection. However, due to the sparse and noisy depth +measurements, these sensors have rarely been considered for dense geometry +reconstruction. In this work, we present the first dense SLAM system with a +monocular camera and a light-weight ToF sensor. Specifically, we propose a +multi-modal implicit scene representation that supports rendering both the +signals from the RGB camera and light-weight ToF sensor which drives the +optimization by comparing with the raw sensor inputs. Moreover, in order to +guarantee successful pose tracking and reconstruction, we exploit a predicted +depth as an intermediate supervision and develop a coarse-to-fine optimization +strategy for efficient learning of the implicit representation. At last, the +temporal information is explicitly exploited to deal with the noisy signals +from light-weight ToF sensors to improve the accuracy and robustness of the +system. Experiments demonstrate that our system well exploits the signals of +light-weight ToF sensors and achieves competitive results both on camera +tracking and dense scene reconstruction. Project page: +\url{https://zju3dv.github.io/tof_slam/}. + +
+
+ comment: Accepted to ICCV 2023 (Oral). Project Page: + https://zju3dv.github.io/tof_slam/ +
+
+
+
+
+ + ☆ GKGNet: Group K-Nearest Neighbor based Graph Convolutional Network for + Multi-Label Image Recognition + + +
+ Multi-Label Image Recognition (MLIR) is a challenging task that aims to +predict multiple object labels in a single image while modeling the complex +relationships between labels and image regions. Although convolutional neural +networks and vision transformers have succeeded in processing images as regular +grids of pixels or patches, these representations are sub-optimal for capturing +irregular and discontinuous regions of interest. In this work, we present the +first fully graph convolutional model, Group K-nearest neighbor based Graph +convolutional Network (GKGNet), which models the connections between semantic +label embeddings and image patches in a flexible and unified graph structure. +To address the scale variance of different objects and to capture information +from multiple perspectives, we propose the Group KGCN module for dynamic graph +construction and message passing. Our experiments demonstrate that GKGNet +achieves state-of-the-art performance with significantly lower computational +costs on the challenging multi-label datasets, \ie MS-COCO and VOC2007 +datasets. We will release the code and models to facilitate future research in +this area. + +
+
+
+
+
+ + ☆ SuperUDF: Self-supervised UDF Estimation for Surface Reconstruction + + +
+ Learning-based surface reconstruction based on unsigned distance functions +(UDF) has many advantages such as handling open surfaces. We propose SuperUDF, +a self-supervised UDF learning which exploits a learned geometry prior for +efficient training and a novel regularization for robustness to sparse +sampling. The core idea of SuperUDF draws inspiration from the classical +surface approximation operator of locally optimal projection (LOP). The key +insight is that if the UDF is estimated correctly, the 3D points should be +locally projected onto the underlying surface following the gradient of the +UDF. Based on that, a number of inductive biases on UDF geometry and a +pre-learned geometry prior are devised to learn UDF estimation efficiently. A +novel regularization loss is proposed to make SuperUDF robust to sparse +sampling. Furthermore, we also contribute a learning-based mesh extraction from +the estimated UDFs. Extensive evaluations demonstrate that SuperUDF outperforms +the state of the arts on several public datasets in terms of both quality and +efficiency. Code will be released after accteptance. + +
+
+
+
+
+ + ☆ Improving Lesion Volume Measurements on Digital Mammograms + + +
+ Lesion volume is an important predictor for prognosis in breast cancer. We +make a step towards a more accurate lesion volume measurement on digital +mammograms by developing a model that allows to estimate lesion volumes on +processed mammograms, which are the images routinely used by radiologists in +clinical practice as well as in breast cancer screening and are available in +medical centers. Processed mammograms are obtained from raw mammograms, which +are the X-ray data coming directly from the scanner, by applying certain +vendor-specific non-linear transformations. At the core of our volume +estimation method is a physics-based algorithm for measuring lesion volumes on +raw mammograms. We subsequently extend this algorithm to processed mammograms +via a deep learning image-to-image translation model that produces synthetic +raw mammograms from processed mammograms in a multi-vendor setting. We assess +the reliability and validity of our method using a dataset of 1778 mammograms +with an annotated mass. Firstly, we investigate the correlations between lesion +volumes computed from mediolateral oblique and craniocaudal views, with a +resulting Pearson correlation of 0.93 [95% confidence interval (CI) 0.92 - +0.93]. Secondly, we compare the resulting lesion volumes from true and +synthetic raw data, with a resulting Pearson correlation of 0.998 [95% CI 0.998 +- 0.998] . Finally, for a subset of 100 mammograms with a malign mass and +concurrent MRI examination available, we analyze the agreement between lesion +volume on mammography and MRI, resulting in an intraclass correlation +coefficient of 0.81 [95% CI 0.73 - 0.87] for consistency and 0.78 [95% CI 0.66 +- 0.86] for absolute agreement. In conclusion, we developed an algorithm to +measure mammographic lesion volume that reached excellent reliability and good +validity, when using MRI as ground truth. + +
+
+
+
+
+ + ☆ MetaWeather: Few-Shot Weather-Degraded Image Restoration via Degradation + Pattern Matching + + +
+ Real-world vision tasks frequently suffer from the appearance of adverse +weather conditions including rain, fog, snow, and raindrops in captured images. +Recently, several generic methods for restoring weather-degraded images have +been proposed, aiming to remove multiple types of adverse weather effects +present in the images. However, these methods have considered weather as +discrete and mutually exclusive variables, leading to failure in generalizing +to unforeseen weather conditions beyond the scope of the training data, such as +the co-occurrence of rain, fog, and raindrops. To this end, weather-degraded +image restoration models should have flexible adaptability to the current +unknown weather condition to ensure reliable and optimal performance. The +adaptation method should also be able to cope with data scarcity for real-world +adaptation. This paper proposes MetaWeather, a few-shot weather-degraded image +restoration method for arbitrary weather conditions. For this, we devise the +core piece of MetaWeather, coined Degradation Pattern Matching Module (DPMM), +which leverages representations from a few-shot support set by matching +features between input and sample images under new weather conditions. In +addition, we build meta-knowledge with episodic meta-learning on top of our +MetaWeather architecture to provide flexible adaptability. In the meta-testing +phase, we adopt a parameter-efficient fine-tuning method to preserve the +prebuilt knowledge and avoid the overfitting problem. Experiments on the BID +Task II.A dataset show our method achieves the best performance on PSNR and +SSIM compared to state-of-the-art image restoration methods. Code is available +at (TBA). + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Attention-Guided Lidar Segmentation and Odometry Using Image-to-Point + Cloud Saliency Transfer + + +
+ LiDAR odometry estimation and 3D semantic segmentation are crucial for +autonomous driving, which has achieved remarkable advances recently. However, +these tasks are challenging due to the imbalance of points in different +semantic categories for 3D semantic segmentation and the influence of dynamic +objects for LiDAR odometry estimation, which increases the importance of using +representative/salient landmarks as reference points for robust feature +learning. To address these challenges, we propose a saliency-guided approach +that leverages attention information to improve the performance of LiDAR +odometry estimation and semantic segmentation models. Unlike in the image +domain, only a few studies have addressed point cloud saliency information due +to the lack of annotated training data. To alleviate this, we first present a +universal framework to transfer saliency distribution knowledge from color +images to point clouds, and use this to construct a pseudo-saliency dataset +(i.e. FordSaliency) for point clouds. Then, we adopt point cloud-based +backbones to learn saliency distribution from pseudo-saliency labels, which is +followed by our proposed SalLiDAR module. SalLiDAR is a saliency-guided 3D +semantic segmentation model that integrates saliency information to improve +segmentation performance. Finally, we introduce SalLONet, a self-supervised +saliency-guided LiDAR odometry network that uses the semantic and saliency +predictions of SalLiDAR to achieve better odometry estimation. Our extensive +experiments on benchmark datasets demonstrate that the proposed SalLiDAR and +SalLONet models achieve state-of-the-art performance against existing methods, +highlighting the effectiveness of image-to-LiDAR saliency knowledge transfer. +Source code will be available at https://github.com/nevrez/SalLONet. + +
+
+ comment: 33 pages, 12 Figures, 6 Tables +
+
+
+
+
+ + ☆ CPFES: Physical Fitness Evaluation Based on Canadian Agility and + Movement Skill Assessment + + +
+ In recent years, the assessment of fundamental movement skills integrated +with physical education has focused on both teaching practice and the +feasibility of assessment. The object of assessment has shifted from multiple +ages to subdivided ages, while the content of assessment has changed from +complex and time-consuming to concise and efficient. Therefore, we apply deep +learning to physical fitness evaluation, we propose a system based on the +Canadian Agility and Movement Skill Assessment (CAMSA) Physical Fitness +Evaluation System (CPFES), which evaluates children's physical fitness based on +CAMSA, and gives recommendations based on the scores obtained by CPFES to help +children grow. We have designed a landmark detection module and a pose +estimation module, and we have also designed a pose evaluation module for the +CAMSA criteria that can effectively evaluate the actions of the child being +tested. Our experimental results demonstrate the high accuracy of the proposed +system. + +
+
+
+
+
+ + ☆ Machine Unlearning Methodology base on Stochastic Teacher Network + + +
+ The rise of the phenomenon of the "right to be forgotten" has prompted +research on machine unlearning, which grants data owners the right to actively +withdraw data that has been used for model training, and requires the +elimination of the contribution of that data to the model. A simple method to +achieve this is to use the remaining data to retrain the model, but this is not +acceptable for other data owners who continue to participate in training. +Existing machine unlearning methods have been found to be ineffective in +quickly removing knowledge from deep learning models. This paper proposes using +a stochastic network as a teacher to expedite the mitigation of the influence +caused by forgotten data on the model. We performed experiments on three +datasets, and the findings demonstrate that our approach can efficiently +mitigate the influence of target data on the model within a single epoch. This +allows for one-time erasure and reconstruction of the model, and the +reconstruction model achieves the same performance as the retrained model. + +
+
+ comment: Accepted by 19th International Conference on Advanced Data Mining and + Applications. (ADMA 2023) +
+
+
+
+
+ + ☆ UniPT: Universal Parallel Tuning for Transfer Learning with Efficient + Parameter and Memory + + +
+ Fine-tuning pre-trained models has emerged as a powerful technique in +numerous domains, owing to its ability to leverage enormous pre-existing +knowledge and achieve remarkable performance on downstream tasks. However, +updating the parameters of entire networks is computationally intensive. +Although state-of-the-art parameter-efficient transfer learning (PETL) methods +significantly reduce the trainable parameters and storage demand, almost all of +them still need to back-propagate the gradients through large pre-trained +networks. This memory-extensive characteristic extremely limits the +applicability of PETL methods in real-world scenarios. To this end, we propose +a new memory-efficient PETL strategy, dubbed Universal Parallel Tuning (UniPT). +Specifically, we facilitate the transfer process via a lightweight learnable +parallel network, which consists of two modules: 1) A parallel interaction +module that decouples the inherently sequential connections and processes the +intermediate activations detachedly of the pre-trained network. 2) A confidence +aggregation module that learns optimal strategies adaptively for integrating +cross-layer features. We evaluate UniPT with different backbones (e.g., +VSE$\infty$, CLIP4Clip, Clip-ViL, and MDETR) on five challenging +vision-and-language tasks (i.e., image-text retrieval, video-text retrieval, +visual question answering, compositional question answering, and visual +grounding). Extensive ablations on ten datasets have validated that our UniPT +can not only dramatically reduce memory consumption and outperform the best +memory-efficient competitor, but also achieve higher performance than existing +PETL methods in a low-memory scenario on different architectures. Our code is +publicly available at: https://github.com/Paranioar/UniPT. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Local-Global Pseudo-label Correction for Source-free Domain Adaptive + Medical Image Segmentation + + +
+ Domain shift is a commonly encountered issue in medical imaging solutions, +primarily caused by variations in imaging devices and data sources. To mitigate +this problem, unsupervised domain adaptation techniques have been employed. +However, concerns regarding patient privacy and potential degradation of image +quality have led to an increased focus on source-free domain adaptation. In +this study, we address the issue of false labels in self-training based +source-free domain adaptive medical image segmentation methods. To correct +erroneous pseudo-labels, we propose a novel approach called the local-global +pseudo-label correction (LGDA) method for source-free domain adaptive medical +image segmentation. Our method consists of two components: An offline local +context-based pseudo-label correction method that utilizes local context +similarity in image space. And an online global pseudo-label correction method +based on class prototypes, which corrects erroneously predicted pseudo-labels +by considering the relative distance between pixel-wise feature vectors and +prototype vectors. We evaluate the performance of our method on three benchmark +fundus image datasets for optic disc and cup segmentation. Our method achieves +superior performance compared to the state-of-the-art approaches, even without +using of any source data. + +
+
+ comment: 30 pages,7 figures +
+
+
+
+
+ + ☆ Direct initial orbit determination + + +
+ Initial orbit determination (IOD) is an important early step in the +processing chain that makes sense of and reconciles the multiple optical +observations of a resident space object. IOD methods generally operate on +line-of-sight (LOS) vectors extracted from images of the object, hence the LOS +vectors can be seen as discrete point samples of the raw optical measurements. +Typically, the number of LOS vectors used by an IOD method is much smaller than +the available measurements (\ie, the set of pixel intensity values), hence +current IOD methods arguably under-utilize the rich information present in the +data. In this paper, we propose a \emph{direct} IOD method called D-IOD that +fits the orbital parameters directly on the observed streak images, without +requiring LOS extraction. Since it does not utilize LOS vectors, D-IOD avoids +potential inaccuracies or errors due to an imperfect LOS extraction step. Two +innovations underpin our novel orbit-fitting paradigm: first, we introduce a +novel non-linear least-squares objective function that computes the loss +between the candidate-orbit-generated streak images and the observed streak +images. Second, the objective function is minimized with a gradient descent +approach that is embedded in our proposed optimization strategies designed for +streak images. We demonstrate the effectiveness of D-IOD on a variety of +simulated scenarios and challenging real streak images. + +
+
+ comment: 28 pages, 17 figures, Submitted to Advances in Space Research +
+
+
+
+
+ + ☆ Bridging Cross-task Protocol Inconsistency for Distillation in Dense + Object Detection ICCV2023 + + +
+ Knowledge distillation (KD) has shown potential for learning compact models +in dense object detection. However, the commonly used softmax-based +distillation ignores the absolute classification scores for individual +categories. Thus, the optimum of the distillation loss does not necessarily +lead to the optimal student classification scores for dense object detectors. +This cross-task protocol inconsistency is critical, especially for dense object +detectors, since the foreground categories are extremely imbalanced. To address +the issue of protocol differences between distillation and classification, we +propose a novel distillation method with cross-task consistent protocols, +tailored for the dense object detection. For classification distillation, we +address the cross-task protocol inconsistency problem by formulating the +classification logit maps in both teacher and student models as multiple +binary-classification maps and applying a binary-classification distillation +loss to each map. For localization distillation, we design an IoU-based +Localization Distillation Loss that is free from specific network structures +and can be compared with existing localization distillation losses. Our +proposed method is simple but effective, and experimental results demonstrate +its superiority over existing methods. Code is available at +https://github.com/TinyTigerPan/BCKD. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Unleash Model Potential: Bootstrapped Meta Self-supervised Learning NIPS + + +
+ The long-term goal of machine learning is to learn general visual +representations from a small amount of data without supervision, mimicking +three advantages of human cognition: i) no need for labels, ii) robustness to +data scarcity, and iii) learning from experience. Self-supervised learning and +meta-learning are two promising techniques to achieve this goal, but they both +only partially capture the advantages and fail to address all the problems. +Self-supervised learning struggles to overcome the drawbacks of data scarcity, +while ignoring prior knowledge that can facilitate learning and generalization. +Meta-learning relies on supervised information and suffers from a bottleneck of +insufficient learning. To address these issues, we propose a novel Bootstrapped +Meta Self-Supervised Learning (BMSSL) framework that aims to simulate the human +learning process. We first analyze the close relationship between meta-learning +and self-supervised learning. Based on this insight, we reconstruct tasks to +leverage the strengths of both paradigms, achieving advantages i and ii. +Moreover, we employ a bi-level optimization framework that alternates between +solving specific tasks with a learned ability (first level) and improving this +ability (second level), attaining advantage iii. To fully harness its power, we +introduce a bootstrapped target based on meta-gradient to make the model its +own teacher. We validate the effectiveness of our approach with comprehensive +theoretical and empirical study. + +
+
+ comment: submitted to NIPS +
+
+
+
+
+ + ☆ FaceChain: A Playground for Identity-Preserving Portrait Generation + + +
+ Recent advancement in personalized image generation have unveiled the +intriguing capability of pre-trained text-to-image models on learning identity +information from a collection of portrait images. However, existing solutions +can be vulnerable in producing truthful details, and usually suffer from +several defects such as (i) The generated face exhibit its own unique +characteristics, \ie facial shape and facial feature positioning may not +resemble key characteristics of the input, and (ii) The synthesized face may +contain warped, blurred or corrupted regions. In this paper, we present +FaceChain, a personalized portrait generation framework that combines a series +of customized image-generation model and a rich set of face-related perceptual +understanding models (\eg, face detection, deep face embedding extraction, and +facial attribute recognition), to tackle aforementioned challenges and to +generate truthful personalized portraits, with only a handful of portrait +images as input. Concretely, we inject several SOTA face models into the +generation procedure, achieving a more efficient label-tagging, +data-processing, and model post-processing compared to previous solutions, such +as DreamBooth ~\cite{ruiz2023dreambooth} , InstantBooth +~\cite{shi2023instantbooth} , or other LoRA-only approaches ~\cite{hu2021lora} +. Through the development of FaceChain, we have identified several potential +directions to accelerate development of Face/Human-Centric AIGC research and +application. We have designed FaceChain as a framework comprised of pluggable +components that can be easily adjusted to accommodate different styles and +personalized needs. We hope it can grow to serve the burgeoning needs from the +communities. FaceChain is open-sourced under Apache-2.0 license at +\url{https://github.com/modelscope/facechain}. + +
+
+ comment: This is an ongoing work that will be consistently refined and + improved upon +
+
+
+
+
+ + ☆ HoloFusion: Towards Photo-realistic 3D Generative Modeling ICCV 2023 + + +
+ Diffusion-based image generators can now produce high-quality and diverse +samples, but their success has yet to fully translate to 3D generation: +existing diffusion methods can either generate low-resolution but 3D consistent +outputs, or detailed 2D views of 3D objects but with potential structural +defects and lacking view consistency or realism. We present HoloFusion, a +method that combines the best of these approaches to produce high-fidelity, +plausible, and diverse 3D samples while learning from a collection of +multi-view 2D images only. The method first generates coarse 3D samples using a +variant of the recently proposed HoloDiffusion generator. Then, it +independently renders and upsamples a large number of views of the coarse 3D +model, super-resolves them to add detail, and distills those into a single, +high-fidelity implicit 3D representation, which also ensures view consistency +of the final renders. The super-resolution network is trained as an integral +part of HoloFusion, end-to-end, and the final distillation uses a new sampling +scheme to capture the space of super-resolved signals. We compare our method +against existing baselines, including DreamFusion, Get3D, EG3D, and +HoloDiffusion, and achieve, to the best of our knowledge, the most realistic +results on the challenging CO3Dv2 dataset. + +
+
+ comment: ICCV 2023 conference; project page at: + https://holodiffusion.github.io/holofusion +
+
+
+
+
+ + ☆ Entropy-based Guidance of Deep Neural Networks for Accelerated + Convergence and Improved Performance + + +
+ Neural networks have dramatically increased our capacity to learn from large, +high-dimensional datasets across innumerable disciplines. However, their +decisions are not easily interpretable, their computational costs are high, and +building and training them are uncertain processes. To add structure to these +efforts, we derive new mathematical results to efficiently measure the changes +in entropy as fully-connected and convolutional neural networks process data, +and introduce entropy-based loss terms. Experiments in image compression and +image classification on benchmark datasets demonstrate these losses guide +neural networks to learn rich latent data representations in fewer dimensions, +converge in fewer training epochs, and achieve better test metrics. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Auto-Prompting SAM for Mobile Friendly 3D Medical Image Segmentation + + +
+ The Segment Anything Model (SAM) has rapidly been adopted for segmenting a +wide range of natural images. However, recent studies have indicated that SAM +exhibits subpar performance on 3D medical image segmentation tasks. In addition +to the domain gaps between natural and medical images, disparities in the +spatial arrangement between 2D and 3D images, the substantial computational +burden imposed by powerful GPU servers, and the time-consuming manual prompt +generation impede the extension of SAM to a broader spectrum of medical image +segmentation applications. To address these challenges, in this work, we +introduce a novel method, AutoSAM Adapter, designed specifically for 3D +multi-organ CT-based segmentation. We employ parameter-efficient adaptation +techniques in developing an automatic prompt learning paradigm to facilitate +the transformation of the SAM model's capabilities to 3D medical image +segmentation, eliminating the need for manually generated prompts. Furthermore, +we effectively transfer the acquired knowledge of the AutoSAM Adapter to other +lightweight models specifically tailored for 3D medical image analysis, +achieving state-of-the-art (SOTA) performance on medical image segmentation +tasks. Through extensive experimental evaluation, we demonstrate the AutoSAM +Adapter as a critical foundation for effectively leveraging the emerging +ability of foundation models in 2D natural image segmentation for 3D medical +image segmentation. + +
+
+ comment: 9 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ Application of Quantum Pre-Processing Filter for Binary Image + Classification with Small Samples + + +
+ Over the past few years, there has been significant interest in Quantum +Machine Learning (QML) among researchers, as it has the potential to transform +the field of machine learning. Several models that exploit the properties of +quantum mechanics have been developed for practical applications. In this +study, we investigated the application of our previously proposed quantum +pre-processing filter (QPF) to binary image classification. We evaluated the +QPF on four datasets: MNIST (handwritten digits), EMNIST (handwritten digits +and alphabets), CIFAR-10 (photographic images) and GTSRB (real-life traffic +sign images). Similar to our previous multi-class classification results, the +application of QPF improved the binary image classification accuracy using +neural network against MNIST, EMNIST, and CIFAR-10 from 98.9% to 99.2%, 97.8% +to 98.3%, and 71.2% to 76.1%, respectively, but degraded it against GTSRB from +93.5% to 92.0%. We then applied QPF in cases using a smaller number of training +and testing samples, i.e. 80 and 20 samples per class, respectively. In order +to derive statistically stable results, we conducted the experiment with 100 +trials choosing randomly different training and testing samples and averaging +the results. The result showed that the application of QPF did not improve the +image classification accuracy against MNIST and EMNIST but improved it against +CIFAR-10 and GTSRB from 65.8% to 67.2% and 90.5% to 91.8%, respectively. +Further research will be conducted as part of future work to investigate the +potential of QPF to assess the scalability of the proposed approach to larger +and complex datasets. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Automated Conversion of Music Videos into Lyric Videos + + +
+ Musicians and fans often produce lyric videos, a form of music videos that +showcase the song's lyrics, for their favorite songs. However, making such +videos can be challenging and time-consuming as the lyrics need to be added in +synchrony and visual harmony with the video. Informed by prior work and close +examination of existing lyric videos, we propose a set of design guidelines to +help creators make such videos. Our guidelines ensure the readability of the +lyric text while maintaining a unified focus of attention. We instantiate these +guidelines in a fully automated pipeline that converts an input music video +into a lyric video. We demonstrate the robustness of our pipeline by generating +lyric videos from a diverse range of input sources. A user study shows that +lyric videos generated by our pipeline are effective in maintaining text +readability and unifying the focus of attention. + +
+
+
+
+
+ + ☆ Maturity-Aware Active Learning for Semantic Segmentation with + Hierarchically-Adaptive Sample Assessment BMVC 2023 + + +
+ Active Learning (AL) for semantic segmentation is challenging due to heavy +class imbalance and different ways of defining "sample" (pixels, areas, etc.), +leaving the interpretation of the data distribution ambiguous. We propose +"Maturity-Aware Distribution Breakdown-based Active Learning'' (MADBAL), an AL +method that benefits from a hierarchical approach to define a multiview data +distribution, which takes into account the different "sample" definitions +jointly, hence able to select the most impactful segmentation pixels with +comprehensive understanding. MADBAL also features a novel uncertainty +formulation, where AL supporting modules are included to sense the features' +maturity whose weighted influence continuously contributes to the uncertainty +detection. In this way, MADBAL makes significant performance leaps even in the +early AL stage, hence reducing the training burden significantly. It +outperforms state-of-the-art methods on Cityscapes and PASCAL VOC datasets as +verified in our extensive experiments. + +
+
+ comment: Accepted to the 34th British Machine Vision Conference (BMVC 2023) +
+
+
+
+
+ + ☆ BIT: Bi-Level Temporal Modeling for Efficient Supervised Action + Segmentation + + +
+ We address the task of supervised action segmentation which aims to partition +a video into non-overlapping segments, each representing a different action. +Recent works apply transformers to perform temporal modeling at the +frame-level, which suffer from high computational cost and cannot well capture +action dependencies over long temporal horizons. To address these issues, we +propose an efficient BI-level Temporal modeling (BIT) framework that learns +explicit action tokens to represent action segments, in parallel performs +temporal modeling on frame and action levels, while maintaining a low +computational cost. Our model contains (i) a frame branch that uses convolution +to learn frame-level relationships, (ii) an action branch that uses transformer +to learn action-level dependencies with a small set of action tokens and (iii) +cross-attentions to allow communication between the two branches. We apply and +extend a set-prediction objective to allow each action token to represent one +or multiple action segments, thus can avoid learning a large number of tokens +over long videos with many segments. Thanks to the design of our action branch, +we can also seamlessly leverage textual transcripts of videos (when available) +to help action segmentation by using them to initialize the action tokens. We +evaluate our model on four video datasets (two egocentric and two third-person) +for action segmentation with and without transcripts, showing that BIT +significantly improves the state-of-the-art accuracy with much lower +computational cost (30 times faster) compared to existing transformer-based +methods. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ RobustCLEVR: A Benchmark and Framework for Evaluating Robustness in + Object-centric Learning + + +
+ Object-centric representation learning offers the potential to overcome +limitations of image-level representations by explicitly parsing image scenes +into their constituent components. While image-level representations typically +lack robustness to natural image corruptions, the robustness of object-centric +methods remains largely untested. To address this gap, we present the +RobustCLEVR benchmark dataset and evaluation framework. Our framework takes a +novel approach to evaluating robustness by enabling the specification of causal +dependencies in the image generation process grounded in expert knowledge and +capable of producing a wide range of image corruptions unattainable in existing +robustness evaluations. Using our framework, we define several causal models of +the image corruption process which explicitly encode assumptions about the +causal relationships and distributions of each corruption type. We generate +dataset variants for each causal model on which we evaluate state-of-the-art +object-centric methods. Overall, we find that object-centric methods are not +inherently robust to image corruptions. Our causal evaluation approach exposes +model sensitivities not observed using conventional evaluation processes, +yielding greater insight into robustness differences across algorithms. Lastly, +while conventional robustness evaluations view corruptions as +out-of-distribution, we use our causal framework to show that even training on +in-distribution image corruptions does not guarantee increased model +robustness. This work provides a step towards more concrete and substantiated +understanding of model performance and deterioration under complex corruption +processes of the real-world. + +
+
+
+
+
+ + ☆ When hard negative sampling meets supervised contrastive learning + + +
+ State-of-the-art image models predominantly follow a two-stage strategy: +pre-training on large datasets and fine-tuning with cross-entropy loss. Many +studies have shown that using cross-entropy can result in sub-optimal +generalisation and stability. While the supervised contrastive loss addresses +some limitations of cross-entropy loss by focusing on intra-class similarities +and inter-class differences, it neglects the importance of hard negative +mining. We propose that models will benefit from performance improvement by +weighting negative samples based on their dissimilarity to positive +counterparts. In this paper, we introduce a new supervised contrastive learning +objective, SCHaNe, which incorporates hard negative sampling during the +fine-tuning phase. Without requiring specialized architectures, additional +data, or extra computational resources, experimental results indicate that +SCHaNe outperforms the strong baseline BEiT-3 in Top-1 accuracy across various +benchmarks, with significant gains of up to $3.32\%$ in few-shot learning +settings and $3.41\%$ in full dataset fine-tuning. Importantly, our proposed +objective sets a new state-of-the-art for base models on ImageNet-1k, achieving +an 86.14\% accuracy. Furthermore, we demonstrate that the proposed objective +yields better embeddings and explains the improved effectiveness observed in +our experiments. + +
+
+
+
+
+ + ☆ Evaluation of Key Spatiotemporal Learners for Print Track Anomaly + Classification Using Melt Pool Image Streams + + +
+ Recent applications of machine learning in metal additive manufacturing (MAM) +have demonstrated significant potential in addressing critical barriers to the +widespread adoption of MAM technology. Recent research in this field emphasizes +the importance of utilizing melt pool signatures for real-time defect +prediction. While high-quality melt pool image data holds the promise of +enabling precise predictions, there has been limited exploration into the +utilization of cutting-edge spatiotemporal models that can harness the inherent +transient and sequential characteristics of the additive manufacturing process. +This research introduces and puts into practice some of the leading deep +spatiotemporal learning models that can be adapted for the classification of +melt pool image streams originating from various materials, systems, and +applications. Specifically, it investigates two-stream networks comprising +spatial and temporal streams, a recurrent spatial network, and a factorized 3D +convolutional neural network. The capacity of these models to generalize when +exposed to perturbations in melt pool image data is examined using data +perturbation techniques grounded in real-world process scenarios. The +implemented architectures demonstrate the ability to capture the spatiotemporal +features of melt pool image sequences. However, among these models, only the +Kinetics400 pre-trained SlowFast network, categorized as a two-stream network, +exhibits robust generalization capabilities in the presence of data +perturbations. + +
+
+ comment: This work has been accepted to IFAC for publication under a Creative + Commons Licence CC-BY-NC-ND +
+
+
+
+
+ + ☆ SynthDistill: Face Recognition with Knowledge Distillation from + Synthetic Data + + +
+ State-of-the-art face recognition networks are often computationally +expensive and cannot be used for mobile applications. Training lightweight face +recognition models also requires large identity-labeled datasets. Meanwhile, +there are privacy and ethical concerns with collecting and using large face +recognition datasets. While generating synthetic datasets for training face +recognition models is an alternative option, it is challenging to generate +synthetic data with sufficient intra-class variations. In addition, there is +still a considerable gap between the performance of models trained on real and +synthetic data. In this paper, we propose a new framework (named SynthDistill) +to train lightweight face recognition models by distilling the knowledge of a +pretrained teacher face recognition model using synthetic data. We use a +pretrained face generator network to generate synthetic face images and use the +synthesized images to learn a lightweight student network. We use synthetic +face images without identity labels, mitigating the problems in the intra-class +variation generation of synthetic datasets. Instead, we propose a novel dynamic +sampling strategy from the intermediate latent space of the face generator +network to include new variations of the challenging images while further +exploring new face images in the training batch. The results on five different +face recognition datasets demonstrate the superiority of our lightweight model +compared to models trained on previous synthetic datasets, achieving a +verification accuracy of 99.52% on the LFW dataset with a lightweight network. +The results also show that our proposed framework significantly reduces the gap +between training with real and synthetic data. The source code for replicating +the experiments is publicly released. + +
+
+ comment: Accepted in the IEEE International Joint Conference on Biometrics + (IJCB 2023) +
+
+
+
+
+ + ☆ NSF: Neural Surface Fields for Human Modeling from Monocular Depth ICCV 2023 + + +
+ Obtaining personalized 3D animatable avatars from a monocular camera has +several real world applications in gaming, virtual try-on, animation, and +VR/XR, etc. However, it is very challenging to model dynamic and fine-grained +clothing deformations from such sparse data. Existing methods for modeling 3D +humans from depth data have limitations in terms of computational efficiency, +mesh coherency, and flexibility in resolution and topology. For instance, +reconstructing shapes using implicit functions and extracting explicit meshes +per frame is computationally expensive and cannot ensure coherent meshes across +frames. Moreover, predicting per-vertex deformations on a pre-designed human +template with a discrete surface lacks flexibility in resolution and topology. +To overcome these limitations, we propose a novel method `\keyfeature: Neural +Surface Fields' for modeling 3D clothed humans from monocular depth. NSF +defines a neural field solely on the base surface which models a continuous and +flexible displacement field. NSF can be adapted to the base surface with +different resolution and topology without retraining at inference time. +Compared to existing approaches, our method eliminates the expensive per-frame +surface extraction while maintaining mesh coherency, and is capable of +reconstructing meshes with arbitrary resolution without retraining. To foster +research in this direction, we release our code in project page at: +https://yuxuan-xue.com/nsf. + +
+
+ comment: Accpted to ICCV 2023; Homepage at: https://yuxuan-xue.com/nsf +
+
+
+
+
+ + ☆ The Interstate-24 3D Dataset: a new benchmark for 3D multi-camera + vehicle tracking + + +
+ This work presents a novel video dataset recorded from overlapping highway +traffic cameras along an urban interstate, enabling multi-camera 3D object +tracking in a traffic monitoring context. Data is released from 3 scenes +containing video from at least 16 cameras each, totaling 57 minutes in length. +877,000 3D bounding boxes and corresponding object tracklets are fully and +accurately annotated for each camera field of view and are combined into a +spatially and temporally continuous set of vehicle trajectories for each scene. +Lastly, existing algorithms are combined to benchmark a number of 3D +multi-camera tracking pipelines on the dataset, with results indicating that +the dataset is challenging due to the difficulty of matching objects traveling +at high speeds across cameras and heavy object occlusion, potentially for +hundreds of frames, during congested traffic. This work aims to enable the +development of accurate and automatic vehicle trajectory extraction algorithms, +which will play a vital role in understanding impacts of autonomous vehicle +technologies on the safety and efficiency of traffic. + +
+
+
+
+
+ + ☆ Continual Learning with Dynamic Sparse Training: Exploring Algorithms + for Effective Model Updates + + +
+ Continual learning (CL) refers to the ability of an intelligent system to +sequentially acquire and retain knowledge from a stream of data with as little +computational overhead as possible. To this end; regularization, replay, +architecture, and parameter isolation approaches were introduced to the +literature. Parameter isolation using a sparse network which enables to +allocate distinct parts of the neural network to different tasks and also +allows to share of parameters between tasks if they are similar. Dynamic Sparse +Training (DST) is a prominent way to find these sparse networks and isolate +them for each task. This paper is the first empirical study investigating the +effect of different DST components under the CL paradigm to fill a critical +research gap and shed light on the optimal configuration of DST for CL if it +exists. Therefore, we perform a comprehensive study in which we investigate +various DST components to find the best topology per task on well-known +CIFAR100 and miniImageNet benchmarks in a task-incremental CL setup since our +primary focus is to evaluate the performance of various DST criteria, rather +than the process of mask selection. We found that, at a low sparsity level, +Erdos-Renyi Kernel (ERK) initialization utilizes the backbone more efficiently +and allows to effectively learn increments of tasks. At a high sparsity level, +however, uniform initialization demonstrates more reliable and robust +performance. In terms of growth strategy; performance is dependent on the +defined initialization strategy, and the extent of sparsity. Finally, +adaptivity within DST components is a promising way for better continual +learners. + +
+
+
+
+
+ + ☆ CLNeRF: Continual Learning Meets NeRF ICCV 2023 + + +
+ Novel view synthesis aims to render unseen views given a set of calibrated +images. In practical applications, the coverage, appearance or geometry of the +scene may change over time, with new images continuously being captured. +Efficiently incorporating such continuous change is an open challenge. Standard +NeRF benchmarks only involve scene coverage expansion. To study other practical +scene changes, we propose a new dataset, World Across Time (WAT), consisting of +scenes that change in appearance and geometry over time. We also propose a +simple yet effective method, CLNeRF, which introduces continual learning (CL) +to Neural Radiance Fields (NeRFs). CLNeRF combines generative replay and the +Instant Neural Graphics Primitives (NGP) architecture to effectively prevent +catastrophic forgetting and efficiently update the model when new data arrives. +We also add trainable appearance and geometry embeddings to NGP, allowing a +single compact model to handle complex scene changes. Without the need to store +historical images, CLNeRF trained sequentially over multiple scans of a +changing scene performs on-par with the upper bound model trained on all scans +at once. Compared to other CL baselines CLNeRF performs much better across +standard benchmarks and WAT. The source code, and the WAT dataset are available +at https://github.com/IntelLabs/CLNeRF. Video presentation is available at: +https://youtu.be/nLRt6OoDGq0?si=8yD6k-8MMBJInQPs + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Interaction-Aware Prompting for Zero-Shot Spatio-Temporal Action + Detection ICCV + + +
+ The goal of spatial-temporal action detection is to determine the time and +place where each person's action occurs in a video and classify the +corresponding action category. Most of the existing methods adopt +fully-supervised learning, which requires a large amount of training data, +making it very difficult to achieve zero-shot learning. In this paper, we +propose to utilize a pre-trained visual-language model to extract the +representative image and text features, and model the relationship between +these features through different interaction modules to obtain the interaction +feature. In addition, we use this feature to prompt each label to obtain more +appropriate text features. Finally, we calculate the similarity between the +interaction feature and the text feature for each label to determine the action +category. Our experiments on J-HMDB and UCF101-24 datasets demonstrate that the +proposed interaction module and prompting make the visual-language features +better aligned, thus achieving excellent accuracy for zero-shot spatio-temporal +action detection. The code will be available at +https://github.com/webber2933/iCLIP. + +
+
+ comment: Accepted by ICCVW 2023 (What is Next in Multimodal Foundation + Models?) +
+
+
+
+
+ + ♻ ☆ Free Lunch for Gait Recognition: A Novel Relation Descriptor + + +
+ Gait recognition is to seek correct matches for query individuals by their +unique walking patterns. However, current methods focus solely on extracting +individual-specific features, overlooking inter-personal relationships. In this +paper, we propose a novel $\textbf{Relation Descriptor}$ that captures not only +individual features but also relations between test gaits and pre-selected +anchored gaits. Specifically, we reinterpret classifier weights as anchored +gaits and compute similarity scores between test features and these anchors, +which re-expresses individual gait features into a similarity relation +distribution. In essence, the relation descriptor offers a holistic perspective +that leverages the collective knowledge stored within the classifier's weights, +emphasizing meaningful patterns and enhancing robustness. Despite its +potential, relation descriptor poses dimensionality challenges since its +dimension depends on the training set's identity count. To address this, we +propose the Farthest Anchored-gait Selection to identify the most +discriminative anchored gaits and an Orthogonal Regularization to increase +diversity within anchored gaits. Compared to individual-specific features +extracted from the backbone, our relation descriptor can boost the performances +nearly without any extra costs. We evaluate the effectiveness of our method on +the popular GREW, Gait3D, CASIA-B, and OU-MVLP, showing that our method +consistently outperforms the baselines and achieves state-of-the-art +performances. + +
+
+ comment: Add new figures and fix some typos +
+
+
+
+
+ + ♻ ☆ Cross-domain Federated Object Detection ICME 2023 + + +
+ Detection models trained by one party (including server) may face severe +performance degradation when distributed to other users (clients). Federated +learning can enable multi-party collaborative learning without leaking client +data. In this paper, we focus on a special cross-domain scenario in which the +server has large-scale labeled data and multiple clients only have a small +amount of labeled data; meanwhile, there exist differences in data +distributions among the clients. In this case, traditional federated learning +methods can't help a client learn both the global knowledge of all participants +and its own unique knowledge. To make up for this limitation, we propose a +cross-domain federated object detection framework, named FedOD. The proposed +framework first performs the federated training to obtain a public global +aggregated model through multi-teacher distillation, and sends the aggregated +model back to each client for fine-tuning its personalized local model. After a +few rounds of communication, on each client we can perform weighted ensemble +inference on the public global model and the personalized local model. We +establish a federated object detection dataset which has significant background +differences and instance differences based on multiple public autonomous +driving datasets, and then conduct extensive experiments on the dataset. The +experimental results validate the effectiveness of the proposed method. + +
+
+ comment: ICME 2023 +
+
+
+
+
+ + ♻ ☆ Domain Generalization with Correlated Style Uncertainty WACV2024 + + +
+ Domain generalization (DG) approaches intend to extract domain invariant +features that can lead to a more robust deep learning model. In this regard, +style augmentation is a strong DG method taking advantage of instance-specific +feature statistics containing informative style characteristics to synthetic +novel domains. While it is one of the state-of-the-art methods, prior works on +style augmentation have either disregarded the interdependence amongst distinct +feature channels or have solely constrained style augmentation to linear +interpolation. To address these research gaps, in this work, we introduce a +novel augmentation approach, named Correlated Style Uncertainty (CSU), +surpassing the limitations of linear interpolation in style statistic space and +simultaneously preserving vital correlation information. Our method's efficacy +is established through extensive experimentation on diverse cross-domain +computer vision and medical imaging classification tasks: PACS, Office-Home, +and Camelyon17 datasets, and the Duke-Market1501 instance retrieval task. The +results showcase a remarkable improvement margin over existing state-of-the-art +techniques. The source code is available https://github.com/freshman97/CSU. + +
+
+ comment: Accepted by WACV2024, camera ready version +
+
+
+
+
+ + ♻ ☆ SoGAR: Self-supervised Spatiotemporal Attention-based Social Group + Activity Recognition + + +
+ This paper introduces a novel approach to Social Group Activity Recognition +(SoGAR) using Self-supervised Transformers network that can effectively utilize +unlabeled video data. To extract spatio-temporal information, we created local +and global views with varying frame rates. Our self-supervised objective +ensures that features extracted from contrasting views of the same video were +consistent across spatio-temporal domains. Our proposed approach is efficient +in using transformer-based encoders to alleviate the weakly supervised setting +of group activity recognition. By leveraging the benefits of transformer +models, our approach can model long-term relationships along spatio-temporal +dimensions. Our proposed SoGAR method achieved state-of-the-art results on +three group activity recognition benchmarks, namely JRDB-PAR, NBA, and +Volleyball datasets, surpassing the current numbers in terms of F1-score, MCA, +and MPCA metrics. + +
+
+ comment: Under review for PR journal; 32 pages, 7 figures. arXiv admin note: + text overlap with arXiv:2303.12149 +
+
+
+
+
+ + ♻ ☆ SPARTAN: Self-supervised Spatiotemporal Transformers Approach to Group + Activity Recognition CVPR + + +
+ In this paper, we propose a new, simple, and effective Self-supervised +Spatio-temporal Transformers (SPARTAN) approach to Group Activity Recognition +(GAR) using unlabeled video data. Given a video, we create local and global +Spatio-temporal views with varying spatial patch sizes and frame rates. The +proposed self-supervised objective aims to match the features of these +contrasting views representing the same video to be consistent with the +variations in spatiotemporal domains. To the best of our knowledge, the +proposed mechanism is one of the first works to alleviate the weakly supervised +setting of GAR using the encoders in video transformers. Furthermore, using the +advantage of transformer models, our proposed approach supports long-term +relationship modeling along spatio-temporal dimensions. The proposed SPARTAN +approach performs well on two group activity recognition benchmarks, including +NBA and Volleyball datasets, by surpassing the state-of-the-art results by a +significant margin in terms of MCA and MPCA metrics. + +
+
+ comment: Accepted to CVPRW 2023; 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Three-stage binarization of color document images based on discrete + wavelet transform and generative adversarial networks + + +
+ The efficient segmentation of foreground text information from the background +in degraded color document images is a critical challenge in the preservation +of ancient manuscripts. The imperfect preservation of ancient manuscripts over +time has led to various types of degradation, such as staining, yellowing, and +ink seepage, significantly affecting image binarization results. This work +proposes a three-stage method using Generative Adversarial Networks (GAN) for +enhancing and binarizing degraded color document images through Discrete +Wavelet Transform (DWT). Stage-1 involves applying DWT and retaining the +Low-Low (LL) subband images for image enhancement. In Stage-2, the original +input image is divided into four single-channel images (Red, Green, Blue, and +Gray), and each is trained with independent adversarial networks to extract +color foreground information. In Stage-3, the output image from Stage-2 and the +original input image are used to train independent adversarial networks for +document binarization, enabling the integration of global and local features. +The experimental results demonstrate that our proposed method outperforms other +classic and state-of-the-art (SOTA) methods on the Document Image Binarization +Contest (DIBCO) datasets. We have released our implementation code at +https://github.com/abcpp12383/ThreeStageBinarization. + +
+
+
+
+
+ + ♻ ☆ PiClick: Picking the desired mask in click-based interactive + segmentation + + +
+ Click-based interactive segmentation aims to generate target masks via human +clicking, which facilitates efficient pixel-level annotation and image editing. +In such a task, target ambiguity remains a problem hindering the accuracy and +efficiency of segmentation. That is, in scenes with rich context, one click may +correspond to multiple potential targets, while most previous interactive +segmentors only generate a single mask and fail to deal with target ambiguity. +In this paper, we propose a novel interactive segmentation network named +PiClick, to yield all potentially reasonable masks and suggest the most +plausible one for the user. Specifically, PiClick utilizes a Transformer-based +architecture to generate all potential target masks by mutually interactive +mask queries. Moreover, a Target Reasoning module is designed in PiClick to +automatically suggest the user-desired mask from all candidates, relieving +target ambiguity and extra-human efforts. Extensive experiments on 9 +interactive segmentation datasets demonstrate PiClick performs favorably +against previous state-of-the-arts considering the segmentation results. +Moreover, we show that PiClick effectively reduces human efforts in annotating +and picking the desired masks. To ease the usage and inspire future research, +we release the source code of PiClick together with a plug-and-play annotation +tool at https://github.com/cilinyan/PiClick. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Secure & Private Federated Neuroimaging + + +
+ The amount of biomedical data continues to grow rapidly. However, collecting +data from multiple sites for joint analysis remains challenging due to +security, privacy, and regulatory concerns. To overcome this challenge, we use +Federated Learning, which enables distributed training of neural network models +over multiple data sources without sharing data. Each site trains the neural +network over its private data for some time, then shares the neural network +parameters (i.e., weights, gradients) with a Federation Controller, which in +turn aggregates the local models, sends the resulting community model back to +each site, and the process repeats. Our Federated Learning architecture, +MetisFL, provides strong security and privacy. First, sample data never leaves +a site. Second, neural network parameters are encrypted before transmission and +the global neural model is computed under fully-homomorphic encryption. +Finally, we use information-theoretic methods to limit information leakage from +the neural model to prevent a curious site from performing model inversion or +membership attacks. We present a thorough evaluation of the performance of +secure, private federated learning in neuroimaging tasks, including for +predicting Alzheimer's disease and estimating BrainAGE from magnetic resonance +imaging (MRI) studies, in challenging, heterogeneous federated environments +where sites have different amounts of data and statistical distributions. + +
+
+ comment: 18 pages, 13 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ EgoLoc: Revisiting 3D Object Localization from Egocentric Videos with + Visual Queries ICCV 2023 + + +
+ With the recent advances in video and 3D understanding, novel 4D +spatio-temporal methods fusing both concepts have emerged. Towards this +direction, the Ego4D Episodic Memory Benchmark proposed a task for Visual +Queries with 3D Localization (VQ3D). Given an egocentric video clip and an +image crop depicting a query object, the goal is to localize the 3D position of +the center of that query object with respect to the camera pose of a query +frame. Current methods tackle the problem of VQ3D by unprojecting the 2D +localization results of the sibling task Visual Queries with 2D Localization +(VQ2D) into 3D predictions. Yet, we point out that the low number of camera +poses caused by camera re-localization from previous VQ3D methods severally +hinders their overall success rate. In this work, we formalize a pipeline (we +dub EgoLoc) that better entangles 3D multiview geometry with 2D object +retrieval from egocentric videos. Our approach involves estimating more robust +camera poses and aggregating multi-view 3D displacements by leveraging the 2D +detection confidence, which enhances the success rate of object queries and +leads to a significant improvement in the VQ3D baseline performance. +Specifically, our approach achieves an overall success rate of up to 87.12%, +which sets a new state-of-the-art result in the VQ3D task. We provide a +comprehensive empirical analysis of the VQ3D task and existing solutions, and +highlight the remaining challenges in VQ3D. The code is available at +https://github.com/Wayne-Mai/EgoLoc. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Integrating Boxes and Masks: A Multi-Object Framework for Unified Visual + Tracking and Segmentation ICCV2023 + + +
+ Tracking any given object(s) spatially and temporally is a common purpose in +Visual Object Tracking (VOT) and Video Object Segmentation (VOS). Joint +tracking and segmentation have been attempted in some studies but they often +lack full compatibility of both box and mask in initialization and prediction, +and mainly focus on single-object scenarios. To address these limitations, this +paper proposes a Multi-object Mask-box Integrated framework for unified +Tracking and Segmentation, dubbed MITS. Firstly, the unified identification +module is proposed to support both box and mask reference for initialization, +where detailed object information is inferred from boxes or directly retained +from masks. Additionally, a novel pinpoint box predictor is proposed for +accurate multi-object box prediction, facilitating target-oriented +representation learning. All target objects are processed simultaneously from +encoding to propagation and decoding, as a unified pipeline for VOT and VOS. +Experimental results show MITS achieves state-of-the-art performance on both +VOT and VOS benchmarks. Notably, MITS surpasses the best prior VOT competitor +by around 6% on the GOT-10k test set, and significantly improves the +performance of box initialization on VOS benchmarks. The code is available at +https://github.com/yoxu515/MITS. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ♻ ☆ CarDD: A New Dataset for Vision-based Car Damage Detection + + +
+ Automatic car damage detection has attracted significant attention in the car +insurance business. However, due to the lack of high-quality and publicly +available datasets, we can hardly learn a feasible model for car damage +detection. To this end, we contribute with Car Damage Detection (CarDD), the +first public large-scale dataset designed for vision-based car damage detection +and segmentation. Our CarDD contains 4,000 highresolution car damage images +with over 9,000 well-annotated instances of six damage categories. We detail +the image collection, selection, and annotation processes, and present a +statistical dataset analysis. Furthermore, we conduct extensive experiments on +CarDD with state-of-the-art deep methods for different tasks and provide +comprehensive analyses to highlight the specialty of car damage detection. +CarDD dataset and the source code are available at +https://cardd-ustc.github.io. + +
+
+ comment: 13 pages, 10 figures, full-length paper for Transactions on + Intelligent Transportation Systems (2023) +
+
+
+
+
+ + ♻ ☆ Self-supervised pseudo-colorizing of masked cells + + +
+ Self-supervised learning, which is strikingly referred to as the dark matter +of intelligence, is gaining more attention in biomedical applications of deep +learning. In this work, we introduce a novel self-supervision objective for the +analysis of cells in biomedical microscopy images. We propose training deep +learning models to pseudo-colorize masked cells. We use a physics-informed +pseudo-spectral colormap that is well suited for colorizing cell topology. Our +experiments reveal that approximating semantic segmentation by +pseudo-colorization is beneficial for subsequent fine-tuning on cell detection. +Inspired by the recent success of masked image modeling, we additionally mask +out cell parts and train to reconstruct these parts to further enrich the +learned representations. We compare our pre-training method with +self-supervised frameworks including contrastive learning (SimCLR), masked +autoencoders (MAEs), and edge-based self-supervision. We build upon our +previous work and train hybrid models for cell detection, which contain both +convolutional and vision transformer modules. Our pre-training method can +outperform SimCLR, MAE-like masked image modeling, and edge-based +self-supervision when pre-training on a diverse set of six fluorescence +microscopy datasets. Code is available at: +https://github.com/roydenwa/pseudo-colorize-masked-cells + +
+
+ comment: 14 pages, 3 figures; Published in PLOS ONE +
+
+
+
+
+ + ♻ ☆ Graph-based Topology Reasoning for Driving Scenes + + +
+ Understanding the road genome is essential to realize autonomous driving. +This highly intelligent problem contains two aspects - the connection +relationship of lanes, and the assignment relationship between lanes and +traffic elements, where a comprehensive topology reasoning method is vacant. On +one hand, previous map learning techniques struggle in deriving lane +connectivity with segmentation or laneline paradigms; or prior lane +topology-oriented approaches focus on centerline detection and neglect the +interaction modeling. On the other hand, the traffic element to lane assignment +problem is limited in the image domain, leaving how to construct the +correspondence from two views an unexplored challenge. To address these issues, +we present TopoNet, the first end-to-end framework capable of abstracting +traffic knowledge beyond conventional perception tasks. To capture the driving +scene topology, we introduce three key designs: (1) an embedding module to +incorporate semantic knowledge from 2D elements into a unified feature space; +(2) a curated scene graph neural network to model relationships and enable +feature interaction inside the network; (3) instead of transmitting messages +arbitrarily, a scene knowledge graph is devised to differentiate prior +knowledge from various types of the road genome. We evaluate TopoNet on the +challenging scene understanding benchmark, OpenLane-V2, where our approach +outperforms all previous works by a great margin on all perceptual and +topological metrics. The code is released at +https://github.com/OpenDriveLab/TopoNet + +
+
+
+
+
+ + ♻ ☆ No Fear of Classifier Biases: Neural Collapse Inspired Federated + Learning with Synthetic and Fixed Classifier ICCV 2023 + + +
+ Data heterogeneity is an inherent challenge that hinders the performance of +federated learning (FL). Recent studies have identified the biased classifiers +of local models as the key bottleneck. Previous attempts have used classifier +calibration after FL training, but this approach falls short in improving the +poor feature representations caused by training-time classifier biases. +Resolving the classifier bias dilemma in FL requires a full understanding of +the mechanisms behind the classifier. Recent advances in neural collapse have +shown that the classifiers and feature prototypes under perfect training +scenarios collapse into an optimal structure called simplex equiangular tight +frame (ETF). Building on this neural collapse insight, we propose a solution to +the FL's classifier bias problem by utilizing a synthetic and fixed ETF +classifier during training. The optimal classifier structure enables all +clients to learn unified and optimal feature representations even under +extremely heterogeneous data. We devise several effective modules to better +adapt the ETF structure in FL, achieving both high generalization and +personalization. Extensive experiments demonstrate that our method achieves +state-of-the-art performances on CIFAR-10, CIFAR-100, and Tiny-ImageNet. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Multimodal Motion Conditioned Diffusion Model for Skeleton-based Video + Anomaly Detection ICCV2023 + + +
+ Anomalies are rare and anomaly detection is often therefore framed as +One-Class Classification (OCC), i.e. trained solely on normalcy. Leading OCC +techniques constrain the latent representations of normal motions to limited +volumes and detect as abnormal anything outside, which accounts satisfactorily +for the openset'ness of anomalies. But normalcy shares the same openset'ness +property since humans can perform the same action in several ways, which the +leading techniques neglect. We propose a novel generative model for video +anomaly detection (VAD), which assumes that both normality and abnormality are +multimodal. We consider skeletal representations and leverage state-of-the-art +diffusion probabilistic models to generate multimodal future human poses. We +contribute a novel conditioning on the past motion of people and exploit the +improved mode coverage capabilities of diffusion processes to generate +different-but-plausible future motions. Upon the statistical aggregation of +future modes, an anomaly is detected when the generated set of motions is not +pertinent to the actual future. We validate our model on 4 established +benchmarks: UBnormal, HR-UBnormal, HR-STC, and HR-Avenue, with extensive +experiments surpassing state-of-the-art results. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Efficient Video Action Detection with Token Dropout and Context + Refinement + + +
+ Streaming video clips with large-scale video tokens impede vision +transformers (ViTs) for efficient recognition, especially in video action +detection where sufficient spatiotemporal representations are required for +precise actor identification. In this work, we propose an end-to-end framework +for efficient video action detection (EVAD) based on vanilla ViTs. Our EVAD +consists of two specialized designs for video action detection. First, we +propose a spatiotemporal token dropout from a keyframe-centric perspective. In +a video clip, we maintain all tokens from its keyframe, preserve tokens +relevant to actor motions from other frames, and drop out the remaining tokens +in this clip. Second, we refine scene context by leveraging remaining tokens +for better recognizing actor identities. The region of interest (RoI) in our +action detector is expanded into temporal domain. The captured spatiotemporal +actor identity representations are refined via scene context in a decoder with +the attention mechanism. These two designs make our EVAD efficient while +maintaining accuracy, which is validated on three benchmark datasets (i.e., +AVA, UCF101-24, JHMDB). Compared to the vanilla ViT backbone, our EVAD reduces +the overall GFLOPs by 43% and improves real-time inference speed by 40% with no +performance degradation. Moreover, even at similar computational costs, our +EVAD can improve the performance by 1.1 mAP with higher resolution inputs. Code +is available at https://github.com/MCG-NJU/EVAD. + +
+
+ comment: technical report +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Self-Supervised Image Denoising: A Comprehensive + Review + + +
+ The advent of deep learning has brought a revolutionary transformation to +image denoising techniques. However, the persistent challenge of acquiring +noise-clean pairs for supervised methods in real-world scenarios remains +formidable, necessitating the exploration of more practical self-supervised +image denoising. This paper focuses on self-supervised image denoising methods +that offer effective solutions to address this challenge. Our comprehensive +review thoroughly analyzes the latest advancements in self-supervised image +denoising approaches, categorizing them into three distinct classes: General +methods, Blind Spot Network (BSN)-based methods, and Transformer-based methods. +For each class, we provide a concise theoretical analysis along with their +practical applications. To assess the effectiveness of these methods, we +present both quantitative and qualitative experimental results on various +datasets, utilizing classical algorithms as benchmarks. Additionally, we +critically discuss the current limitations of these methods and propose +promising directions for future research. By offering a detailed overview of +recent developments in self-supervised image denoising, this review serves as +an invaluable resource for researchers and practitioners in the field, +facilitating a deeper understanding of this emerging domain and inspiring +further advancements. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Predicting Shape Development: a Riemannian Method + + +
+ Predicting the future development of an anatomical shape from a single +baseline observation is a challenging task. But it can be essential for +clinical decision-making. Research has shown that it should be tackled in +curved shape spaces, as (e.g., disease-related) shape changes frequently expose +nonlinear characteristics. We thus propose a novel prediction method that +encodes the whole shape in a Riemannian shape space. It then learns a simple +prediction technique founded on hierarchical statistical modeling of +longitudinal training data. When applied to predict the future development of +the shape of the right hippocampus under Alzheimer's disease and to human body +motion, it outperforms deep learning-supported variants as well as +state-of-the-art. + +
+
+ comment: new experiment with human motion data; fixed vertex-assignment bug in + the prediction of the varifold-based method +
+
+
+
+
+ + ♻ ☆ Efficient Decision-based Black-box Patch Attacks on Video Recognition + + +
+ Although Deep Neural Networks (DNNs) have demonstrated excellent performance, +they are vulnerable to adversarial patches that introduce perceptible and +localized perturbations to the input. Generating adversarial patches on images +has received much attention, while adversarial patches on videos have not been +well investigated. Further, decision-based attacks, where attackers only access +the predicted hard labels by querying threat models, have not been well +explored on video models either, even if they are practical in real-world video +recognition scenes. The absence of such studies leads to a huge gap in the +robustness assessment for video models. To bridge this gap, this work first +explores decision-based patch attacks on video models. We analyze that the huge +parameter space brought by videos and the minimal information returned by +decision-based models both greatly increase the attack difficulty and query +burden. To achieve a query-efficient attack, we propose a spatial-temporal +differential evolution (STDE) framework. First, STDE introduces target videos +as patch textures and only adds patches on keyframes that are adaptively +selected by temporal difference. Second, STDE takes minimizing the patch area +as the optimization objective and adopts spatialtemporal mutation and crossover +to search for the global optimum without falling into the local optimum. +Experiments show STDE has demonstrated state-of-the-art performance in terms of +threat, efficiency and imperceptibility. Hence, STDE has the potential to be a +powerful tool for evaluating the robustness of video recognition models. + +
+
+
+
+
+ + ♻ ☆ Multi-Atlas Segmentation and Spatial Alignment of the Human Embryo in + First Trimester 3D Ultrasound + + +
+ Segmentation and spatial alignment of ultrasound (US) imaging data acquired +in the in first trimester are crucial for monitoring human embryonic growth and +development throughout this crucial period of life. Current approaches are +either manual or semi-automatic and are therefore very time-consuming and prone +to errors. To automate these tasks, we propose a multi-atlas framework for +automatic segmentation and spatial alignment of the embryo using deep learning +with minimal supervision. Our framework learns to register the embryo to an +atlas, which consists of the US images acquired at a range of gestational age +(GA), segmented and spatially aligned to a predefined standard orientation. +From this, we can derive the segmentation of the embryo and put the embryo in +standard orientation. US images acquired at 8+0 till 12+6 weeks GA were used +and eight subjects were selected as atlas. We evaluated different fusion +strategies to incorporate multiple atlases: 1) training the framework using +atlas images from a single subject, 2) training the framework with data of all +available atlases and 3) ensembling of the frameworks trained per subject. To +evaluate the performance, we calculated the Dice score over the test set. We +found that training the framework using all available atlases outperformed +ensembling and gave similar results compared to the best of all frameworks +trained on a single subject. Furthermore, we found that selecting images from +the four atlases closest in GA out of all available atlases, regardless of the +individual quality, gave the best results with a median Dice score of 0.72. We +conclude that our framework can accurately segment and spatially align the +embryo in first trimester 3D US images and is robust for the variation in +quality that existed in the available atlases. + +
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://www.melba-journal.org/papers/2022:020.html +
+
+
+
+
+ + ♻ ☆ GeoMIM: Towards Better 3D Knowledge Transfer via Masked Image Modeling + for Multi-view 3D Understanding + + +
+ Multi-view camera-based 3D detection is a challenging problem in computer +vision. Recent works leverage a pretrained LiDAR detection model to transfer +knowledge to a camera-based student network. However, we argue that there is a +major domain gap between the LiDAR BEV features and the camera-based BEV +features, as they have different characteristics and are derived from different +sources. In this paper, we propose Geometry Enhanced Masked Image Modeling +(GeoMIM) to transfer the knowledge of the LiDAR model in a pretrain-finetune +paradigm for improving the multi-view camera-based 3D detection. GeoMIM is a +multi-camera vision transformer with Cross-View Attention (CVA) blocks that +uses LiDAR BEV features encoded by the pretrained BEV model as learning +targets. During pretraining, GeoMIM's decoder has a semantic branch completing +dense perspective-view features and the other geometry branch reconstructing +dense perspective-view depth maps. The depth branch is designed to be +camera-aware by inputting the camera's parameters for better transfer +capability. Extensive results demonstrate that GeoMIM outperforms existing +methods on nuScenes benchmark, achieving state-of-the-art performance for +camera-based 3D object detection and 3D segmentation. Code and pretrained +models are available at https://github.com/Sense-X/GeoMIM. + +
+
+ comment: Release code: https://github.com/Sense-X/GeoMIM +
+
+
+
+
+ + ♻ ☆ DDH-QA: A Dynamic Digital Humans Quality Assessment Database + + +
+ In recent years, large amounts of effort have been put into pushing forward +the real-world application of dynamic digital human (DDH). However, most +current quality assessment research focuses on evaluating static 3D models and +usually ignores motion distortions. Therefore, in this paper, we construct a +large-scale dynamic digital human quality assessment (DDH-QA) database with +diverse motion content as well as multiple distortions to comprehensively study +the perceptual quality of DDHs. Both model-based distortion (noise, +compression) and motion-based distortion (binding error, motion unnaturalness) +are taken into consideration. Ten types of common motion are employed to drive +the DDHs and a total of 800 DDHs are generated in the end. Afterward, we render +the video sequences of the distorted DDHs as the evaluation media and carry out +a well-controlled subjective experiment. Then a benchmark experiment is +conducted with the state-of-the-art video quality assessment (VQA) methods and +the experimental results show that existing VQA methods are limited in +assessing the perceptual loss of DDHs. + +
+
+
+
+
+ + ♻ ☆ Region-Aware Pretraining for Open-Vocabulary Object Detection with + Vision Transformers CVPR 2023 + + +
+ We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a +contrastive image-text pretraining recipe to bridge the gap between image-level +pretraining and open-vocabulary object detection. At the pretraining phase, we +propose to randomly crop and resize regions of positional embeddings instead of +using the whole image positional embeddings. This better matches the use of +positional embeddings at region-level in the detection finetuning phase. In +addition, we replace the common softmax cross entropy loss in contrastive +learning with focal loss to better learn the informative yet difficult +examples. Finally, we leverage recent advances in novel object proposals to +improve open-vocabulary detection finetuning. We evaluate our full model on the +LVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer. +RO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best +existing approach by +7.8 points in addition to competitive zero-shot transfer +detection. Surprisingly, RO-ViT improves the image-level representation as well +and achieves the state of the art on 9 out of 12 metrics on COCO and Flickr +image-text retrieval benchmarks, outperforming competitive approaches with +larger models. + +
+
+ comment: CVPR 2023 Highlight - https://github.com/mcahny/rovit ; adds LAION-2B + result +
+
+
+
+
+ + ♻ ☆ Predicting Class Distribution Shift for Reliable Domain Adaptive Object + Detection + + +
+ Unsupervised Domain Adaptive Object Detection (UDA-OD) uses unlabelled data +to improve the reliability of robotic vision systems in open-world +environments. Previous approaches to UDA-OD based on self-training have been +effective in overcoming changes in the general appearance of images. However, +shifts in a robot's deployment environment can also impact the likelihood that +different objects will occur, termed class distribution shift. Motivated by +this, we propose a framework for explicitly addressing class distribution shift +to improve pseudo-label reliability in self-training. Our approach uses the +domain invariance and contextual understanding of a pre-trained joint vision +and language model to predict the class distribution of unlabelled data. By +aligning the class distribution of pseudo-labels with this prediction, we +provide weak supervision of pseudo-label accuracy. To further account for low +quality pseudo-labels early in self-training, we propose an approach to +dynamically adjust the number of pseudo-labels per image based on model +confidence. Our method outperforms state-of-the-art approaches on several +benchmarks, including a 4.7 mAP improvement when facing challenging class +distribution shift. + +
+
+
+
+
+ + ♻ ☆ Instance-incremental Scene Graph Generation from Real-world Point Clouds + via Normalizing Flows + + +
+ This work introduces a new task of instance-incremental scene graph +generation: Given a scene of the point cloud, representing it as a graph and +automatically increasing novel instances. A graph denoting the object layout of +the scene is finally generated. It is an important task since it helps to guide +the insertion of novel 3D objects into a real-world scene in vision-based +applications like augmented reality. It is also challenging because the +complexity of the real-world point cloud brings difficulties in learning object +layout experiences from the observation data (non-empty rooms with labeled +semantics). We model this task as a conditional generation problem and propose +a 3D autoregressive framework based on normalizing flows (3D-ANF) to address +it. First, we represent the point cloud as a graph by extracting the label +semantics and contextual relationships. Next, a model based on normalizing +flows is introduced to map the conditional generation of graphic elements into +the Gaussian process. The mapping is invertible. Thus, the real-world +experiences represented in the observation data can be modeled in the training +phase, and novel instances can be autoregressively generated based on the +Gaussian process in the testing phase. To evaluate the performance of our +method sufficiently, we implement this new task on the indoor benchmark dataset +3DSSG-O27R16 and our newly proposed graphical dataset of outdoor scenes GPL3D. +Experiments show that our method generates reliable novel graphs from the +real-world point cloud and achieves state-of-the-art performance on the +datasets. + +
+
+ comment: Accepted by IEEE TCSVT. The supplementary material is available in + the media column of the journal version of the article +
+
+
+
+
+ + ♻ ☆ GINA-3D: Learning to Generate Implicit Neural Assets in the Wild CVPR 2023 + + +
+ Modeling the 3D world from sensor data for simulation is a scalable way of +developing testing and validation environments for robotic learning problems +such as autonomous driving. However, manually creating or re-creating +real-world-like environments is difficult, expensive, and not scalable. Recent +generative model techniques have shown promising progress to address such +challenges by learning 3D assets using only plentiful 2D images -- but still +suffer limitations as they leverage either human-curated image datasets or +renderings from manually-created synthetic 3D environments. In this paper, we +introduce GINA-3D, a generative model that uses real-world driving data from +camera and LiDAR sensors to create realistic 3D implicit neural assets of +diverse vehicles and pedestrians. Compared to the existing image datasets, the +real-world driving setting poses new challenges due to occlusions, +lighting-variations and long-tail distributions. GINA-3D tackles these +challenges by decoupling representation learning and generative modeling into +two stages with a learned tri-plane latent structure, inspired by recent +advances in generative modeling of images. To evaluate our approach, we +construct a large-scale object-centric dataset containing over 1.2M images of +vehicles and pedestrians from the Waymo Open Dataset, and a new set of 80K +images of long-tail instances such as construction equipment, garbage trucks, +and cable cars. We compare our model with existing approaches and demonstrate +that it achieves state-of-the-art performance in quality and diversity for both +generated images and geometries. + +
+
+ comment: Accepted by CVPR 2023; Our WOD-ObjectAsset can be accessed through + waymo.com/open +
+
+
+
+
+ + ♻ ☆ $BT^2$: Backward-compatible Training with Basis Transformation + + +
+ Modern retrieval system often requires recomputing the representation of +every piece of data in the gallery when updating to a better representation +model. This process is known as backfilling and can be especially costly in the +real world where the gallery often contains billions of samples. Recently, +researchers have proposed the idea of Backward Compatible Training (BCT) where +the new representation model can be trained with an auxiliary loss to make it +backward compatible with the old representation. In this way, the new +representation can be directly compared with the old representation, in +principle avoiding the need for any backfilling. However, followup work shows +that there is an inherent tradeoff where a backward compatible representation +model cannot simultaneously maintain the performance of the new model itself. +This paper reports our ``not-so-surprising'' finding that adding extra +dimensions to the representation can help here. However, we also found that +naively increasing the dimension of the representation did not work. To deal +with this, we propose Backward-compatible Training with a novel Basis +Transformation ($BT^2$). A basis transformation (BT) is basically a learnable +set of parameters that applies an orthonormal transformation. Such a +transformation possesses an important property whereby the original information +contained in its input is retained in its output. We show in this paper how a +BT can be utilized to add only the necessary amount of additional dimensions. +We empirically verify the advantage of $BT^2$ over other state-of-the-art +methods in a wide range of settings. We then further extend $BT^2$ to other +challenging yet more practical settings, including significant change in model +architecture (CNN to Transformers), modality change, and even a series of +updates in the model architecture mimicking the evolution of deep learning +models. + +
+
+ comment: iccv2023 camera ready +
+
+
+
+
+ + ♻ ☆ One-shot Ultra-high-Resolution Generative Adversarial Network That + Synthesizes 16K Images On A Single GPU + + +
+ We propose a one-shot ultra-high-resolution generative adversarial network +(OUR-GAN) framework that generates non-repetitive 16K (16, 384 x 8, 640) images +from a single training image and is trainable on a single consumer GPU. OUR-GAN +generates an initial image that is visually plausible and varied in shape at +low resolution, and then gradually increases the resolution by adding detail +through super-resolution. Since OUR-GAN learns from a real +ultra-high-resolution (UHR) image, it can synthesize large shapes with fine +details and long-range coherence, which is difficult to achieve with +conventional generative models that rely on the patch distribution learned from +relatively small images. OUR-GAN can synthesize high-quality 16K images with +12.5 GB of GPU memory and 4K images with only 4.29 GB as it synthesizes a UHR +image part by part through seamless subregion-wise super-resolution. +Additionally, OUR-GAN improves visual coherence while maintaining diversity by +applying vertical positional convolution. In experiments on the ST4K and RAISE +datasets, OUR-GAN exhibited improved fidelity, visual coherency, and diversity +compared with the baseline one-shot synthesis models. To the best of our +knowledge, OUR-GAN is the first one-shot image synthesizer that generates +non-repetitive UHR images on a single consumer GPU. The synthesized image +samples are presented at https://our-gan.github.io. + +
+
+ comment: 36 pages, 26 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised + Learning + + +
+ Semi-supervised learning (SSL) methods assume that labeled data, unlabeled +data and test data are from the same distribution. Open-set semi-supervised +learning (Open-set SSL) considers a more practical scenario, where unlabeled +data and test data contain new categories (outliers) not observed in labeled +data (inliers). Most previous works focused on outlier detection via binary +classifiers, which suffer from insufficient scalability and inability to +distinguish different types of uncertainty. In this paper, we propose a novel +framework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these +limitations. Concretely, we first introduce evidential deep learning (EDL) as +an outlier detector to quantify different types of uncertainty, and design +different uncertainty metrics for self-training and inference. Furthermore, we +propose a novel adaptive negative optimization strategy, making EDL more +tailored to the unlabeled dataset containing both inliers and outliers. As +demonstrated empirically, our proposed method outperforms existing +state-of-the-art methods across four datasets. + +
+
+
+
+
+ + ♻ ☆ Enhancing Breast Cancer Risk Prediction by Incorporating Prior Images MICCAI 2023 + + +
+ Recently, deep learning models have shown the potential to predict breast +cancer risk and enable targeted screening strategies, but current models do not +consider the change in the breast over time. In this paper, we present a new +method, PRIME+, for breast cancer risk prediction that leverages prior +mammograms using a transformer decoder, outperforming a state-of-the-art risk +prediction method that only uses mammograms from a single time point. We +validate our approach on a dataset with 16,113 exams and further demonstrate +that it effectively captures patterns of changes from prior mammograms, such as +changes in breast density, resulting in improved short-term and long-term +breast cancer risk prediction. Experimental results show that our model +achieves a statistically significant improvement in performance over the +state-of-the-art based model, with a C-index increase from 0.68 to 0.73 (p < +0.05) on held-out test sets. + +
+
+ comment: MICCAI 2023 accepted +
+
+
+
+
+ + ♻ ☆ Bayesian Optimization Meets Self-Distillation ICCV 2023 + + +
+ Bayesian optimization (BO) has contributed greatly to improving model +performance by suggesting promising hyperparameter configurations iteratively +based on observations from multiple training trials. However, only partial +knowledge (i.e., the measured performances of trained models and their +hyperparameter configurations) from previous trials is transferred. On the +other hand, Self-Distillation (SD) only transfers partial knowledge learned by +the task model itself. To fully leverage the various knowledge gained from all +training trials, we propose the BOSS framework, which combines BO and SD. BOSS +suggests promising hyperparameter configurations through BO and carefully +selects pre-trained models from previous trials for SD, which are otherwise +abandoned in the conventional BO process. BOSS achieves significantly better +performance than both BO and SD in a wide range of tasks including general +image classification, learning with noisy labels, semi-supervised learning, and +medical image analysis tasks. + +
+
+ comment: ICCV 2023 accepted +
+
+
+
+
+ + ♻ ☆ CLE Diffusion: Controllable Light Enhancement Diffusion Model + + +
+ Low light enhancement has gained increasing importance with the rapid +development of visual creation and editing. However, most existing enhancement +algorithms are designed to homogeneously increase the brightness of images to a +pre-defined extent, limiting the user experience. To address this issue, we +propose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a +novel diffusion framework to provide users with rich controllability. Built +with a conditional diffusion model, we introduce an illumination embedding to +let users control their desired brightness level. Additionally, we incorporate +the Segment-Anything Model (SAM) to enable user-friendly region +controllability, where users can click on objects to specify the regions they +wish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves +competitive performance regarding quantitative metrics, qualitative results, +and versatile controllability. Project page: +https://yuyangyin.github.io/CLEDiffusion/ + +
+
+ comment: Accepted In Proceedings of the 31st ACM International Conference on + Multimedia (MM' 23) +
+
+
+
+
+ + ♻ ☆ TRansPose: Large-Scale Multispectral Dataset for Transparent Object + + +
+ Transparent objects are encountered frequently in our daily lives, yet +recognizing them poses challenges for conventional vision sensors due to their +unique material properties, not being well perceived from RGB or depth cameras. +Overcoming this limitation, thermal infrared cameras have emerged as a +solution, offering improved visibility and shape information for transparent +objects. In this paper, we present TRansPose, the first large-scale +multispectral dataset that combines stereo RGB-D, thermal infrared (TIR) +images, and object poses to promote transparent object research. The dataset +includes 99 transparent objects, encompassing 43 household items, 27 recyclable +trashes, 29 chemical laboratory equivalents, and 12 non-transparent objects. It +comprises a vast collection of 333,819 images and 4,000,056 annotations, +providing instance-level segmentation masks, ground-truth poses, and completed +depth information. The data was acquired using a FLIR A65 thermal infrared +(TIR) camera, two Intel RealSense L515 RGB-D cameras, and a Franka Emika Panda +robot manipulator. Spanning 87 sequences, TRansPose covers various challenging +real-life scenarios, including objects filled with water, diverse lighting +conditions, heavy clutter, non-transparent or translucent containers, objects +in plastic bags, and multi-stacked objects. TRansPose dataset can be accessed +from the following link: https://sites.google.com/view/transpose-dataset + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Hiding Visual Information via Obfuscating Adversarial Perturbations + + +
+ Growing leakage and misuse of visual information raise security and privacy +concerns, which promotes the development of information protection. Existing +adversarial perturbations-based methods mainly focus on the de-identification +against deep learning models. However, the inherent visual information of the +data has not been well protected. In this work, inspired by the Type-I +adversarial attack, we propose an adversarial visual information hiding method +to protect the visual privacy of data. Specifically, the method generates +obfuscating adversarial perturbations to obscure the visual information of the +data. Meanwhile, it maintains the hidden objectives to be correctly predicted +by models. In addition, our method does not modify the parameters of the +applied model, which makes it flexible for different scenarios. Experimental +results on the recognition and classification tasks demonstrate that the +proposed method can effectively hide visual information and hardly affect the +performances of models. The code is available in the supplementary material. + +
+
+
+
+
+ + ♻ ☆ Deep Generative Models on 3D Representations: A Survey + + +
+ Generative models aim to learn the distribution of observed data by +generating new instances. With the advent of neural networks, deep generative +models, including variational autoencoders (VAEs), generative adversarial +networks (GANs), and diffusion models (DMs), have progressed remarkably in +synthesizing 2D images. Recently, researchers started to shift focus from 2D to +3D space, considering that 3D data is more closely aligned with our physical +world and holds immense practical potential. However, unlike 2D images, which +possess an inherent and efficient representation (\textit{i.e.}, a pixel grid), +representing 3D data poses significantly greater challenges. Ideally, a robust +3D representation should be capable of accurately modeling complex shapes and +appearances while being highly efficient in handling high-resolution data with +high processing speeds and low memory requirements. Regrettably, existing 3D +representations, such as point clouds, meshes, and neural fields, often fail to +satisfy all of these requirements simultaneously. In this survey, we thoroughly +review the ongoing developments of 3D generative models, including methods that +employ 2D and 3D supervision. Our analysis centers on generative models, with a +particular focus on the representations utilized in this context. We believe +our survey will help the community to track the field's evolution and to spark +innovative ideas to propel progress towards solving this challenging task. + +
+
+ comment: Github: https://github.com/justimyhxu/awesome-3D-generation +
+
+
+
+
+ + ♻ ☆ DynamicISP: Dynamically Controlled Image Signal Processor for Image + Recognition ICCV2023 + + +
+ Image Signal Processors (ISPs) play important roles in image recognition +tasks as well as in the perceptual quality of captured images. In most cases, +experts make a lot of effort to manually tune many parameters of ISPs, but the +parameters are sub-optimal. In the literature, two types of techniques have +been actively studied: a machine learning-based parameter tuning technique and +a DNN-based ISP technique. The former is lightweight but lacks expressive +power. The latter has expressive power, but the computational cost is too heavy +on edge devices. To solve these problems, we propose "DynamicISP," which +consists of multiple classical ISP functions and dynamically controls the +parameters of each frame according to the recognition result of the previous +frame. We show our method successfully controls the parameters of multiple ISP +functions and achieves state-of-the-art accuracy with low computational cost in +single and multi-category object detection tasks. + +
+
+ comment: Accepted to ICCV2023. Several updates from v2 including additional + experiments and modification of typos in Auto Gain equation +
+
+
+
+
+ + ♻ ☆ Ego-Body Pose Estimation via Ego-Head Pose Estimation CVPR 2023 + + +
+ Estimating 3D human motion from an egocentric video sequence plays a critical +role in human behavior understanding and has various applications in VR/AR. +However, naively learning a mapping between egocentric videos and human motions +is challenging, because the user's body is often unobserved by the front-facing +camera placed on the head of the user. In addition, collecting large-scale, +high-quality datasets with paired egocentric videos and 3D human motions +requires accurate motion capture devices, which often limit the variety of +scenes in the videos to lab-like environments. To eliminate the need for paired +egocentric video and human motions, we propose a new method, Ego-Body Pose +Estimation via Ego-Head Pose Estimation (EgoEgo), which decomposes the problem +into two stages, connected by the head motion as an intermediate +representation. EgoEgo first integrates SLAM and a learning approach to +estimate accurate head motion. Subsequently, leveraging the estimated head pose +as input, EgoEgo utilizes conditional diffusion to generate multiple plausible +full-body motions. This disentanglement of head and body pose eliminates the +need for training datasets with paired egocentric videos and 3D human motion, +enabling us to leverage large-scale egocentric video datasets and motion +capture datasets separately. Moreover, for systematic benchmarking, we develop +a synthetic dataset, AMASS-Replica-Ego-Syn (ARES), with paired egocentric +videos and human motion. On both ARES and real data, our EgoEgo model performs +significantly better than the current state-of-the-art methods. + +
+
+ comment: CVPR 2023 (Award Candidate) +
+
+
+
+
+ + ♻ ☆ Exploring the Mutual Influence between Self-Supervised Single-Frame and + Multi-Frame Depth Estimation + + +
+ Although both self-supervised single-frame and multi-frame depth estimation +methods only require unlabeled monocular videos for training, the information +they leverage varies because single-frame methods mainly rely on +appearance-based features while multi-frame methods focus on geometric cues. +Considering the complementary information of single-frame and multi-frame +methods, some works attempt to leverage single-frame depth to improve +multi-frame depth. However, these methods can neither exploit the difference +between single-frame depth and multi-frame depth to improve multi-frame depth +nor leverage multi-frame depth to optimize single-frame depth models. To fully +utilize the mutual influence between single-frame and multi-frame methods, we +propose a novel self-supervised training framework. Specifically, we first +introduce a pixel-wise adaptive depth sampling module guided by single-frame +depth to train the multi-frame model. Then, we leverage the minimum +reprojection based distillation loss to transfer the knowledge from the +multi-frame depth network to the single-frame network to improve single-frame +depth. Finally, we regard the improved single-frame depth as a prior to further +boost the performance of multi-frame depth estimation. Experimental results on +the KITTI and Cityscapes datasets show that our method outperforms existing +approaches in the self-supervised monocular setting. + +
+
+ comment: Accepted for publication in the IEEE Robotics and Automation Letters + (RA-L). 8 pages, 3figures +
+
+
+
+
+ + ♻ ☆ Unsupervised Anomaly Detection in Medical Images Using Masked Diffusion + Model MICCAI 2023 + + +
+ It can be challenging to identify brain MRI anomalies using supervised +deep-learning techniques due to anatomical heterogeneity and the requirement +for pixel-level labeling. Unsupervised anomaly detection approaches provide an +alternative solution by relying only on sample-level labels of healthy brains +to generate a desired representation to identify abnormalities at the pixel +level. Although, generative models are crucial for generating such anatomically +consistent representations of healthy brains, accurately generating the +intricate anatomy of the human brain remains a challenge. In this study, we +present a method called masked-DDPM (mDPPM), which introduces masking-based +regularization to reframe the generation task of diffusion models. +Specifically, we introduce Masked Image Modeling (MIM) and Masked Frequency +Modeling (MFM) in our self-supervised approach that enables models to learn +visual representations from unlabeled data. To the best of our knowledge, this +is the first attempt to apply MFM in DPPM models for medical applications. We +evaluate our approach on datasets containing tumors and numerous sclerosis +lesions and exhibit the superior performance of our unsupervised method as +compared to the existing fully/weakly supervised baselines. Code is available +at https://github.com/hasan1292/mDDPM. + +
+
+ comment: Accepted in MICCAI 2023 Workshops +
+
+
+
+
+ + ♻ ☆ Score-Based Diffusion Models as Principled Priors for Inverse Imaging ICCV 2023 + + +
+ Priors are essential for reconstructing images from noisy and/or incomplete +measurements. The choice of the prior determines both the quality and +uncertainty of recovered images. We propose turning score-based diffusion +models into principled image priors ("score-based priors") for analyzing a +posterior of images given measurements. Previously, probabilistic priors were +limited to handcrafted regularizers and simple distributions. In this work, we +empirically validate the theoretically-proven probability function of a +score-based diffusion model. We show how to sample from resulting posteriors by +using this probability function for variational inference. Our results, +including experiments on denoising, deblurring, and interferometric imaging, +suggest that score-based priors enable principled inference with a +sophisticated, data-driven image prior. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Spatially Varying Nanophotonic Neural Networks + + +
+ The explosive growth of computation and energy cost of artificial +intelligence has spurred strong interests in new computing modalities as +potential alternatives to conventional electronic processors. Photonic +processors that execute operations using photons instead of electrons, have +promised to enable optical neural networks with ultra-low latency and power +consumption. However, existing optical neural networks, limited by the +underlying network designs, have achieved image recognition accuracy much lower +than state-of-the-art electronic neural networks. In this work, we close this +gap by introducing a large-kernel spatially-varying convolutional neural +network learned via low-dimensional reparameterization techniques. We +experimentally instantiate the network with a flat meta-optical system that +encompasses an array of nanophotonic structures designed to induce +angle-dependent responses. Combined with an extremely lightweight electronic +backend with approximately 2K parameters we demonstrate a nanophotonic neural +network reaches 73.80\% blind test classification accuracy on CIFAR-10 dataset, +and, as such, the first time, an optical neural network outperforms the first +modern digital neural network -- AlexNet (72.64\%) with 57M parameters, +bringing optical neural network into modern deep learning era. + +
+
+
+
+
+ + ♻ ☆ Trajectory Poisson multi-Bernoulli mixture filter for traffic monitoring + using a drone + + +
+ This paper proposes a multi-object tracking (MOT) algorithm for traffic +monitoring using a drone equipped with optical and thermal cameras. Object +detections on the images are obtained using a neural network for each type of +camera. The cameras are modelled as direction-of-arrival (DOA) sensors. Each +DOA detection follows a von-Mises Fisher distribution, whose mean direction is +obtain by projecting a vehicle position on the ground to the camera. We then +use the trajectory Poisson multi-Bernoulli mixture filter (TPMBM), which is a +Bayesian MOT algorithm, to optimally estimate the set of vehicle trajectories. +We have also developed a parameter estimation algorithm for the measurement +model. We have tested the accuracy of the resulting TPMBM filter in synthetic +and experimental data sets. + +
+
+ comment: accepted in IEEE Transactions on Vehicular Technology +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ TRIVEA: Transparent Ranking Interpretation using Visual Explanation of + Black-Box Algorithmic Rankers + + +
+ Ranking schemes drive many real-world decisions, like, where to study, whom +to hire, what to buy, etc. Many of these decisions often come with high +consequences. For example, a university can be deemed less prestigious if not +featured in a top-k list, and consumers might not even explore products that do +not get recommended to buyers. At the heart of most of these decisions are +opaque ranking schemes, which dictate the ordering of data entities, but their +internal logic is inaccessible or proprietary. Drawing inferences about the +ranking differences is like a guessing game to the stakeholders, like, the +rankees (i.e., the entities who are ranked, like product companies) and the +decision-makers (i.e., who use the rankings, like buyers). In this paper, we +aim to enable transparency in ranking interpretation by using algorithmic +rankers that learn from available data and by enabling human reasoning about +the learned ranking differences using explainable AI (XAI) methods. To realize +this aim, we leverage the exploration-explanation paradigm of human-data +interaction to let human stakeholders explore subsets and groupings of complex +multi-attribute ranking data using visual explanations of model fit and +attribute influence on rankings. We realize this explanation paradigm for +transparent ranking interpretation in TRIVEA, a visual analytic system that is +fueled by: i) visualizations of model fit derived from algorithmic rankers that +learn the associations between attributes and rankings from available data and +ii) visual explanations derived from XAI methods that help abstract important +patterns, like, the relative influence of attributes in different ranking +ranges. Using TRIVEA, end users not trained in data science have the agency to +transparently reason about the global and local behavior of the rankings +without the need to open black-box ranking models and develop confidence in the +resulting attribute-based inferences. We demonstrate the efficacy of TRIVEA +using multiple usage scenarios and subjective feedback from researchers with +diverse domain expertise. Keywords: Visual Analytics, Learning-to-Rank, +Explainable ML, Ranking + +
+
+ comment: Accepted for publication in SpringerNature's Visual Computer Journal +
+
+
+
+
+ + ☆ Fairness Through Domain Awareness: Mitigating Popularity Bias For Music + Discovery + + +
+ As online music platforms grow, music recommender systems play a vital role +in helping users navigate and discover content within their vast musical +databases. At odds with this larger goal, is the presence of popularity bias, +which causes algorithmic systems to favor mainstream content over, potentially +more relevant, but niche items. In this work we explore the intrinsic +relationship between music discovery and popularity bias. To mitigate this +issue we propose a domain-aware, individual fairness-based approach which +addresses popularity bias in graph neural network (GNNs) based recommender +systems. Our approach uses individual fairness to reflect a ground truth +listening experience, i.e., if two songs sound similar, this similarity should +be reflected in their representations. In doing so, we facilitate meaningful +music discovery that is robust to popularity bias and grounded in the music +domain. We apply our BOOST methodology to two discovery based tasks, performing +recommendations at both the playlist level and user level. Then, we ground our +evaluation in the cold start setting, showing that our approach outperforms +existing fairness benchmarks in both performance and recommendation of +lesser-known content. Finally, our analysis explains why our proposed +methodology is a novel and promising approach to mitigating popularity bias and +improving the discovery of new and niche content in music recommender systems. + +
+
+
+
+
+ + ☆ Efficient and Accurate Tree Detection from 3D Point Clouds through Paid + Crowdsourcing + + +
+ Accurate tree detection is of growing importance in applications such as +urban planning, forest inventory, and environmental monitoring. In this +article, we present an approach to creating tree maps by annotating them in 3D +point clouds. Point cloud representations allow the precise identification of +tree positions, particularly stem locations, and their heights. Our method +leverages human computational power through paid crowdsourcing, employing a web +tool designed to enable even non-experts to effectively tackle the task. The +primary focus of this paper is to discuss the web tool's development and +strategies to ensure high-quality tree annotations despite encountering noise +in the crowdsourced data. Following our methodology, we achieve quality +measures surpassing 90% for various challenging test sets of diverse +complexities. We emphasize that our tree map creation process, including +initial point cloud collection, can be completed within 1-2 days. + +
+
+ comment: This paper can be considered an extension of the approach presented + by Walter et al. + (https://isprs-annals.copernicus.org/articles/V-4-2020/49/2020/) +
+
+
+
+
+ + ☆ Bridging the KB-Text Gap: Leveraging Structured Knowledge-aware + Pre-training for KBQA CIKM 2023 + + +
+ Knowledge Base Question Answering (KBQA) aims to answer natural language +questions with factual information such as entities and relations in KBs. +However, traditional Pre-trained Language Models (PLMs) are directly +pre-trained on large-scale natural language corpus, which poses challenges for +them in understanding and representing complex subgraphs in structured KBs. To +bridge the gap between texts and structured KBs, we propose a Structured +Knowledge-aware Pre-training method (SKP). In the pre-training stage, we +introduce two novel structured knowledge-aware tasks, guiding the model to +effectively learn the implicit relationship and better representations of +complex subgraphs. In downstream KBQA task, we further design an efficient +linearization strategy and an interval attention mechanism, which assist the +model to better encode complex subgraphs and shield the interference of +irrelevant subgraphs during reasoning respectively. Detailed experiments and +analyses on WebQSP verify the effectiveness of SKP, especially the significant +improvement in subgraph retrieval (+4.08% H@10). + +
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Can Transformer and GNN Help Each Other? + + +
+ Although Transformer has achieved great success in natural language process +and computer vision, it has difficulty generalizing to medium and large-scale +graph data for two important reasons: (i) High complexity. (ii) Failing to +capture the complex and entangled structure information. In graph +representation learning, Graph Neural Networks(GNNs) can fuse the graph +structure and node attributes but have limited receptive fields. Therefore, we +question whether can we combine Transformers and GNNs to help each other. In +this paper, we propose a new model named TransGNN where the Transformer layer +and GNN layer are used alternately to improve each other. Specifically, to +expand the receptive field and disentangle the information aggregation from +edges, we propose using Transformer to aggregate more relevant nodes' +information to improve the message passing of GNNs. Besides, to capture the +graph structure information, we utilize positional encoding and make use of the +GNN layer to fuse the structure into node attributes, which improves the +Transformer in graph data. We also propose to sample the most relevant nodes +for Transformer and two efficient samples update strategies to lower the +complexity. At last, we theoretically prove that TransGNN is more expressive +than GNNs only with extra linear complexity. The experiments on eight datasets +corroborate the effectiveness of TransGNN on node and graph classification +tasks. + +
+
+
+
+
+ + ☆ RecMind: Large Language Model Powered Agent For Recommendation + + +
+ Recent advancements in instructing Large Language Models (LLMs) to utilize +external tools and execute multi-step plans have significantly enhanced their +ability to solve intricate tasks, ranging from mathematical problems to +creative writing. Yet, there remains a notable gap in studying the capacity of +LLMs in responding to personalized queries such as a recommendation request. To +bridge this gap, we have designed an LLM-powered autonomous recommender agent, +RecMind, which is capable of providing precise personalized recommendations +through careful planning, utilizing tools for obtaining external knowledge, and +leveraging individual data. We propose a novel algorithm, Self-Inspiring, to +improve the planning ability of the LLM agent. At each intermediate planning +step, the LLM 'self-inspires' to consider all previously explored states to +plan for next step. This mechanism greatly improves the model's ability to +comprehend and utilize historical planning information for recommendation. We +evaluate RecMind's performance in various recommendation scenarios, including +rating prediction, sequential recommendation, direct recommendation, +explanation generation, and review summarization. Our experiment shows that +RecMind outperforms existing zero/few-shot LLM-based recommendation methods in +different recommendation tasks and achieves competitive performance to a recent +model P5, which requires fully pre-train for the recommendation tasks. + +
+
+
+
+
+ + ☆ Alleviating Video-Length Effect for Micro-video Recommendation + + +
+ Micro-videos platforms such as TikTok are extremely popular nowadays. One +important feature is that users no longer select interested videos from a set, +instead they either watch the recommended video or skip to the next one. As a +result, the time length of users' watching behavior becomes the most important +signal for identifying preferences. However, our empirical data analysis has +shown a video-length effect that long videos are easier to receive a higher +value of average view time, thus adopting such view-time labels for measuring +user preferences can easily induce a biased model that favors the longer +videos. In this paper, we propose a Video Length Debiasing Recommendation +(VLDRec) method to alleviate such an effect for micro-video recommendation. +VLDRec designs the data labeling approach and the sample generation module that +better capture user preferences in a view-time oriented manner. It further +leverages the multi-task learning technique to jointly optimize the above +samples with original biased ones. Extensive experiments show that VLDRec can +improve the users' view time by 1.81% and 11.32% on two real-world datasets, +given a recommendation list of a fixed overall video length, compared with the +best baseline method. Moreover, VLDRec is also more effective in matching +users' interests in terms of the video content. + +
+
+
+
+
+ + ☆ Cross-Modal Retrieval: A Systematic Review of Methods and Future + Directions + + +
+ With the exponential surge in diverse multi-modal data, traditional uni-modal +retrieval methods struggle to meet the needs of users demanding access to data +from various modalities. To address this, cross-modal retrieval has emerged, +enabling interaction across modalities, facilitating semantic matching, and +leveraging complementarity and consistency between different modal data. +Although prior literature undertook a review of the cross-modal retrieval +field, it exhibits numerous deficiencies pertaining to timeliness, taxonomy, +and comprehensiveness. This paper conducts a comprehensive review of +cross-modal retrieval's evolution, spanning from shallow statistical analysis +techniques to vision-language pre-training models. Commencing with a +comprehensive taxonomy grounded in machine learning paradigms, mechanisms, and +models, the paper then delves deeply into the principles and architectures +underpinning existing cross-modal retrieval methods. Furthermore, it offers an +overview of widely used benchmarks, metrics, and performances. Lastly, the +paper probes the prospects and challenges that confront contemporary +cross-modal retrieval, while engaging in a discourse on potential directions +for further progress in the field. To facilitate the research on cross-modal +retrieval, we develop an open-source code repository at +https://github.com/BMC-SDNU/Cross-Modal-Retrieval. + +
+
+
+
+
+ + ☆ RecRec: Algorithmic Recourse for Recommender Systems CIKM 2023 + + +
+ Recommender systems play an essential role in the choices people make in +domains such as entertainment, shopping, food, news, employment, and education. +The machine learning models underlying these recommender systems are often +enormously large and black-box in nature for users, content providers, and +system developers alike. It is often crucial for all stakeholders to understand +the model's rationale behind making certain predictions and recommendations. +This is especially true for the content providers whose livelihoods depend on +the recommender system. Drawing motivation from the practitioners' need, in +this work, we propose a recourse framework for recommender systems, targeted +towards the content providers. Algorithmic recourse in the recommendation +setting is a set of actions that, if executed, would modify the recommendations +(or ranking) of an item in the desired manner. A recourse suggests actions of +the form: "if a feature changes X to Y, then the ranking of that item for a set +of users will change to Z." Furthermore, we demonstrate that RecRec is highly +effective in generating valid, sparse, and actionable recourses through an +empirical evaluation of recommender systems trained on three real-world +datasets. To the best of our knowledge, this work is the first to conceptualize +and empirically test a generalized framework for generating recourses for +recommender systems. + +
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Ad-Rec: Advanced Feature Interactions to Address Covariate-Shifts in + Recommendation Networks + + +
+ Recommendation models are vital in delivering personalized user experiences +by leveraging the correlation between multiple input features. However, deep +learning-based recommendation models often face challenges due to evolving user +behaviour and item features, leading to covariate shifts. Effective +cross-feature learning is crucial to handle data distribution drift and +adapting to changing user behaviour. Traditional feature interaction techniques +have limitations in achieving optimal performance in this context. + This work introduces Ad-Rec, an advanced network that leverages feature +interaction techniques to address covariate shifts. This helps eliminate +irrelevant interactions in recommendation tasks. Ad-Rec leverages masked +transformers to enable the learning of higher-order cross-features while +mitigating the impact of data distribution drift. Our approach improves model +quality, accelerates convergence, and reduces training time, as measured by the +Area Under Curve (AUC) metric. We demonstrate the scalability of Ad-Rec and its +ability to achieve superior model quality through comprehensive ablation +studies. + +
+
+
+
+
+ + ☆ Extending Cross-Modal Retrieval with Interactive Learning to Improve + Image Retrieval Performance in Forensics AAAI22 + + +
+ Nowadays, one of the critical challenges in forensics is analyzing the +enormous amounts of unstructured digital evidence, such as images. Often, +unstructured digital evidence contains precious information for forensic +investigations. Therefore, a retrieval system that can effectively identify +forensically relevant images is paramount. In this work, we explored the +effectiveness of interactive learning in improving image retrieval performance +in the forensic domain by proposing Excalibur - a zero-shot cross-modal image +retrieval system extended with interactive learning. Excalibur was evaluated +using both simulations and a user study. The simulations reveal that +interactive learning is highly effective in improving retrieval performance in +the forensic domain. Furthermore, user study participants could effectively +leverage the power of interactive learning. Finally, they considered Excalibur +effective and straightforward to use and expressed interest in using it in +their daily practice. + +
+
+ comment: Submitted to the AAAI22 conference +
+
+
+
+
+ + ♻ ☆ How Discriminative Are Your Qrels? How To Study the Statistical + Significance of Document Adjudication Methods + + +
+ Creating test collections for offline retrieval evaluation requires human +effort to judge documents' relevance. This expensive activity motivated much +work in developing methods for constructing benchmarks with fewer assessment +costs. In this respect, adjudication methods actively decide both which +documents and the order in which experts review them, in order to better +exploit the assessment budget or to lower it. Researchers evaluate the quality +of those methods by measuring the correlation between the known gold ranking of +systems under the full collection and the observed ranking of systems under the +lower-cost one. This traditional analysis ignores whether and how the low-cost +judgements impact on the statistically significant differences among systems +with respect to the full collection. We fill this void by proposing a novel +methodology to evaluate how the low-cost adjudication methods preserve the +pairwise significant differences between systems as the full collection. In +other terms, while traditional approaches look for stability in answering the +question "is system A better than system B?", our proposed approach looks for +stability in answering the question "is system A significantly better than +system B?", which is the ultimate questions researchers need to answer to +guarantee the generalisability of their results. Among other results, we found +that the best methods in terms of ranking of systems correlation do not always +match those preserving statistical significance. + +
+
+
+
+
+ + ♻ ☆ Leveraging Watch-time Feedback for Short-Video Recommendations: A Causal + Labeling Framework + + +
+ With the proliferation of short video applications, the significance of short +video recommendations has vastly increased. Unlike other recommendation +scenarios, short video recommendation systems heavily rely on feedback from +watch time. Existing approaches simply treat watch time as a direct label, +failing to effectively harness its extensive semantics and introduce bias, +thereby limiting the potential for modeling user interests based on watch time. +To overcome this challenge, we propose a framework named Debiased +Multiple-semantics-extracting Labeling(DML). DML constructs labels that +encompass various semantics by utilizing quantiles derived from the +distribution of watch time, prioritizing relative order rather than absolute +label values. This approach facilitates easier model learning while aligning +with the ranking objective of recommendations. Furthermore, we introduce a +method inspired by causal adjustment to refine label definitions, thereby +directly mitigating bias at the label level. We substantiate the effectiveness +of our DML framework through both online and offline experiments. Extensive +results demonstrate that our DML could effectively leverage watch time to +discover users' real interests, enhancing their engagement in our application. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+
+
+
+ + Machine Learning 145 + +
+
+
+ + ☆ Efficient Discovery and Effective Evaluation of Visual Perceptual + Similarity: A Benchmark and Beyond ICCV 2023 + + +
+ Visual similarities discovery (VSD) is an important task with broad +e-commerce applications. Given an image of a certain object, the goal of VSD is +to retrieve images of different objects with high perceptual visual similarity. +Although being a highly addressed problem, the evaluation of proposed methods +for VSD is often based on a proxy of an identification-retrieval task, +evaluating the ability of a model to retrieve different images of the same +object. We posit that evaluating VSD methods based on identification tasks is +limited, and faithful evaluation must rely on expert annotations. In this +paper, we introduce the first large-scale fashion visual similarity benchmark +dataset, consisting of more than 110K expert-annotated image pairs. Besides +this major contribution, we share insight from the challenges we faced while +curating this dataset. Based on these insights, we propose a novel and +efficient labeling procedure that can be applied to any dataset. Our analysis +examines its limitations and inductive biases, and based on these findings, we +propose metrics to mitigate those limitations. Though our primary focus lies on +visual similarity, the methodologies we present have broader applications for +discovering and evaluating perceptual similarity across various domains. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Minimizing Quasi-Self-Concordant Functions by Gradient Regularization of + Newton Method + + +
+ We study the composite convex optimization problems with a +Quasi-Self-Concordant smooth component. This problem class naturally +interpolates between classic Self-Concordant functions and functions with +Lipschitz continuous Hessian. Previously, the best complexity bounds for this +problem class were associated with trust-region schemes and implementations of +a ball-minimization oracle. In this paper, we show that for minimizing +Quasi-Self-Concordant functions we can use instead the basic Newton Method with +Gradient Regularization. For unconstrained minimization, it only involves a +simple matrix inversion operation (solving a linear system) at each step. We +prove a fast global linear rate for this algorithm, matching the complexity +bound of the trust-region scheme, while our method remains especially simple to +implement. Then, we introduce the Dual Newton Method, and based on it, develop +the corresponding Accelerated Newton Scheme for this problem class, which +further improves the complexity factor of the basic method. As a direct +consequence of our results, we establish fast global linear rates of simple +variants of the Newton Method applied to several practical problems, including +Logistic Regression, Soft Maximum, and Matrix Scaling, without requiring +additional assumptions on strong or uniform convexity for the target objective. + +
+
+
+
+
+ + ☆ Total Selfie: Generating Full-Body Selfies + + +
+ We present a method to generate full-body selfies -- photos that you take of +yourself, but capturing your whole body as if someone else took the photo of +you from a few feet away. Our approach takes as input a pre-captured video of +your body, a target pose photo, and a selfie + background pair for each +location. We introduce a novel diffusion-based approach to combine all of this +information into high quality, well-composed photos of you with the desired +pose and background. + +
+
+ comment: Project page: + https://homes.cs.washington.edu/~boweiche/project_page/totalselfie/ +
+
+
+
+
+ + ☆ Fast Feedforward Networks + + +
+ We break the linear link between the layer size and its inference cost by +introducing the fast feedforward (FFF) architecture, a logarithmic-time +alternative to feedforward networks. + We show that FFFs give comparable performance to feedforward networks at an +exponential fraction of their inference cost, are quicker to deliver +performance compared to mixture-of-expert networks, and can readily take the +place of either in transformers. + Pushing FFFs to the absolute limit, we train a vision transformer to perform +single-neuron inferences at the cost of only 5.8% performance decrease against +the full-width variant. + Our implementation is available as a Python package; just use "pip install +fastfeedforward". + +
+
+ comment: 12 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ VideoCutLER: Surprisingly Simple Unsupervised Video Instance + Segmentation + + +
+ Existing approaches to unsupervised video instance segmentation typically +rely on motion estimates and experience difficulties tracking small or +divergent motions. We present VideoCutLER, a simple method for unsupervised +multi-instance video segmentation without using motion-based learning signals +like optical flow or training on natural videos. Our key insight is that using +high-quality pseudo masks and a simple video synthesis method for model +training is surprisingly sufficient to enable the resulting video model to +effectively segment and track multiple instances across video frames. We show +the first competitive unsupervised learning results on the challenging +YouTubeVIS-2019 benchmark, achieving 50.7% APvideo^50 , surpassing the previous +state-of-the-art by a large margin. VideoCutLER can also serve as a strong +pretrained model for supervised video instance segmentation tasks, exceeding +DINO by 15.9% on YouTubeVIS-2019 in terms of APvideo. + +
+
+ comment: Preprint. Code: https://github.com/facebookresearch/CutLER +
+
+
+
+
+ + ☆ Diversified Ensemble of Independent Sub-Networks for Robust + Self-Supervised Representation Learning + + +
+ Ensembling a neural network is a widely recognized approach to enhance model +performance, estimate uncertainty, and improve robustness in deep supervised +learning. However, deep ensembles often come with high computational costs and +memory demands. In addition, the efficiency of a deep ensemble is related to +diversity among the ensemble members which is challenging for large, +over-parameterized deep neural networks. Moreover, ensemble learning has not +yet seen such widespread adoption, and it remains a challenging endeavor for +self-supervised or unsupervised representation learning. Motivated by these +challenges, we present a novel self-supervised training regime that leverages +an ensemble of independent sub-networks, complemented by a new loss function +designed to encourage diversity. Our method efficiently builds a sub-model +ensemble with high diversity, leading to well-calibrated estimates of model +uncertainty, all achieved with minimal computational overhead compared to +traditional deep self-supervised ensembles. To evaluate the effectiveness of +our approach, we conducted extensive experiments across various tasks, +including in-distribution generalization, out-of-distribution detection, +dataset corruption, and semi-supervised settings. The results demonstrate that +our method significantly improves prediction reliability. Our approach not only +achieves excellent accuracy but also enhances calibration, surpassing baseline +performance across a wide range of self-supervised architectures in computer +vision, natural language processing, and genomics data. + +
+
+
+
+
+ + ☆ Hybrid PLS-ML Authentication Scheme for V2I Communication Networks + + +
+ Vehicular communication networks are rapidly emerging as vehicles become +smarter. However, these networks are increasingly susceptible to various +attacks. The situation is exacerbated by the rise in automated vehicles +complicates, emphasizing the need for security and authentication measures to +ensure safe and effective traffic management. In this paper, we propose a novel +hybrid physical layer security (PLS)-machine learning (ML) authentication +scheme by exploiting the position of the transmitter vehicle as a device +fingerprint. We use a time-of-arrival (ToA) based localization mechanism where +the ToA is estimated at roadside units (RSUs), and the coordinates of the +transmitter vehicle are extracted at the base station (BS).Furthermore, to +track the mobility of the moving legitimate vehicle, we use ML model trained on +several system parameters. We try two ML models for this purpose, i.e., support +vector regression and decision tree. To evaluate our scheme, we conduct binary +hypothesis testing on the estimated positions with the help of the ground +truths provided by the ML model, which classifies the transmitter node as +legitimate or malicious. Moreover, we consider the probability of false alarm +and the probability of missed detection as performance metrics resulting from +the binary hypothesis testing, and mean absolute error (MAE), mean square error +(MSE), and coefficient of determination $\text{R}^2$ to further evaluate the ML +models. We also compare our scheme with a baseline scheme that exploits the +angle of arrival at RSUs for authentication. We observe that our proposed +position-based mechanism outperforms the baseline scheme significantly in terms +of missed detections. + +
+
+ comment: Accepted for Publication following Presentation at IEEE ISNCC-23 +
+
+
+
+
+ + ☆ Fine-Tuning Llama 2 Large Language Models for Detecting Online Sexual + Predatory Chats and Abusive Texts + + +
+ Detecting online sexual predatory behaviours and abusive language on social +media platforms has become a critical area of research due to the growing +concerns about online safety, especially for vulnerable populations such as +children and adolescents. Researchers have been exploring various techniques +and approaches to develop effective detection systems that can identify and +mitigate these risks. Recent development of large language models (LLMs) has +opened a new opportunity to address this problem more effectively. This paper +proposes an approach to detection of online sexual predatory chats and abusive +language using the open-source pretrained Llama 2 7B-parameter model, recently +released by Meta GenAI. We fine-tune the LLM using datasets with different +sizes, imbalance degrees, and languages (i.e., English, Roman Urdu and Urdu). +Based on the power of LLMs, our approach is generic and automated without a +manual search for a synergy between feature extraction and classifier design +steps like conventional methods in this domain. Experimental results show a +strong performance of the proposed approach, which performs proficiently and +consistently across three distinct datasets with five sets of experiments. This +study's outcomes indicate that the proposed method can be implemented in +real-world applications (even with non-English languages) for flagging sexual +predators, offensive or toxic content, hate speech, and discriminatory language +in online discussions and comments to maintain respectful internet or digital +communities. Furthermore, it can be employed for solving text classification +problems with other potential applications such as sentiment analysis, spam and +phishing detection, sorting legal documents, fake news detection, language +identification, user intent recognition, text-based product categorization, +medical record analysis, and resume screening. + +
+
+
+
+
+ + ☆ RESTORE: Graph Embedding Assessment Through Reconstruction + + +
+ Following the success of Word2Vec embeddings, graph embeddings (GEs) have +gained substantial traction. GEs are commonly generated and evaluated +extrinsically on downstream applications, but intrinsic evaluations of the +original graph properties in terms of topological structure and semantic +information have been lacking. Understanding these will help identify the +deficiency of the various families of GE methods when vectorizing graphs in +terms of preserving the relevant knowledge or learning incorrect knowledge. To +address this, we propose RESTORE, a framework for intrinsic GEs assessment +through graph reconstruction. We show that reconstructing the original graph +from the underlying GEs yields insights into the relative amount of information +preserved in a given vector form. We first introduce the graph reconstruction +task. We generate GEs from three GE families based on factorization methods, +random walks, and deep learning (with representative algorithms from each +family) on the CommonSense Knowledge Graph (CSKG). We analyze their +effectiveness in preserving the (a) topological structure of node-level graph +reconstruction with an increasing number of hops and (b) semantic information +on various word semantic and analogy tests. Our evaluations show deep +learning-based GE algorithm (SDNE) is overall better at preserving (a) with a +mean average precision (mAP) of 0.54 and 0.35 for 2 and 3-hop reconstruction +respectively, while the factorization-based algorithm (HOPE) is better at +encapsulating (b) with an average Euclidean distance of 0.14, 0.17, and 0.11 +for 1, 2, and 3-hop reconstruction respectively. The modest performance of +these GEs leaves room for further research avenues on better graph +representation learning. + +
+
+
+
+
+ + ☆ Adversarial Predictions of Data Distributions Across Federated + Internet-of-Things Devices + + +
+ Federated learning (FL) is increasingly becoming the default approach for +training machine learning models across decentralized Internet-of-Things (IoT) +devices. A key advantage of FL is that no raw data are communicated across the +network, providing an immediate layer of privacy. Despite this, recent works +have demonstrated that data reconstruction can be done with the locally trained +model updates which are communicated across the network. However, many of these +works have limitations with regard to how the gradients are computed in +backpropagation. In this work, we demonstrate that the model weights shared in +FL can expose revealing information about the local data distributions of IoT +devices. This leakage could expose sensitive information to malicious actors in +a distributed system. We further discuss results which show that injecting +noise into model weights is ineffective at preventing data leakage without +seriously harming the global model accuracy. + +
+
+ comment: 6 pages, 6 figures, accepted for publication through 2023 IEEE World + Forum on Internet of Things +
+
+
+
+
+ + ☆ Comparison of automated crater catalogs for Mars from Benedix et al. + (2020) and Lee and Hogan (2021) + + +
+ Crater mapping using neural networks and other automated methods has +increased recently with automated Crater Detection Algorithms (CDAs) applied to +planetary bodies throughout the solar system. A recent publication by Benedix +et al. (2020) showed high performance at small scales compared to similar +automated CDAs but with a net positive diameter bias in many crater candidates. +I compare the publicly available catalogs from Benedix et al. (2020) and Lee & +Hogan (2021) and show that the reported performance is sensitive to the metrics +used to test the catalogs. I show how the more permissive comparison methods +indicate a higher CDA performance by allowing worse candidate craters to match +ground-truth craters. I show that the Benedix et al. (2020) catalog has a +substantial performance loss with increasing latitude and identify an image +projection issue that might cause this loss. Finally, I suggest future +applications of neural networks in generating large scientific datasets be +validated using secondary networks with independent data sources or training +methods. + +
+
+ comment: 14 pages, 6 figures. Accepted August 13th 2023 +
+
+
+
+
+ + ☆ Edge Generation Scheduling for DAG Tasks using Deep Reinforcement + Learning + + +
+ Directed acyclic graph (DAG) tasks are currently adopted in the real-time +domain to model complex applications from the automotive, avionics, and +industrial domain that implement their functionalities through chains of +intercommunicating tasks. This paper studies the problem of scheduling +real-time DAG tasks by presenting a novel schedulability test based on the +concept of trivial schedulability. Using this schedulability test, we propose a +new DAG scheduling framework (edge generation scheduling -- EGS) that attempts +to minimize the DAG width by iteratively generating edges while guaranteeing +the deadline constraint. We study how to efficiently solve the problem of +generating edges by developing a deep reinforcement learning algorithm combined +with a graph representation neural network to learn an efficient edge +generation policy for EGS. We evaluate the effectiveness of the proposed +algorithm by comparing it with state-of-the-art DAG scheduling heuristics and +an optimal mixed-integer linear programming baseline. Experimental results show +that the proposed algorithm outperforms the state-of-the-art by requiring fewer +processors to schedule the same DAG tasks. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Human Comfortability Index Estimation in Industrial Human-Robot + Collaboration Task + + +
+ Fluent human-robot collaboration requires a robot teammate to understand, +learn, and adapt to the human's psycho-physiological state. Such collaborations +require a computing system that monitors human physiological signals during +human-robot collaboration (HRC) to quantitatively estimate a human's level of +comfort, which we have termed in this research as comfortability index (CI) and +uncomfortability index (unCI). Subjective metrics (surprise, anxiety, boredom, +calmness, and comfortability) and physiological signals were collected during a +human-robot collaboration experiment that varied robot behavior. The emotion +circumplex model is adapted to calculate the CI from the participant's +quantitative data as well as physiological data. To estimate CI/unCI from +physiological signals, time features were extracted from electrocardiogram +(ECG), galvanic skin response (GSR), and pupillometry signals. In this +research, we successfully adapt the circumplex model to find the location +(axis) of 'comfortability' and 'uncomfortability' on the circumplex model, and +its location match with the closest emotions on the circumplex model. Finally, +the study showed that the proposed approach can estimate human +comfortability/uncomfortability from physiological signals. + +
+
+ comment: Submitted to IEEE-THMS +
+
+
+
+
+ + ☆ Rate-Optimal Policy Optimization for Linear Markov Decision Processes + + +
+ We study regret minimization in online episodic linear Markov Decision +Processes, and obtain rate-optimal $\widetilde O (\sqrt K)$ regret where $K$ +denotes the number of episodes. Our work is the first to establish the optimal +(w.r.t.~$K$) rate of convergence in the stochastic setting with bandit feedback +using a policy optimization based approach, and the first to establish the +optimal (w.r.t.~$K$) rate in the adversarial setup with full information +feedback, for which no algorithm with an optimal rate guarantee is currently +known. + +
+
+
+
+
+ + ☆ Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance IJCAI-2023 + + +
+ We propose the use of conversational GPT models for easy and quick few-shot +text classification in the financial domain using the Banking77 dataset. Our +approach involves in-context learning with GPT-3.5 and GPT-4, which minimizes +the technical expertise required and eliminates the need for expensive GPU +computing while yielding quick and accurate results. Additionally, we fine-tune +other pre-trained, masked language models with SetFit, a recent contrastive +learning technique, to achieve state-of-the-art results both in full-data and +few-shot settings. Our findings show that querying GPT-3.5 and GPT-4 can +outperform fine-tuned, non-generative models even with fewer examples. However, +subscription fees associated with these solutions may be considered costly for +small organizations. Lastly, we find that generative models perform better on +the given task when shown representative samples selected by a human expert +rather than when shown random ones. We conclude that a) our proposed methods +offer a practical solution for few-shot tasks in datasets with limited label +availability, and b) our state-of-the-art results can inspire future work in +the area. + +
+
+ comment: Early pre-print; Accepted at the 5th FinNLP workshop @ IJCAI-2023 +
+
+
+
+
+ + ☆ Comparing AutoML and Deep Learning Methods for Condition Monitoring + using Realistic Validation Scenarios + + +
+ This study extensively compares conventional machine learning methods and +deep learning for condition monitoring tasks using an AutoML toolbox. The +experiments reveal consistent high accuracy in random K-fold cross-validation +scenarios across all tested models. However, when employing leave-one-group-out +(LOGO) cross-validation on the same datasets, no clear winner emerges, +indicating the presence of domain shift in real-world scenarios. Additionally, +the study assesses the scalability and interpretability of conventional methods +and neural networks. Conventional methods offer explainability with their +modular structure aiding feature identification. In contrast, neural networks +require specialized interpretation techniques like occlusion maps to visualize +important regions in the input data. Finally, the paper highlights the +significance of feature selection, particularly in condition monitoring tasks +with limited class variations. Low-complexity models prove sufficient for such +tasks, as only a few features from the input signal are typically needed. In +summary, these findings offer crucial insights into the strengths and +limitations of various approaches, providing valuable benchmarks and +identifying the most suitable methods for condition monitoring applications, +thereby enhancing their applicability in real-world scenarios. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ VesselShot: Few-shot learning for cerebral blood vessel segmentation + + +
+ Angiography is widely used to detect, diagnose, and treat cerebrovascular +diseases. While numerous techniques have been proposed to segment the vascular +network from different imaging modalities, deep learning (DL) has emerged as a +promising approach. However, existing DL methods often depend on proprietary +datasets and extensive manual annotation. Moreover, the availability of +pre-trained networks specifically for medical domains and 3D volumes is +limited. To overcome these challenges, we propose a few-shot learning approach +called VesselShot for cerebrovascular segmentation. VesselShot leverages +knowledge from a few annotated support images and mitigates the scarcity of +labeled data and the need for extensive annotation in cerebral blood vessel +segmentation. We evaluated the performance of VesselShot using the publicly +available TubeTK dataset for the segmentation task, achieving a mean Dice +coefficient (DC) of 0.62(0.03). + +
+
+
+
+
+ + ☆ AI in the Gray: Exploring Moderation Policies in Dialogic Large Language + Models vs. Human Answers in Controversial Topics + + +
+ The introduction of ChatGPT and the subsequent improvement of Large Language +Models (LLMs) have prompted more and more individuals to turn to the use of +ChatBots, both for information and assistance with decision-making. However, +the information the user is after is often not formulated by these ChatBots +objectively enough to be provided with a definite, globally accepted answer. + Controversial topics, such as "religion", "gender identity", "freedom of +speech", and "equality", among others, can be a source of conflict as partisan +or biased answers can reinforce preconceived notions or promote disinformation. +By exposing ChatGPT to such debatable questions, we aim to understand its level +of awareness and if existing models are subject to socio-political and/or +economic biases. We also aim to explore how AI-generated answers compare to +human ones. For exploring this, we use a dataset of a social media platform +created for the purpose of debating human-generated claims on polemic subjects +among users, dubbed Kialo. + Our results show that while previous versions of ChatGPT have had important +issues with controversial topics, more recent versions of ChatGPT +(gpt-3.5-turbo) are no longer manifesting significant explicit biases in +several knowledge areas. In particular, it is well-moderated regarding economic +aspects. However, it still maintains degrees of implicit libertarian leaning +toward right-winged ideals which suggest the need for increased moderation from +the socio-political point of view. In terms of domain knowledge on +controversial topics, with the exception of the "Philosophical" category, +ChatGPT is performing well in keeping up with the collective human level of +knowledge. Finally, we see that sources of Bing AI have slightly more tendency +to the center when compared to human answers. All the analyses we make are +generalizable to other types of biases and domains. + +
+
+
+
+
+ + ☆ On the Tradeoff between Privacy Preservation and Byzantine-Robustness in + Decentralized Learning + + +
+ This paper jointly considers privacy preservation and Byzantine-robustness in +decentralized learning. In a decentralized network, honest-but-curious agents +faithfully follow the prescribed algorithm, but expect to infer their +neighbors' private data from messages received during the learning process, +while dishonest-and-Byzantine agents disobey the prescribed algorithm, and +deliberately disseminate wrong messages to their neighbors so as to bias the +learning process. For this novel setting, we investigate a generic +privacy-preserving and Byzantine-robust decentralized stochastic gradient +descent (SGD) framework, in which Gaussian noise is injected to preserve +privacy and robust aggregation rules are adopted to counteract Byzantine +attacks. We analyze its learning error and privacy guarantee, discovering an +essential tradeoff between privacy preservation and Byzantine-robustness in +decentralized learning -- the learning error caused by defending against +Byzantine attacks is exacerbated by the Gaussian noise added to preserve +privacy. Numerical experiments are conducted and corroborate our theoretical +findings. + +
+
+
+
+
+ + ☆ Recent Progress in Energy Management of Connected Hybrid Electric + Vehicles Using Reinforcement Learning + + +
+ The growing adoption of hybrid electric vehicles (HEVs) presents a +transformative opportunity for revolutionizing transportation energy systems. +The shift towards electrifying transportation aims to curb environmental +concerns related to fossil fuel consumption. This necessitates efficient energy +management systems (EMS) to optimize energy efficiency. The evolution of EMS +from HEVs to connected hybrid electric vehicles (CHEVs) represent a pivotal +shift. For HEVs, EMS now confronts the intricate energy cooperation +requirements of CHEVs, necessitating advanced algorithms for route +optimization, charging coordination, and load distribution. Challenges persist +in both domains, including optimal energy utilization for HEVs, and cooperative +eco-driving control (CED) for CHEVs across diverse vehicle types. Reinforcement +learning (RL) stands out as a promising tool for addressing these challenges at +hand. Specifically, within the realm of CHEVs, the application of multi-agent +reinforcement learning (MARL) emerges as a powerful approach for effectively +tackling the intricacies of CED control. Despite extensive research, few +reviews span from individual vehicles to multi-vehicle scenarios. This review +bridges the gap, highlighting challenges, advancements, and potential +contributions of RL-based solutions for future sustainable transportation +systems. + +
+
+
+
+
+ + ☆ Fairness Through Domain Awareness: Mitigating Popularity Bias For Music + Discovery + + +
+ As online music platforms grow, music recommender systems play a vital role +in helping users navigate and discover content within their vast musical +databases. At odds with this larger goal, is the presence of popularity bias, +which causes algorithmic systems to favor mainstream content over, potentially +more relevant, but niche items. In this work we explore the intrinsic +relationship between music discovery and popularity bias. To mitigate this +issue we propose a domain-aware, individual fairness-based approach which +addresses popularity bias in graph neural network (GNNs) based recommender +systems. Our approach uses individual fairness to reflect a ground truth +listening experience, i.e., if two songs sound similar, this similarity should +be reflected in their representations. In doing so, we facilitate meaningful +music discovery that is robust to popularity bias and grounded in the music +domain. We apply our BOOST methodology to two discovery based tasks, performing +recommendations at both the playlist level and user level. Then, we ground our +evaluation in the cold start setting, showing that our approach outperforms +existing fairness benchmarks in both performance and recommendation of +lesser-known content. Finally, our analysis explains why our proposed +methodology is a novel and promising approach to mitigating popularity bias and +improving the discovery of new and niche content in music recommender systems. + +
+
+
+
+
+ + ☆ Adversarial Attacks on Foundational Vision Models + + +
+ Rapid progress is being made in developing large, pretrained, task-agnostic +foundational vision models such as CLIP, ALIGN, DINOv2, etc. In fact, we are +approaching the point where these models do not have to be finetuned +downstream, and can simply be used in zero-shot or with a lightweight probing +head. Critically, given the complexity of working at this scale, there is a +bottleneck where relatively few organizations in the world are executing the +training then sharing the models on centralized platforms such as HuggingFace +and torch.hub. The goal of this work is to identify several key adversarial +vulnerabilities of these models in an effort to make future designs more +robust. Intuitively, our attacks manipulate deep feature representations to +fool an out-of-distribution (OOD) detector which will be required when using +these open-world-aware models to solve closed-set downstream tasks. Our methods +reliably make in-distribution (ID) images (w.r.t. a downstream task) be +predicted as OOD and vice versa while existing in extremely +low-knowledge-assumption threat models. We show our attacks to be potent in +whitebox and blackbox settings, as well as when transferred across foundational +model types (e.g., attack DINOv2 with CLIP)! This work is only just the +beginning of a long journey towards adversarially robust foundational vision +models. + +
+
+
+
+
+ + ☆ LatentDR: Improving Model Generalization Through Sample-Aware Latent + Degradation and Restoration + + +
+ Despite significant advances in deep learning, models often struggle to +generalize well to new, unseen domains, especially when training data is +limited. To address this challenge, we propose a novel approach for +distribution-aware latent augmentation that leverages the relationships across +samples to guide the augmentation procedure. Our approach first degrades the +samples stochastically in the latent space, mapping them to augmented labels, +and then restores the samples from their corrupted versions during training. +This process confuses the classifier in the degradation step and restores the +overall class distribution of the original samples, promoting diverse +intra-class/cross-domain variability. We extensively evaluate our approach on a +diverse set of datasets and tasks, including domain generalization benchmarks +and medical imaging datasets with strong domain shift, where we show our +approach achieves significant improvements over existing methods for latent +space augmentation. We further show that our method can be flexibly adapted to +long-tail recognition tasks, demonstrating its versatility in building more +generalizable models. Code is available at +https://github.com/nerdslab/LatentDR. + +
+
+
+
+
+ + ☆ Neural Network Training Strategy to Enhance Anomaly Detection + Performance: A Perspective on Reconstruction Loss Amplification + + +
+ Unsupervised anomaly detection (UAD) is a widely adopted approach in industry +due to rare anomaly occurrences and data imbalance. A desirable characteristic +of an UAD model is contained generalization ability which excels in the +reconstruction of seen normal patterns but struggles with unseen anomalies. +Recent studies have pursued to contain the generalization capability of their +UAD models in reconstruction from different perspectives, such as design of +neural network (NN) structure and training strategy. In contrast, we note that +containing of generalization ability in reconstruction can also be obtained +simply from steep-shaped loss landscape. Motivated by this, we propose a loss +landscape sharpening method by amplifying the reconstruction loss, dubbed Loss +AMPlification (LAMP). LAMP deforms the loss landscape into a steep shape so the +reconstruction error on unseen anomalies becomes greater. Accordingly, the +anomaly detection performance is improved without any change of the NN +architecture. Our findings suggest that LAMP can be easily applied to any +reconstruction error metrics in UAD settings where the reconstruction model is +trained with anomaly-free samples only. + +
+
+ comment: 5 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Kernel Limit of Recurrent Neural Networks Trained on Ergodic Data + Sequences + + +
+ Mathematical methods are developed to characterize the asymptotics of +recurrent neural networks (RNN) as the number of hidden units, data samples in +the sequence, hidden state updates, and training steps simultaneously grow to +infinity. In the case of an RNN with a simplified weight matrix, we prove the +convergence of the RNN to the solution of an infinite-dimensional ODE coupled +with the fixed point of a random algebraic equation. The analysis requires +addressing several challenges which are unique to RNNs. In typical mean-field +applications (e.g., feedforward neural networks), discrete updates are of +magnitude $\mathcal{O}(\frac{1}{N})$ and the number of updates is +$\mathcal{O}(N)$. Therefore, the system can be represented as an Euler +approximation of an appropriate ODE/PDE, which it will converge to as $N +\rightarrow \infty$. However, the RNN hidden layer updates are +$\mathcal{O}(1)$. Therefore, RNNs cannot be represented as a discretization of +an ODE/PDE and standard mean-field techniques cannot be applied. Instead, we +develop a fixed point analysis for the evolution of the RNN memory states, with +convergence estimates in terms of the number of update steps and the number of +hidden units. The RNN hidden layer is studied as a function in a Sobolev space, +whose evolution is governed by the data sequence (a Markov chain), the +parameter updates, and its dependence on the RNN hidden layer at the previous +time step. Due to the strong correlation between updates, a Poisson equation +must be used to bound the fluctuations of the RNN around its limit equation. +These mathematical methods give rise to the neural tangent kernel (NTK) limits +for RNNs trained on data sequences as the number of data samples and size of +the neural network grow to infinity. + +
+
+
+
+
+ + ☆ Spoken Language Intelligence of Large Language Models for Language + Learning + + +
+ People have long hoped for a conversational system that can assist in +real-life situations, and recent progress on large language models (LLMs) is +bringing this idea closer to reality. While LLMs are often impressive in +performance, their efficacy in real-world scenarios that demand expert +knowledge remains unclear. LLMs are believed to hold the most potential and +value in education, especially in the development of Artificial intelligence +(AI) based virtual teachers capable of facilitating language learning. Our +focus is centered on evaluating the efficacy of LLMs in the realm of education, +specifically in the areas of spoken language learning which encompass +phonetics, phonology, and second language acquisition. We introduce a new +multiple-choice question dataset to evaluate the effectiveness of LLMs in the +aforementioned scenarios, including understanding and application of spoken +language knowledge. In addition, we investigate the influence of various +prompting techniques such as zero- and few-shot method (prepending the question +with question-answer exemplars), chain-of-thought (CoT, think step-by-step), +in-domain exampler and external tools (Google, Wikipedia). We conducted +large-scale evaluation on popular LLMs (20 distinct models) using these +methods. We achieved significant performance improvements compared to the +zero-shot baseline in the practical questions reasoning (GPT-3.5, 49.1% -> +63.1%; LLaMA2-70B-Chat, 42.2% -> 48.6%). We found that models of different +sizes have good understanding of concepts in phonetics, phonology, and second +language acquisition, but show limitations in reasoning for real-world +problems. Additionally, we also explore preliminary findings on conversational +communication. + +
+
+ comment: 28 pages, 7 figures, Preprint +
+
+
+
+
+ + ☆ Large Graph Models: A Perspective + + +
+ Large models have emerged as the most recent groundbreaking achievements in +artificial intelligence, and particularly machine learning. However, when it +comes to graphs, large models have not achieved the same level of success as in +other fields, such as natural language processing and computer vision. In order +to promote applying large models for graphs forward, we present a perspective +paper to discuss the challenges and opportunities associated with developing +large graph models. First, we discuss the desired characteristics of large +graph models. Then, we present detailed discussions from three key +perspectives: representation basis, graph data, and graph models. In each +category, we provide a brief overview of recent advances and highlight the +remaining challenges together with our visions. Finally, we discuss valuable +applications of large graph models. We believe this perspective paper is able +to encourage further investigations into large graph models, ultimately pushing +us one step closer towards artificial general intelligence (AGI). + +
+
+ comment: Preliminary version. Comments are welcome +
+
+
+
+
+ + ☆ Context-Aware Composition of Agent Policies by Markov Decision Process + Entity Embeddings and Agent Ensembles + + +
+ Computational agents support humans in many areas of life and are therefore +found in heterogeneous contexts. This means that agents operate in rapidly +changing environments and can be confronted with huge state and action spaces. +In order to perform services and carry out activities in a goal-oriented +manner, agents require prior knowledge and therefore have to develop and pursue +context-dependent policies. The problem is that prescribing policies in advance +is limited and inflexible, especially in dynamically changing environments. +Moreover, the context of an agent determines its choice of actions. Since the +environments in which agents operate can be stochastic and complex in terms of +the number of states and feasible actions, activities are usually modelled in a +simplified way by Markov decision processes so that agents with reinforcement +learning are able to learn policies that help to capture the context and act +accordingly to optimally perform activities. However, training policies for all +possible contexts using reinforcement learning is time-consuming. A requirement +and challenge for agents is to learn strategies quickly and respond immediately +in cross-context environments and applications. In this work, we propose a +novel simulation-based approach that enables a) the representation of +heterogeneous contexts through knowledge graphs and entity embeddings and b) +the context-aware composition of policies on demand by ensembles of agents +running in parallel. The evaluation we performed on the "Virtual Home" dataset +indicates that agents that need to seamlessly switch between different +contexts, can request on-the-fly composed policies that lead to the successful +completion of context-appropriate activities without having to learn these +policies in lengthy training steps and episodes, in contrast to agents that +apply reinforcement learning. + +
+
+ comment: 29 pages, 11 figures, 9 tables, 3 listings, Submitted to Semantic Web + Journal, Under revision for re-submission to Semantic Web Journal +
+
+
+
+
+ + ☆ Prediction of Tourism Flow with Sparse Geolocation Data SC2023 + + +
+ Modern tourism in the 21st century is facing numerous challenges. Among these +the rapidly growing number of tourists visiting space-limited regions like +historical cities, museums and bottlenecks such as bridges is one of the +biggest. In this context, a proper and accurate prediction of tourism volume +and tourism flow within a certain area is important and critical for visitor +management tasks such as sustainable treatment of the environment and +prevention of overcrowding. Static flow control methods like conventional +low-level controllers or limiting access to overcrowded venues could not solve +the problem yet. In this paper, we empirically evaluate the performance of +state-of-the-art deep-learning methods such as RNNs, GNNs, and Transformers as +well as the classic statistical ARIMA method. Granular limited data supplied by +a tourism region is extended by exogenous data such as geolocation trajectories +of individual tourists, weather and holidays. In the field of visitor flow +prediction with sparse data, we are thereby capable of increasing the accuracy +of our predictions, incorporating modern input feature handling as well as +mapping geolocation data on top of discrete POI data. + +
+
+ comment: Accepted for publication at the proceedings of the 5th International + Data Science Conference - iDSC2023. arXiv admin note: substantial text + overlap with arXiv:2206.13274 +
+
+
+
+
+ + ☆ Spectral Estimators for Structured Generalized Linear Models via + Approximate Message Passing + + +
+ We consider the problem of parameter estimation from observations given by a +generalized linear model. Spectral methods are a simple yet effective approach +for estimation: they estimate the parameter via the principal eigenvector of a +matrix obtained by suitably preprocessing the observations. Despite their wide +use, a rigorous performance characterization of spectral estimators, as well as +a principled way to preprocess the data, is available only for unstructured +(i.e., i.i.d. Gaussian and Haar) designs. In contrast, real-world design +matrices are highly structured and exhibit non-trivial correlations. To address +this problem, we consider correlated Gaussian designs which capture the +anisotropic nature of the measurements via a feature covariance matrix +$\Sigma$. Our main result is a precise asymptotic characterization of the +performance of spectral estimators in this setting. This then allows to +identify the optimal preprocessing that minimizes the number of samples needed +to meaningfully estimate the parameter. Remarkably, such an optimal spectral +estimator depends on $\Sigma$ only through its normalized trace, which can be +consistently estimated from the data. Numerical results demonstrate the +advantage of our principled approach over previous heuristic methods. + Existing analyses of spectral estimators crucially rely on the rotational +invariance of the design matrix. This key assumption does not hold for +correlated Gaussian designs. To circumvent this difficulty, we develop a novel +strategy based on designing and analyzing an approximate message passing +algorithm whose fixed point coincides with the desired spectral estimator. Our +methodology is general, and opens the way to the precise characterization of +spiked matrices and of the corresponding spectral methods in a variety of +settings. + +
+
+
+
+
+ + ☆ Rebalancing Social Feed to Minimize Polarization and Disagreement CIKM 2023 + + +
+ Social media have great potential for enabling public discourse on important +societal issues. However, adverse effects, such as polarization and echo +chambers, greatly impact the benefits of social media and call for algorithms +that mitigate these effects. In this paper, we propose a novel problem +formulation aimed at slightly nudging users' social feeds in order to strike a +balance between relevance and diversity, thus mitigating the emergence of +polarization, without lowering the quality of the feed. Our approach is based +on re-weighting the relative importance of the accounts that a user follows, so +as to calibrate the frequency with which the content produced by various +accounts is shown to the user. We analyze the convexity properties of the +problem, demonstrating the non-matrix convexity of the objective function and +the convexity of the feasible set. To efficiently address the problem, we +develop a scalable algorithm based on projected gradient descent. We also prove +that our problem statement is a proper generalization of the undirected-case +problem so that our method can also be adopted for undirected social networks. +As a baseline for comparison in the undirected case, we develop a semidefinite +programming approach, which provides the optimal solution. Through extensive +experiments on synthetic and real-world datasets, we validate the effectiveness +of our approach, which outperforms non-trivial baselines, underscoring its +ability to foster healthier and more cohesive online communities. + +
+
+ comment: Accepted for publication at ACM CIKM 2023 +
+
+
+
+
+ + ☆ Group Regression for Query Based Object Detection and Tracking SC 2023 + + +
+ Group regression is commonly used in 3D object detection to predict box +parameters of similar classes in a joint head, aiming to benefit from +similarities while separating highly dissimilar classes. For query-based +perception methods, this has, so far, not been feasible. We close this gap and +present a method to incorporate multi-class group regression, especially +designed for the 3D domain in the context of autonomous driving, into existing +attention and query-based perception approaches. We enhance a transformer based +joint object detection and tracking model with this approach, and thoroughly +evaluate its behavior and performance. For group regression, the classes of the +nuScenes dataset are divided into six groups of similar shape and prevalence, +each being regressed by a dedicated head. We show that the proposed method is +applicable to many existing transformer based perception approaches and can +bring potential benefits. The behavior of query group regression is thoroughly +analyzed in comparison to a unified regression head, e.g. in terms of +class-switching behavior and distribution of the output parameters. The +proposed method offers many possibilities for further research, such as in the +direction of deep multi-hypotheses tracking. + +
+
+ comment: Accepted for publication at the 2023 26th IEEE International + Conference on Intelligent Transportation Systems (ITSC 2023), Sep 24-28, + 2023, in Bilbao, Spain +
+
+
+
+
+ + ☆ Some issues in robust clustering + + +
+ Some key issues in robust clustering are discussed with focus on Gaussian +mixture model based clustering, namely the formal definition of outliers, +ambiguity between groups of outliers and clusters, the interaction between +robust clustering and the estimation of the number of clusters, the essential +dependence of (not only) robust clustering on tuning decisions, and +shortcomings of existing measurements of cluster stability when it comes to +outliers. + +
+
+ comment: 11 pages, no figures +
+
+
+
+
+ + ☆ Speech Self-Supervised Representations Benchmarking: a Case for Larger + Probing Heads + + +
+ Self-supervised learning (SSL) leverages large datasets of unlabeled speech +to reach impressive performance with reduced amounts of annotated data. The +high number of proposed approaches fostered the emergence of comprehensive +benchmarks that evaluate their performance on a set of downstream tasks +exploring various aspects of the speech signal. However, while the number of +considered tasks has been growing, most proposals rely upon a single downstream +architecture that maps the frozen SSL representations to the task labels. This +study examines how benchmarking results are affected by changes in the probing +head architecture. Interestingly, we found that altering the downstream +architecture structure leads to significant fluctuations in the performance +ranking of the evaluated models. Against common practices in speech SSL +benchmarking, we evaluate larger-capacity probing heads, showing their impact +on performance, inference costs, generalization and multi-level feature +exploitation. + +
+
+ comment: 11 Pages +
+
+
+
+
+ + ☆ TextrolSpeech: A Text Style Control Speech Corpus With Codec Language + Text-to-Speech Models + + +
+ Recently, there has been a growing interest in the field of controllable +Text-to-Speech (TTS). While previous studies have relied on users providing +specific style factor values based on acoustic knowledge or selecting reference +speeches that meet certain requirements, generating speech solely from natural +text prompts has emerged as a new challenge for researchers. This challenge +arises due to the scarcity of high-quality speech datasets with natural text +style prompt and the absence of advanced text-controllable TTS models. In light +of this, 1) we propose TextrolSpeech, which is the first large-scale speech +emotion dataset annotated with rich text attributes. The dataset comprises +236,220 pairs of style prompt in natural text descriptions with five style +factors and corresponding speech samples. Through iterative experimentation, we +introduce a multi-stage prompt programming approach that effectively utilizes +the GPT model for generating natural style descriptions in large volumes. 2) +Furthermore, to address the need for generating audio with greater style +diversity, we propose an efficient architecture called Salle. This architecture +treats text controllable TTS as a language model task, utilizing audio codec +codes as an intermediate representation to replace the conventional +mel-spectrogram. Finally, we successfully demonstrate the ability of the +proposed model by showing a comparable performance in the controllable TTS +task. Audio samples are available at https://sall-e.github.io/ + +
+
+
+
+
+ + ☆ Shielded Reinforcement Learning for Hybrid Systems + + +
+ Safe and optimal controller synthesis for switched-controlled hybrid systems, +which combine differential equations and discrete changes of the system's +state, is known to be intricately hard. Reinforcement learning has been +leveraged to construct near-optimal controllers, but their behavior is not +guaranteed to be safe, even when it is encouraged by reward engineering. One +way of imposing safety to a learned controller is to use a shield, which is +correct by design. However, obtaining a shield for non-linear and hybrid +environments is itself intractable. In this paper, we propose the construction +of a shield using the so-called barbaric method, where an approximate finite +representation of an underlying partition-based two-player safety game is +extracted via systematically picked samples of the true transition function. +While hard safety guarantees are out of reach, we experimentally demonstrate +strong statistical safety guarantees with a prototype implementation and UPPAAL +STRATEGO. Furthermore, we study the impact of the synthesized shield when +applied as either a pre-shield (applied before learning a controller) or a +post-shield (only applied after learning a controller). We experimentally +demonstrate superiority of the pre-shielding approach. We apply our technique +on a range of case studies, including two industrial examples, and further +study post-optimization of the post-shielding approach. + +
+
+
+
+
+ + ☆ Task-Aware Machine Unlearning and Its Application in Load Forecasting + + +
+ Data privacy and security have become a non-negligible factor in load +forecasting. Previous researches mainly focus on training stage enhancement. +However, once the model is trained and deployed, it may need to `forget' (i.e., +remove the impact of) part of training data if the data is found to be +malicious or as requested by the data owner. This paper introduces machine +unlearning algorithm which is specifically designed to remove the influence of +part of the original dataset on an already trained forecaster. However, direct +unlearning inevitably degrades the model generalization ability. To balance +between unlearning completeness and performance degradation, a +performance-aware algorithm is proposed by evaluating the sensitivity of local +model parameter change using influence function and sample re-weighting. +Moreover, we observe that the statistic criterion cannot fully reflect the +operation cost of down-stream tasks. Therefore, a task-aware machine unlearning +is proposed whose objective is a tri-level optimization with dispatch and +redispatch problems considered. We theoretically prove the existence of the +gradient of such objective, which is key to re-weighting the remaining samples. +We test the unlearning algorithms on linear and neural network load forecasters +with realistic load dataset. The simulation demonstrates the balance on +unlearning completeness and operational cost. All codes can be found at +https://github.com/xuwkk/task_aware_machine_unlearning. + +
+
+
+
+
+ + ☆ Steerable Conditional Diffusion for Out-of-Distribution Adaptation in + Imaging Inverse Problems + + +
+ Denoising diffusion models have emerged as the go-to framework for solving +inverse problems in imaging. A critical concern regarding these models is their +performance on out-of-distribution (OOD) tasks, which remains an under-explored +challenge. Realistic reconstructions inconsistent with the measured data can be +generated, hallucinating image features that are uniquely present in the +training dataset. To simultaneously enforce data-consistency and leverage +data-driven priors, we introduce a novel sampling framework called Steerable +Conditional Diffusion. This framework adapts the denoising network specifically +to the available measured data. Utilising our proposed method, we achieve +substantial enhancements in OOD performance across diverse imaging modalities, +advancing the robust deployment of denoising diffusion models in real-world +applications. + +
+
+
+
+
+ + ☆ Identifying topology of leaky photonic lattices with machine learning + + +
+ We show how machine learning techniques can be applied for the classification +of topological phases in leaky photonic lattices using limited measurement +data. We propose an approach based solely on bulk intensity measurements, thus +exempt from the need for complicated phase retrieval procedures. In particular, +we design a fully connected neural network that accurately determines +topological properties from the output intensity distribution in dimerized +waveguide arrays with leaky channels, after propagation of a spatially +localized initial excitation at a finite distance, in a setting that closely +emulates realistic experimental conditions. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Semi-Supervised Semantic Depth Estimation using Symbiotic Transformer + and NearFarMix Augmentation WACV 2024 + + +
+ In computer vision, depth estimation is crucial for domains like robotics, +autonomous vehicles, augmented reality, and virtual reality. Integrating +semantics with depth enhances scene understanding through reciprocal +information sharing. However, the scarcity of semantic information in datasets +poses challenges. Existing convolutional approaches with limited local +receptive fields hinder the full utilization of the symbiotic potential between +depth and semantics. This paper introduces a dataset-invariant semi-supervised +strategy to address the scarcity of semantic information. It proposes the Depth +Semantics Symbiosis module, leveraging the Symbiotic Transformer for achieving +comprehensive mutual awareness by information exchange within both local and +global contexts. Additionally, a novel augmentation, NearFarMix is introduced +to combat overfitting and compensate both depth-semantic tasks by strategically +merging regions from two images, generating diverse and structurally consistent +samples with enhanced control. Extensive experiments on NYU-Depth-V2 and KITTI +datasets demonstrate the superiority of our proposed techniques in indoor and +outdoor environments. + +
+
+ comment: Accepted at WACV 2024 +
+
+
+
+
+ + ☆ Biclustering Methods via Sparse Penalty + + +
+ In this paper, we first reviewed several biclustering methods that are used +to identify the most significant clusters in gene expression data. Here we +mainly focused on the SSVD(sparse SVD) method and tried a new sparse penalty +named "Prenet penalty" which has been used only in factor analysis to gain +sparsity. Then in the simulation study, we tried different types of generated +datasets (with different sparsity and dimension) and tried 1-layer +approximation then for k-layers which shows the mixed Prenet penalty is very +effective for non-overlapped data. Finally, we used some real gene expression +data to show the behavior of our methods. + +
+
+
+
+
+ + ☆ Self-Supervision for Tackling Unsupervised Anomaly Detection: Pitfalls + and Opportunities + + +
+ Self-supervised learning (SSL) is a growing torrent that has recently +transformed machine learning and its many real world applications, by learning +on massive amounts of unlabeled data via self-generated supervisory signals. +Unsupervised anomaly detection (AD) has also capitalized on SSL, by +self-generating pseudo-anomalies through various data augmentation functions or +external data exposure. In this vision paper, we first underline the importance +of the choice of SSL strategies on AD performance, by presenting evidences and +studies from the AD literature. Equipped with the understanding that SSL incurs +various hyperparameters (HPs) to carefully tune, we present recent developments +on unsupervised model selection and augmentation tuning for SSL-based AD. We +then highlight emerging challenges and future opportunities; on designing new +pretext tasks and augmentation functions for different data modalities, +creating novel model selection solutions for systematically tuning the SSL HPs, +as well as on capitalizing on the potential of pretrained foundation models on +AD through effective density estimation. + +
+
+
+
+
+ + ☆ Meta Attentive Graph Convolutional Recurrent Network for Traffic + Forecasting + + +
+ Traffic forecasting is a fundamental problem in intelligent transportation +systems. Existing traffic predictors are limited by their expressive power to +model the complex spatial-temporal dependencies in traffic data, mainly due to +the following limitations. Firstly, most approaches are primarily designed to +model the local shared patterns, which makes them insufficient to capture the +specific patterns associated with each node globally. Hence, they fail to learn +each node's unique properties and diversified patterns. Secondly, most existing +approaches struggle to accurately model both short- and long-term dependencies +simultaneously. In this paper, we propose a novel traffic predictor, named Meta +Attentive Graph Convolutional Recurrent Network (MAGCRN). MAGCRN utilizes a +Graph Convolutional Recurrent Network (GCRN) as a core module to model local +dependencies and improves its operation with two novel modules: 1) a +Node-Specific Meta Pattern Learning (NMPL) module to capture node-specific +patterns globally and 2) a Node Attention Weight Generation Module (NAWG) +module to capture short- and long-term dependencies by connecting the +node-specific features with the ones learned initially at each time step during +GCRN operation. Experiments on six real-world traffic datasets demonstrate that +NMPL and NAWG together enable MAGCRN to outperform state-of-the-art baselines +on both short- and long-term predictions. + +
+
+
+
+
+ + ☆ Are Existing Out-Of-Distribution Techniques Suitable for Network + Intrusion Detection? + + +
+ Machine learning (ML) has become increasingly popular in network intrusion +detection. However, ML-based solutions always respond regardless of whether the +input data reflects known patterns, a common issue across safety-critical +applications. While several proposals exist for detecting Out-Of-Distribution +(OOD) in other fields, it remains unclear whether these approaches can +effectively identify new forms of intrusions for network security. New attacks, +not necessarily affecting overall distributions, are not guaranteed to be +clearly OOD as instead, images depicting new classes are in computer vision. In +this work, we investigate whether existing OOD detectors from other fields +allow the identification of unknown malicious traffic. We also explore whether +more discriminative and semantically richer embedding spaces within models, +such as those created with contrastive learning and multi-class tasks, benefit +detection. Our investigation covers a set of six OOD techniques that employ +different detection strategies. These techniques are applied to models trained +in various ways and subsequently exposed to unknown malicious traffic from the +same and different datasets (network environments). Our findings suggest that +existing detectors can identify a consistent portion of new malicious traffic, +and that improved embedding spaces enhance detection. We also demonstrate that +simple combinations of certain detectors can identify almost 100% of malicious +traffic in our tested scenarios. + +
+
+
+
+
+ + ☆ Online Continual Learning on Hierarchical Label Expansion ICCV 2023 + + +
+ Continual learning (CL) enables models to adapt to new tasks and environments +without forgetting previously learned knowledge. While current CL setups have +ignored the relationship between labels in the past task and the new task with +or without small task overlaps, real-world scenarios often involve hierarchical +relationships between old and new tasks, posing another challenge for +traditional CL approaches. To address this challenge, we propose a novel +multi-level hierarchical class incremental task configuration with an online +learning constraint, called hierarchical label expansion (HLE). Our +configuration allows a network to first learn coarse-grained classes, with data +labels continually expanding to more fine-grained classes in various hierarchy +depths. To tackle this new setup, we propose a rehearsal-based method that +utilizes hierarchy-aware pseudo-labeling to incorporate hierarchical class +information. Additionally, we propose a simple yet effective memory management +and sampling strategy that selectively adopts samples of newly encountered +classes. Our experiments demonstrate that our proposed method can effectively +use hierarchy on our HLE setup to improve classification accuracy across all +levels of hierarchies, regardless of depth and class imbalance ratio, +outperforming prior state-of-the-art works by significant margins while also +outperforming them on the conventional disjoint, blurry and i-Blurry CL setups. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Target-independent XLA optimization using Reinforcement Learning NeurIPS 2022 + + +
+ An important challenge in Machine Learning compilers like XLA is multi-pass +optimization and analysis. There has been recent interest chiefly in XLA +target-dependent optimization on the graph-level, subgraph-level, and +kernel-level phases. We specifically focus on target-independent optimization +XLA HLO pass ordering: our approach aims at finding the optimal sequence of +compiler optimization passes, which is decoupled from target-dependent +optimization. However, there is little domain specific study in pass ordering +for XLA HLO. To this end, we propose introducing deep Reinforcement Learning +(RL) based search for optimal XLA HLO pass ordering. We also propose +enhancements to the deep RL algorithms to further improve optimal search +performance and open the research direction for domain-specific guidance for +RL. We create an XLA Gym experimentation framework as a tool to enable RL +algorithms to interact with the compiler for passing optimizations and thereby +train agents. Overall, in our experimentation we observe an average of $13.3\%$ +improvement in operation count reduction on a benchmark of GPT-2 training +graphs and $10.4\%$ improvement on a diverse benchmark including GPT-2, BERT, +and ResNet graphs using the proposed approach over the compiler's default phase +ordering. + +
+
+ comment: Workshop on ML for Systems @ NeurIPS 2022 +
+
+
+
+
+ + ☆ Can Transformer and GNN Help Each Other? + + +
+ Although Transformer has achieved great success in natural language process +and computer vision, it has difficulty generalizing to medium and large-scale +graph data for two important reasons: (i) High complexity. (ii) Failing to +capture the complex and entangled structure information. In graph +representation learning, Graph Neural Networks(GNNs) can fuse the graph +structure and node attributes but have limited receptive fields. Therefore, we +question whether can we combine Transformers and GNNs to help each other. In +this paper, we propose a new model named TransGNN where the Transformer layer +and GNN layer are used alternately to improve each other. Specifically, to +expand the receptive field and disentangle the information aggregation from +edges, we propose using Transformer to aggregate more relevant nodes' +information to improve the message passing of GNNs. Besides, to capture the +graph structure information, we utilize positional encoding and make use of the +GNN layer to fuse the structure into node attributes, which improves the +Transformer in graph data. We also propose to sample the most relevant nodes +for Transformer and two efficient samples update strategies to lower the +complexity. At last, we theoretically prove that TransGNN is more expressive +than GNNs only with extra linear complexity. The experiments on eight datasets +corroborate the effectiveness of TransGNN on node and graph classification +tasks. + +
+
+
+
+
+ + ☆ EdgeMoE: Fast On-Device Inference of MoE-based Large Language Models + + +
+ Large Language Models (LLMs) such as GPTs and LLaMa have ushered in a +revolution in machine intelligence, owing to their exceptional capabilities in +a wide range of machine learning tasks. However, the transition of LLMs from +data centers to edge devices presents a set of challenges and opportunities. +While this shift can enhance privacy and availability, it is hampered by the +enormous parameter sizes of these models, leading to impractical runtime costs. +In light of these considerations, we introduce EdgeMoE, the first on-device +inference engine tailored for mixture-of-expert (MoE) LLMs, a popular variant +of sparse LLMs that exhibit nearly constant computational complexity as their +parameter size scales. EdgeMoE achieves both memory and computational +efficiency by strategically partitioning the model across the storage +hierarchy. Specifically, non-expert weights are stored in the device's memory, +while expert weights are kept in external storage and are fetched into memory +only when they are activated. This design is underpinned by a crucial insight +that expert weights, though voluminous, are infrequently accessed due to sparse +activation patterns. To further mitigate the overhead associated with expert +I/O swapping, EdgeMoE incorporates two innovative techniques: (1) Expert-wise +bitwidth adaptation: This method reduces the size of expert weights with an +acceptable level of accuracy loss. (2) Expert management: It predicts the +experts that will be activated in advance and preloads them into the +compute-I/O pipeline, thus further optimizing the process. In empirical +evaluations conducted on well-established MoE LLMs and various edge devices, +EdgeMoE demonstrates substantial memory savings and performance improvements +when compared to competitive baseline solutions. + +
+
+
+
+
+ + ☆ Simple Modification of the Upper Confidence Bound Algorithm by + Generalized Weighted Averages + + +
+ The multi-armed bandit (MAB) problem is a classical problem that models +sequential decision-making under uncertainty in reinforcement learning. In this +study, we propose a new generalized upper confidence bound (UCB) algorithm +(GWA-UCB1) by extending UCB1, which is a representative algorithm for MAB +problems, using generalized weighted averages, and present an effective +algorithm for various problem settings. GWA-UCB1 is a two-parameter +generalization of the balance between exploration and exploitation in UCB1 and +can be implemented with a simple modification of the UCB1 formula. Therefore, +this algorithm can be easily applied to UCB-based reinforcement learning +models. In preliminary experiments, we investigated the optimal parameters of a +simple generalized UCB1 (G-UCB1), prepared for comparison and GWA-UCB1, in a +stochastic MAB problem with two arms. Subsequently, we confirmed the +performance of the algorithms with the investigated parameters on stochastic +MAB problems when arm reward probabilities were sampled from uniform or normal +distributions and on survival MAB problems assuming more realistic situations. +GWA-UCB1 outperformed G-UCB1, UCB1-Tuned, and Thompson sampling in most problem +settings and can be useful in many situations. The code is available at +https://github.com/manome/python-mab. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Label-free Deep Learning Driven Secure Access Selection in + Space-Air-Ground Integrated Networks + + +
+ In Space-air-ground integrated networks (SAGIN), the inherent openness and +extensive broadcast coverage expose these networks to significant eavesdropping +threats. Considering the inherent co-channel interference due to spectrum +sharing among multi-tier access networks in SAGIN, it can be leveraged to +assist the physical layer security among heterogeneous transmissions. However, +it is challenging to conduct a secrecy-oriented access strategy due to both +heterogeneous resources and different eavesdropping models. In this paper, we +explore secure access selection for a scenario involving multi-mode users +capable of accessing satellites, unmanned aerial vehicles, or base stations in +the presence of eavesdroppers. Particularly, we propose a Q-network +approximation based deep learning approach for selecting the optimal access +strategy for maximizing the sum secrecy rate. Meanwhile, the power optimization +is also carried out by an unsupervised learning approach to improve the secrecy +performance. Remarkably, two neural networks are trained by unsupervised +learning and Q-network approximation which are both label-free methods without +knowing the optimal solution as labels. Numerical results verify the efficiency +of our proposed power optimization approach and access strategy, leading to +enhanced secure transmission performance. + +
+
+
+
+
+ + ☆ Buy when? Survival machine learning model comparison for purchase timing + + +
+ The value of raw data is unlocked by converting it into information and +knowledge that drives decision-making. Machine Learning (ML) algorithms are +capable of analysing large datasets and making accurate predictions. Market +segmentation, client lifetime value, and marketing techniques have all made use +of machine learning. This article examines marketing machine learning +techniques such as Support Vector Machines, Genetic Algorithms, Deep Learning, +and K-Means. ML is used to analyse consumer behaviour, propose items, and make +other customer choices about whether or not to purchase a product or service, +but it is seldom used to predict when a person will buy a product or a basket +of products. In this paper, the survival models Kernel SVM, DeepSurv, Survival +Random Forest, and MTLR are examined to predict tine-purchase individual +decisions. Gender, Income, Location, PurchaseHistory, OnlineBehavior, +Interests, PromotionsDiscounts and CustomerExperience all have an influence on +purchasing time, according to the analysis. The study shows that the DeepSurv +model predicted purchase completion the best. These insights assist marketers +in increasing conversion rates. + +
+
+
+
+
+ + ☆ HRGCN: Heterogeneous Graph-level Anomaly Detection with Hierarchical + Relation-augmented Graph Neural Networks + + +
+ This work considers the problem of heterogeneous graph-level anomaly +detection. Heterogeneous graphs are commonly used to represent behaviours +between different types of entities in complex industrial systems for capturing +as much information about the system operations as possible. Detecting +anomalous heterogeneous graphs from a large set of system behaviour graphs is +crucial for many real-world applications like online web/mobile service and +cloud access control. To address the problem, we propose HRGCN, an unsupervised +deep heterogeneous graph neural network, to model complex heterogeneous +relations between different entities in the system for effectively identifying +these anomalous behaviour graphs. HRGCN trains a hierarchical +relation-augmented Heterogeneous Graph Neural Network (HetGNN), which learns +better graph representations by modelling the interactions among all the system +entities and considering both source-to-destination entity (node) types and +their relation (edge) types. Extensive evaluation on two real-world application +datasets shows that HRGCN outperforms state-of-the-art competing anomaly +detection approaches. We further present a real-world industrial case study to +justify the effectiveness of HRGCN in detecting anomalous (e.g., congested) +network devices in a mobile communication service. HRGCN is available at +https://github.com/jiaxililearn/HRGCN. + +
+
+ comment: 12 pages, 10 figures, 6 tables. Accepted +
+
+
+
+
+ + ☆ Fair Few-shot Learning with Auxiliary Sets ECAI 2023 + + +
+ Recently, there has been a growing interest in developing machine learning +(ML) models that can promote fairness, i.e., eliminating biased predictions +towards certain populations (e.g., individuals from a specific demographic +group). Most existing works learn such models based on well-designed fairness +constraints in optimization. Nevertheless, in many practical ML tasks, only +very few labeled data samples can be collected, which can lead to inferior +fairness performance. This is because existing fairness constraints are +designed to restrict the prediction disparity among different sensitive groups, +but with few samples, it becomes difficult to accurately measure the disparity, +thus rendering ineffective fairness optimization. In this paper, we define the +fairness-aware learning task with limited training samples as the \emph{fair +few-shot learning} problem. To deal with this problem, we devise a novel +framework that accumulates fairness-aware knowledge across different +meta-training tasks and then generalizes the learned knowledge to meta-test +tasks. To compensate for insufficient training samples, we propose an essential +strategy to select and leverage an auxiliary set for each meta-test task. These +auxiliary sets contain several labeled training samples that can enhance the +model performance regarding fairness in meta-test tasks, thereby allowing for +the transfer of learned useful fairness-oriented knowledge to meta-test tasks. +Furthermore, we conduct extensive experiments on three real-world datasets to +validate the superiority of our framework against the state-of-the-art +baselines. + +
+
+ comment: ECAI 2023 +
+
+
+
+
+ + ☆ DiffSmooth: Certifiably Robust Learning via Diffusion Models and Local + Smoothing USENIX Security + + +
+ Diffusion models have been leveraged to perform adversarial purification and +thus provide both empirical and certified robustness for a standard model. On +the other hand, different robustly trained smoothed models have been studied to +improve the certified robustness. Thus, it raises a natural question: Can +diffusion model be used to achieve improved certified robustness on those +robustly trained smoothed models? In this work, we first theoretically show +that recovered instances by diffusion models are in the bounded neighborhood of +the original instance with high probability; and the "one-shot" denoising +diffusion probabilistic models (DDPM) can approximate the mean of the generated +distribution of a continuous-time diffusion model, which approximates the +original instance under mild conditions. Inspired by our analysis, we propose a +certifiably robust pipeline DiffSmooth, which first performs adversarial +purification via diffusion models and then maps the purified instances to a +common region via a simple yet effective local smoothing strategy. We conduct +extensive experiments on different datasets and show that DiffSmooth achieves +SOTA-certified robustness compared with eight baselines. For instance, +DiffSmooth improves the SOTA-certified accuracy from $36.0\%$ to $53.0\%$ under +$\ell_2$ radius $1.5$ on ImageNet. The code is available at +[https://github.com/javyduck/DiffSmooth]. + +
+
+ comment: Accepted in 32nd USENIX Security, 2023 +
+
+
+
+
+ + ☆ Reinforcement Learning for Generative AI: A Survey + + +
+ Deep Generative AI has been a long-standing essential topic in the machine +learning community, which can impact a number of application areas like text +generation and computer vision. The major paradigm to train a generative model +is maximum likelihood estimation, which pushes the learner to capture and +approximate the target data distribution by decreasing the divergence between +the model distribution and the target distribution. This formulation +successfully establishes the objective of generative tasks, while it is +incapable of satisfying all the requirements that a user might expect from a +generative model. Reinforcement learning, serving as a competitive option to +inject new training signals by creating new objectives that exploit novel +signals, has demonstrated its power and flexibility to incorporate human +inductive bias from multiple angles, such as adversarial learning, +hand-designed rules and learned reward model to build a performant model. +Thereby, reinforcement learning has become a trending research field and has +stretched the limits of generative AI in both model design and application. It +is reasonable to summarize and conclude advances in recent years with a +comprehensive review. Although there are surveys in different application areas +recently, this survey aims to shed light on a high-level review that spans a +range of application areas. We provide a rigorous taxonomy in this area and +make sufficient coverage on various models and applications. Notably, we also +surveyed the fast-developing large language model area. We conclude this survey +by showing the potential directions that might tackle the limit of current +models and expand the frontiers for generative AI. + +
+
+
+
+
+ + ☆ Machine Unlearning Methodology base on Stochastic Teacher Network + + +
+ The rise of the phenomenon of the "right to be forgotten" has prompted +research on machine unlearning, which grants data owners the right to actively +withdraw data that has been used for model training, and requires the +elimination of the contribution of that data to the model. A simple method to +achieve this is to use the remaining data to retrain the model, but this is not +acceptable for other data owners who continue to participate in training. +Existing machine unlearning methods have been found to be ineffective in +quickly removing knowledge from deep learning models. This paper proposes using +a stochastic network as a teacher to expedite the mitigation of the influence +caused by forgotten data on the model. We performed experiments on three +datasets, and the findings demonstrate that our approach can efficiently +mitigate the influence of target data on the model within a single epoch. This +allows for one-time erasure and reconstruction of the model, and the +reconstruction model achieves the same performance as the retrained model. + +
+
+ comment: Accepted by 19th International Conference on Advanced Data Mining and + Applications. (ADMA 2023) +
+
+
+
+
+ + ☆ Policy Diversity for Cooperative Agents + + +
+ Standard cooperative multi-agent reinforcement learning (MARL) methods aim to +find the optimal team cooperative policy to complete a task. However there may +exist multiple different ways of cooperating, which usually are very needed by +domain experts. Therefore, identifying a set of significantly different +policies can alleviate the task complexity for them. Unfortunately, there is a +general lack of effective policy diversity approaches specifically designed for +the multi-agent domain. In this work, we propose a method called +Moment-Matching Policy Diversity to alleviate this problem. This method can +generate different team policies to varying degrees by formalizing the +difference between team policies as the difference in actions of selected +agents in different policies. Theoretically, we show that our method is a +simple way to implement a constrained optimization problem that regularizes the +difference between two trajectory distributions by using the maximum mean +discrepancy. The effectiveness of our approach is demonstrated on a challenging +team-based shooter. + +
+
+
+
+
+ + ☆ Solving Attention Kernel Regression Problem via Pre-conditioner + + +
+ Large language models have shown impressive performance in many tasks. One of +the major features from the computation perspective is computing the attention +matrix. Previous works [Zandieh, Han, Daliri, and Karba 2023, Alman and Song +2023] have formally studied the possibility and impossibility of approximating +the attention matrix. In this work, we define and study a new problem which is +called the attention kernel regression problem. We show how to solve the +attention kernel regression in the input sparsity time of the data matrix. + +
+
+
+
+
+ + ☆ Traffic Light Control with Reinforcement Learning + + +
+ Traffic light control is important for reducing congestion in urban mobility +systems. This paper proposes a real-time traffic light control method using +deep Q learning. Our approach incorporates a reward function considering queue +lengths, delays, travel time, and throughput. The model dynamically decides +phase changes based on current traffic conditions. The training of the deep Q +network involves an offline stage from pre-generated data with fixed schedules +and an online stage using real-time traffic data. A deep Q network structure +with a "phase gate" component is used to simplify the model's learning task +under different phases. A "memory palace" mechanism is used to address sample +imbalance during the training process. We validate our approach using both +synthetic and real-world traffic flow data on a road intersecting in Hangzhou, +China. Results demonstrate significant performance improvements of the proposed +method in reducing vehicle waiting time (57.1% to 100%), queue lengths (40.9% +to 100%), and total travel time (16.8% to 68.0%) compared to traditional fixed +signal plans. + +
+
+
+
+
+ + ☆ Goodhart's Law Applies to NLP's Explanation Benchmarks + + +
+ Despite the rising popularity of saliency-based explanations, the research +community remains at an impasse, facing doubts concerning their purpose, +efficacy, and tendency to contradict each other. Seeking to unite the +community's efforts around common goals, several recent works have proposed +evaluation metrics. In this paper, we critically examine two sets of metrics: +the ERASER metrics (comprehensiveness and sufficiency) and the EVAL-X metrics, +focusing our inquiry on natural language processing. First, we show that we can +inflate a model's comprehensiveness and sufficiency scores dramatically without +altering its predictions or explanations on in-distribution test inputs. Our +strategy exploits the tendency for extracted explanations and their complements +to be "out-of-support" relative to each other and in-distribution inputs. Next, +we demonstrate that the EVAL-X metrics can be inflated arbitrarily by a simple +method that encodes the label, even though EVAL-X is precisely motivated to +address such exploits. Our results raise doubts about the ability of current +metrics to guide explainability research, underscoring the need for a broader +reassessment of what precisely these metrics are intended to capture. + +
+
+
+
+
+ + ☆ Unleash Model Potential: Bootstrapped Meta Self-supervised Learning NIPS + + +
+ The long-term goal of machine learning is to learn general visual +representations from a small amount of data without supervision, mimicking +three advantages of human cognition: i) no need for labels, ii) robustness to +data scarcity, and iii) learning from experience. Self-supervised learning and +meta-learning are two promising techniques to achieve this goal, but they both +only partially capture the advantages and fail to address all the problems. +Self-supervised learning struggles to overcome the drawbacks of data scarcity, +while ignoring prior knowledge that can facilitate learning and generalization. +Meta-learning relies on supervised information and suffers from a bottleneck of +insufficient learning. To address these issues, we propose a novel Bootstrapped +Meta Self-Supervised Learning (BMSSL) framework that aims to simulate the human +learning process. We first analyze the close relationship between meta-learning +and self-supervised learning. Based on this insight, we reconstruct tasks to +leverage the strengths of both paradigms, achieving advantages i and ii. +Moreover, we employ a bi-level optimization framework that alternates between +solving specific tasks with a learned ability (first level) and improving this +ability (second level), attaining advantage iii. To fully harness its power, we +introduce a bootstrapped target based on meta-gradient to make the model its +own teacher. We validate the effectiveness of our approach with comprehensive +theoretical and empirical study. + +
+
+ comment: submitted to NIPS +
+
+
+
+
+ + ☆ Breaking Boundaries: Distributed Domain Decomposition with Scalable + Physics-Informed Neural PDE Solvers + + +
+ Mosaic Flow is a novel domain decomposition method designed to scale +physics-informed neural PDE solvers to large domains. Its unique approach +leverages pre-trained networks on small domains to solve partial differential +equations on large domains purely through inference, resulting in high +reusability. This paper presents an end-to-end parallelization of Mosaic Flow, +combining data parallel training and domain parallelism for inference on +large-scale problems. By optimizing the network architecture and data parallel +training, we significantly reduce the training time for learning the Laplacian +operator to minutes on 32 GPUs. Moreover, our distributed domain decomposition +algorithm enables scalable inferences for solving the Laplace equation on +domains 4096 times larger than the training domain, demonstrating strong +scaling while maintaining accuracy on 32 GPUs. The reusability of Mosaic Flow, +combined with the improved performance achieved through the distributed-memory +algorithms, makes it a promising tool for modeling complex physical phenomena +and accelerating scientific discovery. + +
+
+
+
+
+ + ☆ The Promise and Peril of Artificial Intelligence -- Violet Teaming + Offers a Balanced Path Forward + + +
+ Artificial intelligence (AI) promises immense benefits across sectors, yet +also poses risks from dual-use potentials, biases, and unintended behaviors. +This paper reviews emerging issues with opaque and uncontrollable AI systems +and proposes an integrative framework called violet teaming to develop reliable +and responsible AI. Violet teaming combines adversarial vulnerability probing +(red teaming) with solutions for safety and security (blue teaming) while +prioritizing ethics and social benefit. It emerged from AI safety research to +manage risks proactively by design. The paper traces the evolution of red, +blue, and purple teaming toward violet teaming, and then discusses applying +violet techniques to address biosecurity risks of AI in biotechnology. +Additional sections review key perspectives across law, ethics, cybersecurity, +macrostrategy, and industry best practices essential for operationalizing +responsible AI through holistic technical and social considerations. Violet +teaming provides both philosophy and method for steering AI trajectories toward +societal good. With conscience and wisdom, the extraordinary capabilities of AI +can enrich humanity. But without adequate precaution, the risks could prove +catastrophic. Violet teaming aims to empower moral technology for the common +welfare. + +
+
+ comment: 14 pages, 1 figure +
+
+
+
+
+ + ☆ Rule-Based Error Detection and Correction to Operationalize Movement + Trajectory Classification + + +
+ Classification of movement trajectories has many applications in +transportation. Supervised neural models represent the current +state-of-the-art. Recent security applications require this task to be rapidly +employed in environments that may differ from the data used to train such +models for which there is little training data. We provide a neuro-symbolic +rule-based framework to conduct error correction and detection of these models +to support eventual deployment in security applications. We provide a suite of +experiments on several recent and state-of-the-art models and show an accuracy +improvement of 1.7% over the SOTA model in the case where all classes are +present in training and when 40% of classes are omitted from training, we +obtain a 5.2% improvement (zero-shot) and 23.9% (few-shot) improvement over the +SOTA model without resorting to retraining of the base model. + +
+
+
+
+
+ + ☆ A Comparison of Personalized and Generalized Approaches to Emotion + Recognition Using Consumer Wearable Devices: Machine Learning Study + + +
+ Background: Studies have shown the potential adverse health effects, ranging +from headaches to cardiovascular disease, associated with long-term negative +emotions and chronic stress. Since many indicators of stress are imperceptible +to observers, the early detection and intervention of stress remains a pressing +medical need. Physiological signals offer a non-invasive method of monitoring +emotions and are easily collected by smartwatches. Existing research primarily +focuses on developing generalized machine learning-based models for emotion +classification. Objective: We aim to study the differences between personalized +and generalized machine learning models for three-class emotion classification +(neutral, stress, and amusement) using wearable biosignal data. Methods: We +developed a convolutional encoder for the three-class emotion classification +problem using data from WESAD, a multimodal dataset with physiological signals +for 15 subjects. We compared the results between a subject-exclusive +generalized, subject-inclusive generalized, and personalized model. Results: +For the three-class classification problem, our personalized model achieved an +average accuracy of 95.06% and F1-score of 91.71, our subject-inclusive +generalized model achieved an average accuracy of 66.95% and F1-score of 42.50, +and our subject-exclusive generalized model achieved an average accuracy of +67.65% and F1-score of 43.05. Conclusions: Our results emphasize the need for +increased research in personalized emotion recognition models given that they +outperform generalized models in certain contexts. We also demonstrate that +personalized machine learning models for emotion classification are viable and +can achieve high performance. + +
+
+
+
+
+ + ☆ Quantum Next Generation Reservoir Computing: An Efficient Quantum + Algorithm for Forecasting Quantum Dynamics + + +
+ Next Generation Reservoir Computing (NG-RC) is a modern class of model-free +machine learning that enables an accurate forecasting of time series data +generated by dynamical systems. We demonstrate that NG-RC can accurately +predict full many-body quantum dynamics, instead of merely concentrating on the +dynamics of observables, which is the conventional application of reservoir +computing. In addition, we apply a technique which we refer to as skipping +ahead to predict far future states accurately without the need to extract +information about the intermediate states. However, adopting a classical NG-RC +for many-body quantum dynamics prediction is computationally prohibitive due to +the large Hilbert space of sample input data. In this work, we propose an +end-to-end quantum algorithm for many-body quantum dynamics forecasting with a +quantum computational speedup via the block-encoding technique. This proposal +presents an efficient model-free quantum scheme to forecast quantum dynamics +coherently, bypassing inductive biases incurred in a model-based approach. + +
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ☆ Reinforcement Learning for Sampling on Temporal Medical Imaging + Sequences ICML 2023 + + +
+ Accelerated magnetic resonance imaging resorts to either Fourier-domain +subsampling or better reconstruction algorithms to deal with fewer measurements +while still generating medical images of high quality. Determining the optimal +sampling strategy given a fixed reconstruction protocol often has combinatorial +complexity. In this work, we apply double deep Q-learning and REINFORCE +algorithms to learn the sampling strategy for dynamic image reconstruction. We +consider the data in the format of time series, and the reconstruction method +is a pre-trained autoencoder-typed neural network. We present a proof of +concept that reinforcement learning algorithms are effective to discover the +optimal sampling pattern which underlies the pre-trained reconstructor network +(i.e., the dynamics in the environment). The code for replicating experiments +can be found at https://github.com/zhishenhuang/RLsamp. + +
+
+ comment: ICML 2023 Workshop SODS +
+
+
+
+
+ + ☆ Noise-Free Sampling Algorithms via Regularized Wasserstein Proximals + + +
+ We consider the problem of sampling from a distribution governed by a +potential function. This work proposes an explicit score-based MCMC method that +is deterministic, resulting in a deterministic evolution for particles rather +than a stochastic differential equation evolution. The score term is given in +closed form by a regularized Wasserstein proximal, using a kernel convolution +that is approximated by sampling. We demonstrate fast convergence on various +problems and show improved dimensional dependence of mixing time bounds for the +case of Gaussian distributions compared to the unadjusted Langevin algorithm +(ULA) and the Metropolis-adjusted Langevin algorithm (MALA). We additionally +derive closed form expressions for the distributions at each iterate for +quadratic potential functions, characterizing the variance reduction. Empirical +results demonstrate that the particles behave in an organized manner, lying on +level set contours of the potential. Moreover, the posterior mean estimator of +the proposed method is shown to be closer to the maximum a-posteriori estimator +compared to ULA and MALA, in the context of Bayesian logistic regression. + +
+
+
+
+
+ + ☆ Entropy-based Guidance of Deep Neural Networks for Accelerated + Convergence and Improved Performance + + +
+ Neural networks have dramatically increased our capacity to learn from large, +high-dimensional datasets across innumerable disciplines. However, their +decisions are not easily interpretable, their computational costs are high, and +building and training them are uncertain processes. To add structure to these +efforts, we derive new mathematical results to efficiently measure the changes +in entropy as fully-connected and convolutional neural networks process data, +and introduce entropy-based loss terms. Experiments in image compression and +image classification on benchmark datasets demonstrate these losses guide +neural networks to learn rich latent data representations in fewer dimensions, +converge in fewer training epochs, and achieve better test metrics. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Application of Quantum Pre-Processing Filter for Binary Image + Classification with Small Samples + + +
+ Over the past few years, there has been significant interest in Quantum +Machine Learning (QML) among researchers, as it has the potential to transform +the field of machine learning. Several models that exploit the properties of +quantum mechanics have been developed for practical applications. In this +study, we investigated the application of our previously proposed quantum +pre-processing filter (QPF) to binary image classification. We evaluated the +QPF on four datasets: MNIST (handwritten digits), EMNIST (handwritten digits +and alphabets), CIFAR-10 (photographic images) and GTSRB (real-life traffic +sign images). Similar to our previous multi-class classification results, the +application of QPF improved the binary image classification accuracy using +neural network against MNIST, EMNIST, and CIFAR-10 from 98.9% to 99.2%, 97.8% +to 98.3%, and 71.2% to 76.1%, respectively, but degraded it against GTSRB from +93.5% to 92.0%. We then applied QPF in cases using a smaller number of training +and testing samples, i.e. 80 and 20 samples per class, respectively. In order +to derive statistically stable results, we conducted the experiment with 100 +trials choosing randomly different training and testing samples and averaging +the results. The result showed that the application of QPF did not improve the +image classification accuracy against MNIST and EMNIST but improved it against +CIFAR-10 and GTSRB from 65.8% to 67.2% and 90.5% to 91.8%, respectively. +Further research will be conducted as part of future work to investigate the +potential of QPF to assess the scalability of the proposed approach to larger +and complex datasets. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Maestro: Uncovering Low-Rank Structures via Trainable Decomposition + + +
+ Deep Neural Networks (DNNs) have been a large driver and enabler for AI +breakthroughs in recent years. These models have been getting larger in their +attempt to become more accurate and tackle new upcoming use-cases, including +AR/VR and intelligent assistants. However, the training process of such large +models is a costly and time-consuming process, which typically yields a single +model to fit all targets. To mitigate this, various techniques have been +proposed in the literature, including pruning, sparsification or quantization +of the model weights and updates. While able to achieve high compression rates, +they often incur computational overheads or accuracy penalties. Alternatively, +factorization methods have been leveraged to incorporate low-rank compression +in the training process. Similarly, such techniques (e.g.,~SVD) frequently rely +on the computationally expensive decomposition of layers and are potentially +sub-optimal for non-linear models, such as DNNs. In this work, we take a +further step in designing efficient low-rank models and propose Maestro, a +framework for trainable low-rank layers. Instead of regularly applying a priori +decompositions such as SVD, the low-rank structure is built into the training +process through a generalized variant of Ordered Dropout. This method imposes +an importance ordering via sampling on the decomposed DNN structure. Our +theoretical analysis demonstrates that our method recovers the SVD +decomposition of linear mapping on uniformly distributed data and PCA for +linear autoencoders. We further apply our technique on DNNs and empirically +illustrate that Maestro enables the extraction of lower footprint models that +preserve model performance while allowing for graceful accuracy-latency +tradeoff for the deployment to devices of different capabilities. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Optimal Economic Gas Turbine Dispatch with Deep Reinforcement Learning + + +
+ Dispatching strategies for gas turbines (GTs) are changing in modern +electricity grids. A growing incorporation of intermittent renewable energy +requires GTs to operate more but shorter cycles and more frequently on partial +loads. Deep reinforcement learning (DRL) has recently emerged as a tool that +can cope with this development and dispatch GTs economically. The key +advantages of DRL are a model-free optimization and the ability to handle +uncertainties, such as those introduced by varying loads or renewable energy +production. In this study, three popular DRL algorithms are implemented for an +economic GT dispatch problem on a case study in Alberta, Canada. We highlight +the benefits of DRL by incorporating an existing thermodynamic software +provided by Siemens Energy into the environment model and by simulating +uncertainty via varying electricity prices, loads, and ambient conditions. +Among the tested algorithms and baseline methods, Deep Q-Networks (DQN) +obtained the highest rewards while Proximal Policy Optimization (PPO) was the +most sample efficient. We further propose and implement a method to assign GT +operation and maintenance cost dynamically based on operating hours and cycles. +Compared to existing methods, our approach better approximates the true cost of +modern GT dispatch and hence leads to more realistic policies. + +
+
+ comment: This work has been accepted to IFAC for publication under a Creative + Commons Licence CC-BY-NC-ND +
+
+
+
+
+ + ☆ Gender bias and stereotypes in Large Language Models + + +
+ Large Language Models (LLMs) have made substantial progress in the past +several months, shattering state-of-the-art benchmarks in many domains. This +paper investigates LLMs' behavior with respect to gender stereotypes, a known +issue for prior models. We use a simple paradigm to test the presence of gender +bias, building on but differing from WinoBias, a commonly used gender bias +dataset, which is likely to be included in the training data of current LLMs. +We test four recently published LLMs and demonstrate that they express biased +assumptions about men and women's occupations. Our contributions in this paper +are as follows: (a) LLMs are 3-6 times more likely to choose an occupation that +stereotypically aligns with a person's gender; (b) these choices align with +people's perceptions better than with the ground truth as reflected in official +job statistics; (c) LLMs in fact amplify the bias beyond what is reflected in +perceptions or the ground truth; (d) LLMs ignore crucial ambiguities in +sentence structure 95% of the time in our study items, but when explicitly +prompted, they recognize the ambiguity; (e) LLMs provide explanations for their +choices that are factually inaccurate and likely obscure the true reason behind +their predictions. That is, they provide rationalizations of their biased +behavior. This highlights a key property of these models: LLMs are trained on +imbalanced datasets; as such, even with the recent successes of reinforcement +learning with human feedback, they tend to reflect those imbalances back at us. +As with other types of societal biases, we suggest that LLMs must be carefully +tested to ensure that they treat minoritized individuals and communities +equitably. + +
+
+ comment: ACM Collective Intelligence +
+
+
+
+
+ + ☆ Matbench Discovery -- An evaluation framework for machine learning + crystal stability prediction + + +
+ Matbench Discovery simulates the deployment of machine learning (ML) energy +models in a high-throughput search for stable inorganic crystals. We address +the disconnect between (i) thermodynamic stability and formation energy and +(ii) in-domain vs out-of-distribution performance. Alongside this paper, we +publish a Python package to aid with future model submissions and a growing +online leaderboard with further insights into trade-offs between various +performance metrics. To answer the question which ML methodology performs best +at materials discovery, our initial release explores a variety of models +including random forests, graph neural networks (GNN), one-shot predictors, +iterative Bayesian optimizers and universal interatomic potentials (UIP). +Ranked best-to-worst by their test set F1 score on thermodynamic stability +prediction, we find CHGNet > M3GNet > MACE > ALIGNN > MEGNet > CGCNN > CGCNN+P +> Wrenformer > BOWSR > Voronoi tessellation fingerprints with random forest. +The top 3 models are UIPs, the winning methodology for ML-guided materials +discovery, achieving F1 scores of ~0.6 for crystal stability classification and +discovery acceleration factors (DAF) of up to 5x on the first 10k most stable +predictions compared to dummy selection from our test set. We also highlight a +sharp disconnect between commonly used global regression metrics and more +task-relevant classification metrics. Accurate regressors are susceptible to +unexpectedly high false-positive rates if those accurate predictions lie close +to the decision boundary at 0 eV/atom above the convex hull where most +materials are. Our results highlight the need to focus on classification +metrics that actually correlate with improved stability hit rate. + +
+
+ comment: 18 pages, 9 figures, 3 tables +
+
+
+
+
+ + ☆ On Reward Structures of Markov Decision Processes + + +
+ A Markov decision process can be parameterized by a transition kernel and a +reward function. Both play essential roles in the study of reinforcement +learning as evidenced by their presence in the Bellman equations. In our +inquiry of various kinds of ``costs'' associated with reinforcement learning +inspired by the demands in robotic applications, rewards are central to +understanding the structure of a Markov decision process and reward-centric +notions can elucidate important concepts in reinforcement learning. +Specifically, we studied the sample complexity of policy evaluation and +developed a novel estimator with an instance-specific error bound of +$\tilde{O}(\sqrt{\frac{\tau_s}{n}})$ for estimating a single state value. Under +the online regret minimization setting, we refined the transition-based MDP +constant, diameter, into a reward-based constant, maximum expected hitting +cost, and with it, provided a theoretical explanation for how a well-known +technique, potential-based reward shaping, could accelerate learning with +expert knowledge. In an attempt to study safe reinforcement learning, we +modeled hazardous environments with irrecoverability and proposed a +quantitative notion of safe learning via reset efficiency. In this setting, we +modified a classic algorithm to account for resets achieving promising +preliminary numerical results. Lastly, for MDPs with multiple reward functions, +we developed a planning algorithm that computationally efficiently finds Pareto +optimal stochastic policies. + +
+
+ comment: This PhD thesis draws heavily from arXiv:1907.02114 and + arXiv:2002.06299 +
+
+
+
+
+ + ☆ RecRec: Algorithmic Recourse for Recommender Systems CIKM 2023 + + +
+ Recommender systems play an essential role in the choices people make in +domains such as entertainment, shopping, food, news, employment, and education. +The machine learning models underlying these recommender systems are often +enormously large and black-box in nature for users, content providers, and +system developers alike. It is often crucial for all stakeholders to understand +the model's rationale behind making certain predictions and recommendations. +This is especially true for the content providers whose livelihoods depend on +the recommender system. Drawing motivation from the practitioners' need, in +this work, we propose a recourse framework for recommender systems, targeted +towards the content providers. Algorithmic recourse in the recommendation +setting is a set of actions that, if executed, would modify the recommendations +(or ranking) of an item in the desired manner. A recourse suggests actions of +the form: "if a feature changes X to Y, then the ranking of that item for a set +of users will change to Z." Furthermore, we demonstrate that RecRec is highly +effective in generating valid, sparse, and actionable recourses through an +empirical evaluation of recommender systems trained on three real-world +datasets. To the best of our knowledge, this work is the first to conceptualize +and empirically test a generalized framework for generating recourses for +recommender systems. + +
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Pruning Self-Attention for Zero-Shot Multi-Speaker Text-to-Speech INTERSPEECH 2023 + + +
+ For personalized speech generation, a neural text-to-speech (TTS) model must +be successfully implemented with limited data from a target speaker. To this +end, the baseline TTS model needs to be amply generalized to out-of-domain data +(i.e., target speaker's speech). However, approaches to address this +out-of-domain generalization problem in TTS have yet to be thoroughly studied. +In this work, we propose an effective pruning method for a transformer known as +sparse attention, to improve the TTS model's generalization abilities. In +particular, we prune off redundant connections from self-attention layers whose +attention weights are below the threshold. To flexibly determine the pruning +strength for searching optimal degree of generalization, we also propose a new +differentiable pruning method that allows the model to automatically learn the +thresholds. Evaluations on zero-shot multi-speaker TTS verify the effectiveness +of our method in terms of voice quality and speaker similarity. + +
+
+ comment: INTERSPEECH 2023 +
+
+
+
+
+ + ☆ BayOTIDE: Bayesian Online Multivariate Time series Imputation with + functional decomposition + + +
+ In real-world scenarios like traffic and energy, massive time-series data +with missing values and noises are widely observed, even sampled irregularly. +While many imputation methods have been proposed, most of them work with a +local horizon, which means models are trained by splitting the long sequence +into batches of fit-sized patches. This local horizon can make models ignore +global trends or periodic patterns. More importantly, almost all methods assume +the observations are sampled at regular time stamps, and fail to handle complex +irregular sampled time series arising from different applications. Thirdly, +most existing methods are learned in an offline manner. Thus, it is not +suitable for many applications with fast-arriving streaming data. To overcome +these limitations, we propose \ours: Bayesian Online Multivariate Time series +Imputation with functional decomposition. We treat the multivariate time series +as the weighted combination of groups of low-rank temporal factors with +different patterns. We apply a group of Gaussian Processes (GPs) with different +kernels as functional priors to fit the factors. For computational efficiency, +we further convert the GPs into a state-space prior by constructing an +equivalent stochastic differential equation (SDE), and developing a scalable +algorithm for online inference. The proposed method can not only handle +imputation over arbitrary time stamps, but also offer uncertainty +quantification and interpretability for the downstream application. We evaluate +our method on both synthetic and real-world datasets. + +
+
+
+
+
+ + ☆ Maturity-Aware Active Learning for Semantic Segmentation with + Hierarchically-Adaptive Sample Assessment BMVC 2023 + + +
+ Active Learning (AL) for semantic segmentation is challenging due to heavy +class imbalance and different ways of defining "sample" (pixels, areas, etc.), +leaving the interpretation of the data distribution ambiguous. We propose +"Maturity-Aware Distribution Breakdown-based Active Learning'' (MADBAL), an AL +method that benefits from a hierarchical approach to define a multiview data +distribution, which takes into account the different "sample" definitions +jointly, hence able to select the most impactful segmentation pixels with +comprehensive understanding. MADBAL also features a novel uncertainty +formulation, where AL supporting modules are included to sense the features' +maturity whose weighted influence continuously contributes to the uncertainty +detection. In this way, MADBAL makes significant performance leaps even in the +early AL stage, hence reducing the training burden significantly. It +outperforms state-of-the-art methods on Cityscapes and PASCAL VOC datasets as +verified in our extensive experiments. + +
+
+ comment: Accepted to the 34th British Machine Vision Conference (BMVC 2023) +
+
+
+
+
+ + ☆ Ad-Rec: Advanced Feature Interactions to Address Covariate-Shifts in + Recommendation Networks + + +
+ Recommendation models are vital in delivering personalized user experiences +by leveraging the correlation between multiple input features. However, deep +learning-based recommendation models often face challenges due to evolving user +behaviour and item features, leading to covariate shifts. Effective +cross-feature learning is crucial to handle data distribution drift and +adapting to changing user behaviour. Traditional feature interaction techniques +have limitations in achieving optimal performance in this context. + This work introduces Ad-Rec, an advanced network that leverages feature +interaction techniques to address covariate shifts. This helps eliminate +irrelevant interactions in recommendation tasks. Ad-Rec leverages masked +transformers to enable the learning of higher-order cross-features while +mitigating the impact of data distribution drift. Our approach improves model +quality, accelerates convergence, and reduces training time, as measured by the +Area Under Curve (AUC) metric. We demonstrate the scalability of Ad-Rec and its +ability to achieve superior model quality through comprehensive ablation +studies. + +
+
+
+
+
+ + ☆ Statistically Efficient Variance Reduction with Double Policy Estimation + for Off-Policy Evaluation in Sequence-Modeled Reinforcement Learning + + +
+ Offline reinforcement learning aims to utilize datasets of previously +gathered environment-action interaction records to learn a policy without +access to the real environment. Recent work has shown that offline +reinforcement learning can be formulated as a sequence modeling problem and +solved via supervised learning with approaches such as decision transformer. +While these sequence-based methods achieve competitive results over +return-to-go methods, especially on tasks that require longer episodes or with +scarce rewards, importance sampling is not considered to correct the policy +bias when dealing with off-policy data, mainly due to the absence of behavior +policy and the use of deterministic evaluation policies. To this end, we +propose DPE: an RL algorithm that blends offline sequence modeling and offline +reinforcement learning with Double Policy Estimation (DPE) in a unified +framework with statistically proven properties on variance reduction. We +validate our method in multiple tasks of OpenAI Gym with D4RL benchmarks. Our +method brings a performance improvements on selected methods which outperforms +SOTA baselines in several tasks, demonstrating the advantages of enabling +double policy estimation for sequence-modeled reinforcement learning. + +
+
+
+
+
+ + ☆ Conformal Meta-learners for Predictive Inference of Individual Treatment + Effects + + +
+ We investigate the problem of machine learning-based (ML) predictive +inference on individual treatment effects (ITEs). Previous work has focused +primarily on developing ML-based meta-learners that can provide point estimates +of the conditional average treatment effect (CATE); these are model-agnostic +approaches for combining intermediate nuisance estimates to produce estimates +of CATE. In this paper, we develop conformal meta-learners, a general framework +for issuing predictive intervals for ITEs by applying the standard conformal +prediction (CP) procedure on top of CATE meta-learners. We focus on a broad +class of meta-learners based on two-stage pseudo-outcome regression and develop +a stochastic ordering framework to study their validity. We show that inference +with conformal meta-learners is marginally valid if their (pseudo outcome) +conformity scores stochastically dominate oracle conformity scores evaluated on +the unobserved ITEs. Additionally, we prove that commonly used CATE +meta-learners, such as the doubly-robust learner, satisfy a model- and +distribution-free stochastic (or convex) dominance condition, making their +conformal inferences valid for practically-relevant levels of target coverage. +Whereas existing procedures conduct inference on nuisance parameters (i.e., +potential outcomes) via weighted CP, conformal meta-learners enable direct +inference on the target parameter (ITE). Numerical experiments show that +conformal meta-learners provide valid intervals with competitive efficiency +while retaining the favorable point estimation properties of CATE +meta-learners. + +
+
+
+
+
+ + ☆ When hard negative sampling meets supervised contrastive learning + + +
+ State-of-the-art image models predominantly follow a two-stage strategy: +pre-training on large datasets and fine-tuning with cross-entropy loss. Many +studies have shown that using cross-entropy can result in sub-optimal +generalisation and stability. While the supervised contrastive loss addresses +some limitations of cross-entropy loss by focusing on intra-class similarities +and inter-class differences, it neglects the importance of hard negative +mining. We propose that models will benefit from performance improvement by +weighting negative samples based on their dissimilarity to positive +counterparts. In this paper, we introduce a new supervised contrastive learning +objective, SCHaNe, which incorporates hard negative sampling during the +fine-tuning phase. Without requiring specialized architectures, additional +data, or extra computational resources, experimental results indicate that +SCHaNe outperforms the strong baseline BEiT-3 in Top-1 accuracy across various +benchmarks, with significant gains of up to $3.32\%$ in few-shot learning +settings and $3.41\%$ in full dataset fine-tuning. Importantly, our proposed +objective sets a new state-of-the-art for base models on ImageNet-1k, achieving +an 86.14\% accuracy. Furthermore, we demonstrate that the proposed objective +yields better embeddings and explains the improved effectiveness observed in +our experiments. + +
+
+
+
+
+ + ♻ ☆ Decentralized Multi-Agent Reinforcement Learning with Global State + Prediction + + +
+ Deep reinforcement learning (DRL) has seen remarkable success in the control +of single robots. However, applying DRL to robot swarms presents significant +challenges. A critical challenge is non-stationarity, which occurs when two or +more robots update individual or shared policies concurrently, thereby engaging +in an interdependent training process with no guarantees of convergence. +Circumventing non-stationarity typically involves training the robots with +global information about other agents' states and/or actions. In contrast, in +this paper we explore how to remove the need for global information. We pose +our problem as a Partially Observable Markov Decision Process, due to the +absence of global knowledge on other agents. Using collective transport as a +testbed scenario, we study two approaches to multi-agent training. In the +first, the robots exchange no messages, and are trained to rely on implicit +communication through push-and-pull on the object to transport. In the second +approach, we introduce Global State Prediction (GSP), a network trained to +forma a belief over the swarm as a whole and predict its future states. We +provide a comprehensive study over four well-known deep reinforcement learning +algorithms in environments with obstacles, measuring performance as the +successful transport of the object to the goal within a desired time-frame. +Through an ablation study, we show that including GSP boosts performance and +increases robustness when compared with methods that use global knowledge. + +
+
+
+
+
+ + ♻ ☆ Revisiting mass-radius relationships for exoplanet populations: a + machine learning insight + + +
+ The growing number of exoplanet discoveries and advances in machine learning +techniques have opened new avenues for exploring and understanding the +characteristics of worlds beyond our Solar System. In this study, we employ +efficient machine learning approaches to analyze a dataset comprising 762 +confirmed exoplanets and eight Solar System planets, aiming to characterize +their fundamental quantities. By applying different unsupervised clustering +algorithms, we classify the data into two main classes: 'small' and 'giant' +planets, with cut-off values at $R_{p}=8.13R_{\oplus}$ and +$M_{p}=52.48M_{\oplus}$. This classification reveals an intriguing distinction: +giant planets have lower densities, suggesting higher H-He mass fractions, +while small planets are denser, composed mainly of heavier elements. We apply +various regression models to uncover correlations between physical parameters +and their predictive power for exoplanet radius. Our analysis highlights that +planetary mass, orbital period, and stellar mass play crucial roles in +predicting exoplanet radius. Among the models evaluated, the Support Vector +Regression consistently outperforms others, demonstrating its promise for +obtaining accurate planetary radius estimates. Furthermore, we derive +parametric equations using the M5P and Markov Chain Monte Carlo methods. +Notably, our study reveals a noteworthy result: small planets exhibit a +positive linear mass-radius relation, aligning with previous findings. +Conversely, for giant planets, we observe a strong correlation between +planetary radius and the mass of their host stars, which might provide +intriguing insights into the relationship between giant planet formation and +stellar characteristics. + +
+
+ comment: Accepted for publication in MNRAS. 17 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ The feasibility of artificial consciousness through the lens of + neuroscience + + +
+ Interactions with large language models have led to the suggestion that these +models may soon be conscious. From the perspective of neuroscience, this +position is difficult to defend. For one, the inputs to large language models +lack the embodied, embedded information content characteristic of our sensory +contact with the world around us. Secondly, the architecture of large language +models is missing key features of the thalamocortical system that have been +linked to conscious awareness in mammals. Finally, the evolutionary and +developmental trajectories that led to the emergence of living conscious +organisms arguably have no parallels in artificial systems as envisioned today. +The existence of living organisms depends on their actions, and their survival +is intricately linked to multi-level cellular, inter-cellular, and organismal +processes culminating in agency and consciousness. + +
+
+
+
+
+ + ♻ ☆ Reconstructing Spatiotemporal Data with C-VAEs + + +
+ The continuous representation of spatiotemporal data commonly relies on using +abstract data types, such as \textit{moving regions}, to represent entities +whose shape and position continuously change over time. Creating this +representation from discrete snapshots of real-world entities requires using +interpolation methods to compute in-between data representations and estimate +the position and shape of the object of interest at arbitrary temporal points. +Existing region interpolation methods often fail to generate smooth and +realistic representations of a region's evolution. However, recent advancements +in deep learning techniques have revealed the potential of deep models trained +on discrete observations to capture spatiotemporal dependencies through +implicit feature learning. + In this work, we explore the capabilities of Conditional Variational +Autoencoder (C-VAE) models to generate smooth and realistic representations of +the spatiotemporal evolution of moving regions. We evaluate our proposed +approach on a sparsely annotated dataset on the burnt area of a forest fire. We +apply compression operations to sample from the dataset and use the C-VAE model +and other commonly used interpolation algorithms to generate in-between region +representations. To evaluate the performance of the methods, we compare their +interpolation results with manually annotated data and regions generated by a +U-Net model. We also assess the quality of generated data considering temporal +consistency metrics. + The proposed C-VAE-based approach demonstrates competitive results in +geometric similarity metrics. It also exhibits superior temporal consistency, +suggesting that C-VAE models may be a viable alternative to modelling the +spatiotemporal evolution of 2D moving regions. + +
+
+ comment: Update acknowledgments to include published article information +
+
+
+
+
+ + ♻ ☆ Examining Policy Entropy of Reinforcement Learning Agents for + Personalization Tasks + + +
+ This effort is focused on examining the behavior of reinforcement learning +systems in personalization environments and detailing the differences in policy +entropy associated with the type of learning algorithm utilized. We demonstrate +that Policy Optimization agents often possess low-entropy policies during +training, which in practice results in agents prioritizing certain actions and +avoiding others. Conversely, we also show that Q-Learning agents are far less +susceptible to such behavior and generally maintain high-entropy policies +throughout training, which is often preferable in real-world applications. We +provide a wide range of numerical experiments as well as theoretical +justification to show that these differences in entropy are due to the type of +learning being employed. + +
+
+
+
+
+ + ♻ ☆ Wasserstein Geodesic Generator for Conditional Distributions + + +
+ Generating samples given a specific label requires estimating conditional +distributions. We derive a tractable upper bound of the Wasserstein distance +between conditional distributions to lay the theoretical groundwork to learn +conditional distributions. Based on this result, we propose a novel conditional +generation algorithm where conditional distributions are fully characterized by +a metric space defined by a statistical distance. We employ optimal transport +theory to propose the Wasserstein geodesic generator, a new conditional +generator that learns the Wasserstein geodesic. The proposed method learns both +conditional distributions for observed domains and optimal transport maps +between them. The conditional distributions given unobserved intermediate +domains are on the Wasserstein geodesic between conditional distributions given +two observed domain labels. Experiments on face images with light conditions as +domain labels demonstrate the efficacy of the proposed method. + +
+
+
+
+
+ + ♻ ☆ Counterpart Fairness -- Addressing Systematic between-group Differences + in Fairness Evaluation + + +
+ When using machine learning (ML) to aid decision-making, it is critical to +ensure that an algorithmic decision is fair, i.e., it does not discriminate +against specific individuals/groups, particularly those from underprivileged +populations. Existing group fairness methods require equal group-wise measures, +which however fails to consider systematic between-group differences. The +confounding factors, which are non-sensitive variables but manifest systematic +differences, can significantly affect fairness evaluation. To tackle this +problem, we believe that a fairness measurement should be based on the +comparison between counterparts (i.e., individuals who are similar to each +other with respect to the task of interest) from different groups, whose group +identities cannot be distinguished algorithmically by exploring confounding +factors. We have developed a propensity-score-based method for identifying +counterparts, which prevents fairness evaluation from comparing "oranges" with +"apples". In addition, we propose a counterpart-based statistical fairness +index, termed Counterpart-Fairness (CFair), to assess fairness of ML models. +Various empirical studies were conducted to validate the effectiveness of +CFair. We publish our code at \url{https://github.com/zhengyjo/CFair}. + +
+
+ comment: 25 pages, 6 figures, 16 tables +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning with Delayed, Composite, and Partially Anonymous + Reward + + +
+ We investigate an infinite-horizon average reward Markov Decision Process +(MDP) with delayed, composite, and partially anonymous reward feedback. The +delay and compositeness of rewards mean that rewards generated as a result of +taking an action at a given state are fragmented into different components, and +they are sequentially realized at delayed time instances. The partial anonymity +attribute implies that a learner, for each state, only observes the aggregate +of past reward components generated as a result of different actions taken at +that state, but realized at the observation instance. We propose an algorithm +named $\mathrm{DUCRL2}$ to obtain a near-optimal policy for this setting and +show that it achieves a regret bound of $\tilde{\mathcal{O}}\left(DS\sqrt{AT} + +d (SA)^3\right)$ where $S$ and $A$ are the sizes of the state and action +spaces, respectively, $D$ is the diameter of the MDP, $d$ is a parameter upper +bounded by the maximum reward delay, and $T$ denotes the time horizon. This +demonstrates the optimality of the bound in the order of $T$, and an additive +impact of the delay. + +
+
+
+
+
+ + ♻ ☆ Safety Filter Design for Neural Network Systems via Convex Optimization + + +
+ With the increase in data availability, it has been widely demonstrated that +neural networks (NN) can capture complex system dynamics precisely in a +data-driven manner. However, the architectural complexity and nonlinearity of +the NNs make it challenging to synthesize a provably safe controller. In this +work, we propose a novel safety filter that relies on convex optimization to +ensure safety for a NN system, subject to additive disturbances that are +capable of capturing modeling errors. Our approach leverages tools from NN +verification to over-approximate NN dynamics with a set of linear bounds, +followed by an application of robust linear MPC to search for controllers that +can guarantee robust constraint satisfaction. We demonstrate the efficacy of +the proposed framework numerically on a nonlinear pendulum system. + +
+
+ comment: This paper has been accepted to the 2023 62nd IEEE Conference on + Decision and Control (CDC) +
+
+
+
+
+ + ♻ ☆ End-to-End Reinforcement Learning of Koopman Models for Economic + Nonlinear Model Predictive Control + + +
+ (Economic) nonlinear model predictive control ((e)NMPC) requires dynamic +system models that are sufficiently accurate in all relevant state-space +regions. These models must also be computationally cheap enough to ensure +real-time tractability. Data-driven surrogate models for mechanistic models can +be used to reduce the computational burden of (e)NMPC; however, such models are +typically trained by system identification for maximum average prediction +accuracy on simulation samples and perform suboptimally as part of actual +(e)NMPC. We present a method for end-to-end reinforcement learning of dynamic +surrogate models for optimal performance in (e)NMPC applications, resulting in +predictive controllers that strike a favorable balance between control +performance and computational demand. We validate our method on two +applications derived from an established nonlinear continuous stirred-tank +reactor model. We compare the controller performance to that of MPCs utilizing +models trained by the prevailing maximum prediction accuracy paradigm, and +model-free neural network controllers trained using reinforcement learning. We +show that our method matches the performance of the model-free neural network +controllers while consistently outperforming models derived from system +identification. Additionally, we show that the MPC policies can react to +changes in the control setting without retraining. + +
+
+ comment: manuscript (18 pages, 7 figures, 5 tables), supplementary materials + (3 pages, 2 tables) +
+
+
+
+
+ + ♻ ☆ Large Language Models are Fixated by Red Herrings: Exploring Creative + Problem Solving and Einstellung Effect using the Only Connect Wall Dataset + + +
+ The quest for human imitative AI has been an enduring topic in AI research +since its inception. The technical evolution and emerging capabilities of the +latest cohort of large language models (LLMs) have reinvigorated the subject +beyond academia to the cultural zeitgeist. While recent NLP evaluation +benchmark tasks test some aspects of human-imitative behaviour (e.g., +BIG-bench's 'human-like behavior' tasks), few, if not none, examine creative +problem solving abilities. Creative problem solving in humans is a well-studied +topic in cognitive neuroscience with standardized tests that predominantly use +the ability to associate (heterogeneous) connections among clue words as a +metric for creativity. Exposure to misleading stimuli - distractors dubbed red +herrings - impede human performance in such tasks via the fixation effect and +Einstellung paradigm. In cognitive neuroscience studies, such fixations are +experimentally induced by pre-exposing participants to orthographically similar +incorrect words to subsequent word-fragments or clues. The popular British quiz +show Only Connect's Connecting Wall segment essentially mimics Mednick's Remote +Associates Test (RAT) formulation with built-in, deliberate red herrings, which +makes it an ideal proxy dataset to explore and study fixation effect and +Einstellung paradigm from cognitive neuroscience in LLMs. In this paper we +present the novel Only Connect Wall (OCW) dataset and report results from our +evaluation of selected pre-trained language models and LLMs on creative problem +solving tasks like grouping clue words by heterogeneous connections, and +identifying correct open knowledge domain connections in respective groups. We +synthetically generate two additional datasets: OCW-Randomized, OCW-WordNet to +further analyze our red-herrings hypothesis in language models. The code and +link to the dataset are available at https://github.com/TaatiTeam/OCW. + +
+
+ comment: V2: with added OCW-Randomized and OCW-WordNet results in Section 4.3 + (added). 22 pages with Appendix +
+
+
+
+
+ + ♻ ☆ Map-based Experience Replay: A Memory-Efficient Solution to Catastrophic + Forgetting in Reinforcement Learning + + +
+ Deep Reinforcement Learning agents often suffer from catastrophic forgetting, +forgetting previously found solutions in parts of the input space when training +on new data. Replay Memories are a common solution to the problem, +decorrelating and shuffling old and new training samples. They naively store +state transitions as they come in, without regard for redundancy. We introduce +a novel cognitive-inspired replay memory approach based on the +Grow-When-Required (GWR) self-organizing network, which resembles a map-based +mental model of the world. Our approach organizes stored transitions into a +concise environment-model-like network of state-nodes and transition-edges, +merging similar samples to reduce the memory size and increase pair-wise +distance among samples, which increases the relevancy of each sample. Overall, +our paper shows that map-based experience replay allows for significant memory +reduction with only small performance decreases. + +
+
+
+
+
+ + ♻ ☆ QuadConv: Quadrature-Based Convolutions with Applications to Non-Uniform + PDE Data Compression + + +
+ We present a new convolution layer for deep learning architectures which we +call QuadConv -- an approximation to continuous convolution via quadrature. Our +operator is developed explicitly for use on non-uniform, mesh-based data, and +accomplishes this by learning a continuous kernel that can be sampled at +arbitrary locations. Moreover, the construction of our operator admits an +efficient implementation which we detail and construct. As an experimental +validation of our operator, we consider the task of compressing partial +differential equation (PDE) simulation data from fixed meshes. We show that +QuadConv can match the performance of standard discrete convolutions on uniform +grid data by comparing a QuadConv autoencoder (QCAE) to a standard +convolutional autoencoder (CAE). Further, we show that the QCAE can maintain +this accuracy even on non-uniform data. In both cases, QuadConv also +outperforms alternative unstructured convolution methods such as graph +convolution. + +
+
+ comment: 26 pages, 18 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ SimFBO: Towards Simple, Flexible and Communication-efficient Federated + Bilevel Learning + + +
+ Federated bilevel optimization (FBO) has shown great potential recently in +machine learning and edge computing due to the emerging nested optimization +structure in meta-learning, fine-tuning, hyperparameter tuning, etc. However, +existing FBO algorithms often involve complicated computations and require +multiple sub-loops per iteration, each of which contains a number of +communication rounds. In this paper, we propose a simple and flexible FBO +framework named SimFBO, which is easy to implement without sub-loops, and +includes a generalized server-side aggregation and update for improving +communication efficiency. We further propose System-level heterogeneity robust +FBO (ShroFBO) as a variant of SimFBO with stronger resilience to heterogeneous +local computation. We show that SimFBO and ShroFBO provably achieve a linear +convergence speedup with partial client participation and client sampling +without replacement, as well as improved sample and communication complexities. +Experiments demonstrate the effectiveness of the proposed methods over existing +FBO algorithms. + +
+
+
+
+
+ + ♻ ☆ An active learning method for solving competitive multi-agent + decision-making and control problems + + +
+ We propose a scheme based on active learning to reconstruct private +strategies executed by a population of interacting agents and predict an exact +outcome of the underlying multi-agent interaction process, here identified as a +stationary action profile. We envision a scenario where an external observer, +endowed with a learning procedure, can make queries and observe the agents' +reactions through private action-reaction mappings, whose collective fixed +point corresponds to a stationary profile. By iteratively collecting sensible +data and updating parametric estimates of the action-reaction mappings, we +establish sufficient conditions to assess the asymptotic properties of the +proposed active learning methodology so that, if convergence happens, it can +only be towards a stationary action profile. This fact yields two main +consequences: i) learning locally-exact surrogates of the action-reaction +mappings allows the external observer to succeed in its prediction task, and +ii) working with assumptions so general that a stationary profile is not even +guaranteed to exist, the established sufficient conditions hence act also as +certificates for the existence of such a desirable profile. Extensive numerical +simulations involving typical competitive multi-agent control and +decision-making problems illustrate the practical effectiveness of the proposed +learning-based approach. + +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning for Wind and Energy Storage Coordination in + Wholesale Energy and Ancillary Service Markets + + +
+ Wind energy has been increasingly adopted to mitigate climate change. +However, the variability of wind energy causes wind curtailment, resulting in +considerable economic losses for wind farm owners. Wind curtailment can be +reduced using battery energy storage systems (BESS) as onsite backup sources. +Yet, this auxiliary role may significantly weaken the economic potential of +BESS in energy trading. Ideal BESS scheduling should balance onsite wind +curtailment reduction and market bidding, but practical implementation is +challenging due to coordination complexity and the stochastic nature of energy +prices and wind generation. We investigate the joint-market bidding strategy of +a co-located wind-battery system in the spot and Regulation Frequency Control +Ancillary Service markets. We propose a novel deep reinforcement learning-based +approach that decouples the system's market participation into two related +Markov decision processes for each facility, enabling the BESS to absorb onsite +wind curtailment while performing joint-market bidding to maximize overall +operational revenues. Using realistic wind farm data, we validated the +coordinated bidding strategy, with outcomes surpassing the optimization-based +benchmark in terms of higher revenue by approximately 25\% and more wind +curtailment reduction by 2.3 times. Our results show that joint-market bidding +can significantly improve the financial performance of wind-battery systems +compared to participating in each market separately. Simulations also show that +using curtailed wind generation as a power source for charging the BESS can +lead to additional financial gains. The successful implementation of our +algorithm would encourage co-location of generation and storage assets to +unlock wider system benefits. + +
+
+
+
+
+ + ♻ ☆ Enhancing Agent Communication and Learning through Action and Language + + +
+ We introduce a novel category of GC-agents capable of functioning as both +teachers and learners. Leveraging action-based demonstrations and +language-based instructions, these agents enhance communication efficiency. We +investigate the incorporation of pedagogy and pragmatism, essential elements in +human communication and goal achievement, enhancing the agents' teaching and +learning capabilities. Furthermore, we explore the impact of combining +communication modes (action and language) on learning outcomes, highlighting +the benefits of a multi-modal approach. + +
+
+ comment: IMOL workshop, Paris 2023 +
+
+
+
+
+ + ♻ ☆ Secure & Private Federated Neuroimaging + + +
+ The amount of biomedical data continues to grow rapidly. However, collecting +data from multiple sites for joint analysis remains challenging due to +security, privacy, and regulatory concerns. To overcome this challenge, we use +Federated Learning, which enables distributed training of neural network models +over multiple data sources without sharing data. Each site trains the neural +network over its private data for some time, then shares the neural network +parameters (i.e., weights, gradients) with a Federation Controller, which in +turn aggregates the local models, sends the resulting community model back to +each site, and the process repeats. Our Federated Learning architecture, +MetisFL, provides strong security and privacy. First, sample data never leaves +a site. Second, neural network parameters are encrypted before transmission and +the global neural model is computed under fully-homomorphic encryption. +Finally, we use information-theoretic methods to limit information leakage from +the neural model to prevent a curious site from performing model inversion or +membership attacks. We present a thorough evaluation of the performance of +secure, private federated learning in neuroimaging tasks, including for +predicting Alzheimer's disease and estimating BrainAGE from magnetic resonance +imaging (MRI) studies, in challenging, heterogeneous federated environments +where sites have different amounts of data and statistical distributions. + +
+
+ comment: 18 pages, 13 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ inTformer: A Time-Embedded Attention-Based Transformer for Crash + Likelihood Prediction at Intersections Using Connected Vehicle Data + + +
+ The real-time crash likelihood prediction model is an essential component of +the proactive traffic safety management system. Over the years, numerous +studies have attempted to construct a crash likelihood prediction model in +order to enhance traffic safety, but mostly on freeways. In the majority of the +existing studies, researchers have primarily employed a deep learning-based +framework to identify crash potential. Lately, Transformer has emerged as a +potential deep neural network that fundamentally operates through +attention-based mechanisms. Transformer has several functional benefits over +extant deep learning models such as LSTM, CNN, etc. Firstly, Transformer can +readily handle long-term dependencies in a data sequence. Secondly, +Transformers can parallelly process all elements in a data sequence during +training. Finally, a Transformer does not have the vanishing gradient issue. +Realizing the immense possibility of Transformers, this paper proposes +inTersection-Transformer (inTformer), a time-embedded attention-based +Transformer model that can effectively predict intersection crash likelihood in +real-time. The proposed model was evaluated using connected vehicle data +extracted from Signal Analytics Platform. Acknowledging the complex traffic +operation mechanism at intersection, this study developed zone-specific models +by dividing the intersection region into two distinct zones: +within-intersection and approach zone. The best inTformer models in +'within-intersection,' and 'approach' zone achieved a sensitivity of 73%, and +70%, respectively. The zone-level models were also compared to earlier studies +on crash likelihood prediction at intersections and with several established +deep learning models trained on the same connected vehicle dataset. + +
+
+ comment: 29 pages, 10 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ NNP/MM: Accelerating molecular dynamics simulations with machine + learning potentials and molecular mechanic + + +
+ Machine learning potentials have emerged as a means to enhance the accuracy +of biomolecular simulations. However, their application is constrained by the +significant computational cost arising from the vast number of parameters +compared to traditional molecular mechanics. To tackle this issue, we introduce +an optimized implementation of the hybrid method (NNP/MM), which combines +neural network potentials (NNP) and molecular mechanics (MM). This approach +models a portion of the system, such as a small molecule, using NNP while +employing MM for the remaining system to boost efficiency. By conducting +molecular dynamics (MD) simulations on various protein-ligand complexes and +metadynamics (MTD) simulations on a ligand, we showcase the capabilities of our +implementation of NNP/MM. It has enabled us to increase the simulation speed by +5 times and achieve a combined sampling of one microsecond for each complex, +marking the longest simulations ever reported for this class of simulation. + +
+
+
+
+
+ + ♻ ☆ No Fear of Classifier Biases: Neural Collapse Inspired Federated + Learning with Synthetic and Fixed Classifier ICCV 2023 + + +
+ Data heterogeneity is an inherent challenge that hinders the performance of +federated learning (FL). Recent studies have identified the biased classifiers +of local models as the key bottleneck. Previous attempts have used classifier +calibration after FL training, but this approach falls short in improving the +poor feature representations caused by training-time classifier biases. +Resolving the classifier bias dilemma in FL requires a full understanding of +the mechanisms behind the classifier. Recent advances in neural collapse have +shown that the classifiers and feature prototypes under perfect training +scenarios collapse into an optimal structure called simplex equiangular tight +frame (ETF). Building on this neural collapse insight, we propose a solution to +the FL's classifier bias problem by utilizing a synthetic and fixed ETF +classifier during training. The optimal classifier structure enables all +clients to learn unified and optimal feature representations even under +extremely heterogeneous data. We devise several effective modules to better +adapt the ETF structure in FL, achieving both high generalization and +personalization. Extensive experiments demonstrate that our method achieves +state-of-the-art performances on CIFAR-10, CIFAR-100, and Tiny-ImageNet. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Invariant Lipschitz Bandits: A Side Observation Approach + + +
+ Symmetry arises in many optimization and decision-making problems, and has +attracted considerable attention from the optimization community: By utilizing +the existence of such symmetries, the process of searching for optimal +solutions can be improved significantly. Despite its success in (offline) +optimization, the utilization of symmetries has not been well examined within +the online optimization settings, especially in the bandit literature. As such, +in this paper we study the invariant Lipschitz bandit setting, a subclass of +the Lipschitz bandits where the reward function and the set of arms are +preserved under a group of transformations. We introduce an algorithm named +\texttt{UniformMesh-N}, which naturally integrates side observations using +group orbits into the \texttt{UniformMesh} algorithm +(\cite{Kleinberg2005_UniformMesh}), which uniformly discretizes the set of +arms. Using the side-observation approach, we prove an improved regret upper +bound, which depends on the cardinality of the group, given that the group is +finite. We also prove a matching regret's lower bound for the invariant +Lipschitz bandit class (up to logarithmic factors). We hope that our work will +ignite further investigation of symmetry in bandit theory and sequential +decision-making theory in general. + +
+
+
+
+
+ + ♻ ☆ A noise-robust acoustic method for recognizing foraging activities of + grazing cattle + + +
+ Farmers must continuously improve their livestock production systems to +remain competitive in the growing dairy market. Precision livestock farming +technologies provide individualized monitoring of animals on commercial farms, +optimizing livestock production. Continuous acoustic monitoring is a widely +accepted sensing technique used to estimate the daily rumination and grazing +time budget of free-ranging cattle. However, typical environmental and natural +noises on pastures noticeably affect the performance limiting the practical +application of current acoustic methods. In this study, we present the +operating principle and generalization capability of an acoustic method called +Noise-Robust Foraging Activity Recognizer (NRFAR). The proposed method +determines foraging activity bouts by analyzing fixed-length segments of +identified jaw movement events produced during grazing and rumination. The +additive noise robustness of the NRFAR was evaluated for several +signal-to-noise ratios using stationary Gaussian white noise and four different +nonstationary natural noise sources. In noiseless conditions, NRFAR reached an +average balanced accuracy of 86.4%, outperforming two previous acoustic methods +by more than 7.5%. Furthermore, NRFAR performed better than previous acoustic +methods in 77 of 80 evaluated noisy scenarios (53 cases with p<0.05). NRFAR has +been shown to be effective in harsh free-ranging environments and could be used +as a reliable solution to improve pasture management and monitor the health and +welfare of dairy cows. The instrumentation and computational algorithms +presented in this publication are protected by a pending patent application: AR +P20220100910. Web demo available at: https://sinc.unl.edu.ar/web-demo/nrfar + +
+
+ comment: list of used audio-clips is available in the list_audio_clips.xlsx +
+
+
+
+
+ + ♻ ☆ Differentiable Constrained Imitation Learning for Robot Motion Planning + and Control IROS 2023 + + +
+ Motion planning and control are crucial components of robotics applications +like automated driving. Here, spatio-temporal hard constraints like system +dynamics and safety boundaries (e.g., obstacles) restrict the robot's motions. +Direct methods from optimal control solve a constrained optimization problem. +However, in many applications finding a proper cost function is inherently +difficult because of the weighting of partially conflicting objectives. On the +other hand, Imitation Learning (IL) methods such as Behavior Cloning (BC) +provide an intuitive framework for learning decision-making from offline +demonstrations and constitute a promising avenue for planning and control in +complex robot applications. Prior work primarily relied on soft constraint +approaches, which use additional auxiliary loss terms describing the +constraints. However, catastrophic safety-critical failures might occur in +out-of-distribution (OOD) scenarios. This work integrates the flexibility of IL +with hard constraint handling in optimal control. Our approach constitutes a +general framework for constraint robotic motion planning and control, as well +as traffic agent simulation, whereas we focus on mobile robot and automated +driving applications. Hard constraints are integrated into the learning problem +in a differentiable manner, via explicit completion and gradient-based +correction. Simulated experiments of mobile robot navigation and automated +driving provide evidence for the performance of the proposed method. + +
+
+ comment: International Conference on Intelligent Robots and Systems Agents4AD + Workshop, IROS 2023 +
+
+
+
+
+ + ♻ ☆ Sufficient Invariant Learning for Distribution Shift + + +
+ Machine learning algorithms have shown remarkable performance in diverse +applications. However, it is still challenging to guarantee performance in +distribution shifts when distributions of training and test datasets are +different. There have been several approaches to improve the performance in +distribution shift cases by learning invariant features across groups or +domains. However, we observe that the previous works only learn invariant +features partially. While the prior works focus on the limited invariant +features, we first raise the importance of the sufficient invariant features. +Since only training sets are given empirically, the learned partial invariant +features from training sets might not be present in the test sets under +distribution shift. Therefore, the performance improvement on distribution +shifts might be limited. In this paper, we argue that learning sufficient +invariant features from the training set is crucial for the distribution shift +case. Concretely, we newly observe the connection between a) sufficient +invariant features and b) flatness differences between groups or domains. +Moreover, we propose a new algorithm, Adaptive Sharpness-aware Group +Distributionally Robust Optimization (ASGDRO), to learn sufficient invariant +features across domains or groups. ASGDRO learns sufficient invariant features +by seeking common flat minima across all groups or domains. Therefore, ASGDRO +improves the performance on diverse distribution shift cases. Besides, we +provide a new simple dataset, Heterogeneous-CMNIST, to diagnose whether the +various algorithms learn sufficient invariant features. + +
+
+
+
+
+ + ♻ ☆ MKL-$L_{0/1}$-SVM + + +
+ This paper presents a Multiple Kernel Learning (abbreviated as MKL) framework +for the Support Vector Machine (SVM) with the $(0, 1)$ loss function. Some +KKT-like first-order optimality conditions are provided and then exploited to +develop a fast ADMM algorithm to solve the nonsmooth nonconvex optimization +problem. Numerical experiments on synthetic and real datasets show that the +performance of our MKL-$L_{0/1}$-SVM is comparable with the one of the leading +approaches called SimpleMKL developed by Rakotomamonjy, Bach, Canu, and +Grandvalet [Journal of Machine Learning Research, vol.~9, pp.~2491--2521, +2008]. + +
+
+ comment: 26 pages in the JMLR template, 4 figures, and 2 tables. arXiv admin + note: substantial text overlap with arXiv:2303.04445 +
+
+
+
+
+ + ♻ ☆ Multi-Atlas Segmentation and Spatial Alignment of the Human Embryo in + First Trimester 3D Ultrasound + + +
+ Segmentation and spatial alignment of ultrasound (US) imaging data acquired +in the in first trimester are crucial for monitoring human embryonic growth and +development throughout this crucial period of life. Current approaches are +either manual or semi-automatic and are therefore very time-consuming and prone +to errors. To automate these tasks, we propose a multi-atlas framework for +automatic segmentation and spatial alignment of the embryo using deep learning +with minimal supervision. Our framework learns to register the embryo to an +atlas, which consists of the US images acquired at a range of gestational age +(GA), segmented and spatially aligned to a predefined standard orientation. +From this, we can derive the segmentation of the embryo and put the embryo in +standard orientation. US images acquired at 8+0 till 12+6 weeks GA were used +and eight subjects were selected as atlas. We evaluated different fusion +strategies to incorporate multiple atlases: 1) training the framework using +atlas images from a single subject, 2) training the framework with data of all +available atlases and 3) ensembling of the frameworks trained per subject. To +evaluate the performance, we calculated the Dice score over the test set. We +found that training the framework using all available atlases outperformed +ensembling and gave similar results compared to the best of all frameworks +trained on a single subject. Furthermore, we found that selecting images from +the four atlases closest in GA out of all available atlases, regardless of the +individual quality, gave the best results with a median Dice score of 0.72. We +conclude that our framework can accurately segment and spatially align the +embryo in first trimester 3D US images and is robust for the variation in +quality that existed in the available atlases. + +
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://www.melba-journal.org/papers/2022:020.html +
+
+
+
+
+ + ♻ ☆ Heterogeneous Decentralized Machine Unlearning with Seed Model + Distillation + + +
+ As some recent information security legislation endowed users with +unconditional rights to be forgotten by any trained machine learning model, +personalized IoT service providers have to put unlearning functionality into +their consideration. The most straightforward method to unlearn users' +contribution is to retrain the model from the initial state, which is not +realistic in high throughput applications with frequent unlearning requests. +Though some machine unlearning frameworks have been proposed to speed up the +retraining process, they fail to match decentralized learning scenarios. In +this paper, we design a decentralized unlearning framework called HDUS, which +uses distilled seed models to construct erasable ensembles for all clients. +Moreover, the framework is compatible with heterogeneous on-device models, +representing stronger scalability in real-world applications. Extensive +experiments on three real-world datasets show that our HDUS achieves +state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ The Re-Label Method For Data-Centric Machine Learning + + +
+ In industry deep learning application, our manually labeled data has a +certain number of noisy data. To solve this problem and achieve more than 90 +score in dev dataset, we present a simple method to find the noisy data and +re-label the noisy data by human, given the model predictions as references in +human labeling. In this paper, we illustrate our idea for a broad set of deep +learning tasks, includes classification, sequence tagging, object detection, +sequence generation, click-through rate prediction. The experimental results +and human evaluation results verify our idea. + +
+
+
+
+
+ + ♻ ☆ A probabilistic Taylor expansion with Gaussian processes + + +
+ We study a class of Gaussian processes for which the posterior mean, for a +particular choice of data, replicates a truncated Taylor expansion of any +order. The data consist of derivative evaluations at the expansion point and +the prior covariance kernel belongs to the class of Taylor kernels, which can +be written in a certain power series form. We discuss and prove some results on +maximum likelihood estimation of parameters of Taylor kernels. The proposed +framework is a special case of Gaussian process regression based on data that +is orthogonal in the reproducing kernel Hilbert space of the covariance kernel. + +
+
+ comment: To appear in Transactions on Machine Learning Research +
+
+
+
+
+ + ♻ ☆ Benign Autoencoders + + +
+ Recent progress in Generative Artificial Intelligence (AI) relies on +efficient data representations, often featuring encoder-decoder architectures. +We formalize the mathematical problem of finding the optimal encoder-decoder +pair and characterize its solution, which we name the "benign autoencoder" +(BAE). We prove that BAE projects data onto a manifold whose dimension is the +optimal compressibility dimension of the generative problem. We highlight +surprising connections between BAE and several recent developments in AI, such +as conditional GANs, context encoders, stable diffusion, stacked autoencoders, +and the learning capabilities of generative models. As an illustration, we show +how BAE can find optimal, low-dimensional latent representations that improve +the performance of a discriminator under a distribution shift. By compressing +"malignant" data dimensions, BAE leads to smoother and more stable gradients. + +
+
+ comment: This paper replaces and subsumes arXiv:2110.08884 +
+
+
+
+
+ + ♻ ☆ MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large + Language Models + + +
+ LLMs usually exhibit limitations in their ability to incorporate new +knowledge, the generation of hallucinations, and the transparency of their +decision-making process. In this paper, we explore how to prompt LLMs with +knowledge graphs (KG), working as a remedy to engage LLMs with up-to-date +knowledge and elicit the reasoning pathways from LLMs. Specifically, we build a +prompting pipeline that endows LLMs with the capability of comprehending KG +inputs and inferring with a combined implicit knowledge and the retrieved +external knowledge. In addition, we investigate eliciting the mind map on which +LLMs perform the reasoning and generate the answers. It is identified that the +produced mind map exhibits the reasoning pathways of LLMs grounded on the +ontology of knowledge, hence bringing the prospects of probing and gauging LLM +inference in production. The experiments on three question & answering datasets +also show that MindMap prompting leads to a striking empirical gain. For +instance, prompting a GPT-3.5 with MindMap yields an overwhelming performance +over GPT-4 consistently. We also demonstrate that with structured facts +retrieved from KG, MindMap can outperform a series of +prompting-with-document-retrieval methods, benefiting from more accurate, +concise, and comprehensive knowledge from KGs. + +
+
+ comment: 7 pages, 8 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ On the Robustness of Random Forest Against Untargeted Data Poisoning: An + Ensemble-Based Approach + + +
+ Machine learning is becoming ubiquitous. From finance to medicine, machine +learning models are boosting decision-making processes and even outperforming +humans in some tasks. This huge progress in terms of prediction quality does +not however find a counterpart in the security of such models and corresponding +predictions, where perturbations of fractions of the training set (poisoning) +can seriously undermine the model accuracy. Research on poisoning attacks and +defenses received increasing attention in the last decade, leading to several +promising solutions aiming to increase the robustness of machine learning. +Among them, ensemble-based defenses, where different models are trained on +portions of the training set and their predictions are then aggregated, provide +strong theoretical guarantees at the price of a linear overhead. Surprisingly, +ensemble-based defenses, which do not pose any restrictions on the base model, +have not been applied to increase the robustness of random forest models. The +work in this paper aims to fill in this gap by designing and implementing a +novel hash-based ensemble approach that protects random forest against +untargeted, random poisoning attacks. An extensive experimental evaluation +measures the performance of our approach against a variety of attacks, as well +as its sustainability in terms of resource consumption and performance, and +compares it with a traditional monolithic model based on random forest. A final +discussion presents our main findings and compares our approach with existing +poisoning defenses targeting random forests. + +
+
+ comment: Accepted in IEEE Transactions on Sustainable Computing; 15 pages, 8 + figures +
+
+
+
+
+ + ♻ ☆ Interpolation for Robust Learning: Data Augmentation on Wasserstein + Geodesics + + +
+ We propose to study and promote the robustness of a model as per its +performance through the interpolation of training data distributions. +Specifically, (1) we augment the data by finding the worst-case Wasserstein +barycenter on the geodesic connecting subpopulation distributions of different +categories. (2) We regularize the model for smoother performance on the +continuous geodesic path connecting subpopulation distributions. (3) +Additionally, we provide a theoretical guarantee of robustness improvement and +investigate how the geodesic location and the sample size contribute, +respectively. Experimental validations of the proposed strategy on +\textit{four} datasets, including CIFAR-100 and ImageNet, establish the +efficacy of our method, e.g., our method improves the baselines' certifiable +robustness on CIFAR10 up to $7.7\%$, with $16.8\%$ on empirical robustness on +CIFAR-100. Our work provides a new perspective of model robustness through the +lens of Wasserstein geodesic-based interpolation with a practical off-the-shelf +strategy that can be combined with existing robust training methods. + +
+
+ comment: 34 pages, 3 figures, 18 tables +
+
+
+
+
+ + ♻ ☆ Deep Unfolding-based Weighted Averaging for Federated Learning in + Heterogeneous Environments + + +
+ Federated learning is a collaborative model training method that iterates +model updates by multiple clients and aggregation of the updates by a central +server. Device and statistical heterogeneity of participating clients cause +significant performance degradation so that an appropriate aggregation weight +should be assigned to each client in the aggregation phase of the server. To +adjust the aggregation weights, this paper employs deep unfolding, which is +known as the parameter tuning method that leverages both learning capability +using training data like deep learning and domain knowledge. This enables us to +directly incorporate the heterogeneity of the environment of interest into the +tuning of the aggregation weights. The proposed approach can be combined with +various federated learning algorithms. The results of numerical experiments +indicate that a higher test accuracy for unknown class-balanced data can be +obtained with the proposed method than that with conventional heuristic +weighting methods. The proposed method can handle large-scale learning models +with the aid of pretrained models such that it can perform practical real-world +tasks. Convergence rate of federated learning algorithms with the proposed +method is also provided in this paper. + +
+
+
+
+
+ + ♻ ☆ Functional optimal transport: map estimation and domain adaptation for + functional data + + +
+ We introduce a formulation of optimal transport problem for distributions on +function spaces, where the stochastic map between functional domains can be +partially represented in terms of an (infinite-dimensional) Hilbert-Schmidt +operator mapping a Hilbert space of functions to another. For numerous machine +learning tasks, data can be naturally viewed as samples drawn from spaces of +functions, such as curves and surfaces, in high dimensions. Optimal transport +for functional data analysis provides a useful framework of treatment for such +domains. { Since probability measures in infinite dimensional spaces generally +lack absolute continuity (that is, with respect to non-degenerate Gaussian +measures), the Monge map in the standard optimal transport theory for finite +dimensional spaces may not exist. Our approach to the optimal transport problem +in infinite dimensions is by a suitable regularization technique -- we restrict +the class of transport maps to be a Hilbert-Schmidt space of operators.} To +this end, we develop an efficient algorithm for finding the stochastic +transport map between functional domains and provide theoretical guarantees on +the existence, uniqueness, and consistency of our estimate for the +Hilbert-Schmidt operator. We validate our method on synthetic datasets and +examine the functional properties of the transport map. Experiments on +real-world datasets of robot arm trajectories further demonstrate the +effectiveness of our method on applications in domain adaptation. + +
+
+ comment: 48 pages, 10 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Revolutionizing Genomics with Reinforcement Learning Techniques + + +
+ In recent years, Reinforcement Learning (RL) has emerged as a powerful tool +for solving a wide range of problems, including decision-making and genomics. +The exponential growth of raw genomic data over the past two decades has +exceeded the capacity of manual analysis, leading to a growing interest in +automatic data analysis and processing. RL algorithms are capable of learning +from experience with minimal human supervision, making them well-suited for +genomic data analysis and interpretation. One of the key benefits of using RL +is the reduced cost associated with collecting labeled training data, which is +required for supervised learning. While there have been numerous studies +examining the applications of Machine Learning (ML) in genomics, this survey +focuses exclusively on the use of RL in various genomics research fields, +including gene regulatory networks (GRNs), genome assembly, and sequence +alignment. We present a comprehensive technical overview of existing studies on +the application of RL in genomics, highlighting the strengths and limitations +of these approaches. We then discuss potential research directions that are +worthy of future exploration, including the development of more sophisticated +reward functions as RL heavily depends on the accuracy of the reward function, +the integration of RL with other machine learning techniques, and the +application of RL to new and emerging areas in genomics research. Finally, we +present our findings and conclude by summarizing the current state of the field +and the future outlook for RL in genomics. + +
+
+
+
+
+ + ♻ ☆ On Formal Feature Attribution and Its Approximation + + +
+ Recent years have witnessed the widespread use of artificial intelligence +(AI) algorithms and machine learning (ML) models. Despite their tremendous +success, a number of vital problems like ML model brittleness, their fairness, +and the lack of interpretability warrant the need for the active developments +in explainable artificial intelligence (XAI) and formal ML model verification. +The two major lines of work in XAI include feature selection methods, e.g. +Anchors, and feature attribution techniques, e.g. LIME and SHAP. Despite their +promise, most of the existing feature selection and attribution approaches are +susceptible to a range of critical issues, including explanation unsoundness +and out-of-distribution sampling. A recent formal approach to XAI (FXAI) +although serving as an alternative to the above and free of these issues +suffers from a few other limitations. For instance and besides the scalability +limitation, the formal approach is unable to tackle the feature attribution +problem. Additionally, a formal explanation despite being formally sound is +typically quite large, which hampers its applicability in practical settings. +Motivated by the above, this paper proposes a way to apply the apparatus of +formal XAI to the case of feature attribution based on formal explanation +enumeration. Formal feature attribution (FFA) is argued to be advantageous over +the existing methods, both formal and non-formal. Given the practical +complexity of the problem, the paper then proposes an efficient technique for +approximating exact FFA. Finally, it offers experimental evidence of the +effectiveness of the proposed approximate FFA in comparison to the existing +feature attribution algorithms not only in terms of feature importance and but +also in terms of their relative order. + +
+
+
+
+
+ + ♻ ☆ Continuous-Time User Preference Modelling for Temporal Sets Prediction + + +
+ Given a sequence of sets, where each set has a timestamp and contains an +arbitrary number of elements, temporal sets prediction aims to predict the +elements in the subsequent set. Previous studies for temporal sets prediction +mainly focus on the modelling of elements and implicitly represent each user's +preference based on his/her interacted elements. However, user preferences are +often continuously evolving and the evolutionary trend cannot be fully captured +with the indirect learning paradigm of user preferences. To this end, we +propose a continuous-time user preference modelling framework for temporal sets +prediction, which explicitly models the evolving preference of each user by +maintaining a memory bank to store the states of all the users and elements. +Specifically, we first construct a universal sequence by arranging all the +user-set interactions in a non-descending temporal order, and then +chronologically learn from each user-set interaction. For each interaction, we +continuously update the memories of the related user and elements based on +their currently encoded messages and past memories. Moreover, we present a +personalized user behavior learning module to discover user-specific +characteristics based on each user's historical sequence, which aggregates the +previously interacted elements from dual perspectives according to the user and +elements. Finally, we develop a set-batch algorithm to improve the model +efficiency, which can create time-consistent batches in advance and achieve +3.5x and 3.0x speedups in the training and evaluation process on average. +Experiments on four real-world datasets demonstrate the superiority of our +approach over state-of-the-arts under both transductive and inductive settings. +The good interpretability of our method is also shown. + +
+
+ comment: Accepted by the TKDE journal +
+
+
+
+
+ + ♻ ☆ Symmetry-Preserving Program Representations for Learning Code Semantics + + +
+ Large Language Models (LLMs) have shown promise in automated program +reasoning, a crucial aspect of many security tasks. However, existing LLM +architectures for code are often borrowed from other domains like natural +language processing, raising concerns about their generalization and robustness +to unseen code. A key generalization challenge is to incorporate the knowledge +of code semantics, including control and data flow, into the LLM architectures. + Drawing inspiration from examples of convolution layers exploiting +translation symmetry, we explore how code symmetries can enhance LLM +architectures for program analysis and modeling. We present a rigorous +group-theoretic framework that formally defines code symmetries as +semantics-preserving transformations and provides techniques for precisely +reasoning about symmetry preservation within LLM architectures. Using this +framework, we introduce a novel variant of self-attention that preserves +program symmetries, demonstrating its effectiveness in generalization and +robustness through detailed experimental evaluations across different binary +and source code analysis tasks. Overall, our code symmetry framework offers +rigorous and powerful reasoning techniques that can guide the future +development of specialized LLMs for code and advance LLM-guided program +reasoning tasks. + +
+
+
+
+
+ + ♻ ☆ One-shot Ultra-high-Resolution Generative Adversarial Network That + Synthesizes 16K Images On A Single GPU + + +
+ We propose a one-shot ultra-high-resolution generative adversarial network +(OUR-GAN) framework that generates non-repetitive 16K (16, 384 x 8, 640) images +from a single training image and is trainable on a single consumer GPU. OUR-GAN +generates an initial image that is visually plausible and varied in shape at +low resolution, and then gradually increases the resolution by adding detail +through super-resolution. Since OUR-GAN learns from a real +ultra-high-resolution (UHR) image, it can synthesize large shapes with fine +details and long-range coherence, which is difficult to achieve with +conventional generative models that rely on the patch distribution learned from +relatively small images. OUR-GAN can synthesize high-quality 16K images with +12.5 GB of GPU memory and 4K images with only 4.29 GB as it synthesizes a UHR +image part by part through seamless subregion-wise super-resolution. +Additionally, OUR-GAN improves visual coherence while maintaining diversity by +applying vertical positional convolution. In experiments on the ST4K and RAISE +datasets, OUR-GAN exhibited improved fidelity, visual coherency, and diversity +compared with the baseline one-shot synthesis models. To the best of our +knowledge, OUR-GAN is the first one-shot image synthesizer that generates +non-repetitive UHR images on a single consumer GPU. The synthesized image +samples are presented at https://our-gan.github.io. + +
+
+ comment: 36 pages, 26 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Negative Evidential Deep Learning for Open-set Semi-supervised + Learning + + +
+ Semi-supervised learning (SSL) methods assume that labeled data, unlabeled +data and test data are from the same distribution. Open-set semi-supervised +learning (Open-set SSL) considers a more practical scenario, where unlabeled +data and test data contain new categories (outliers) not observed in labeled +data (inliers). Most previous works focused on outlier detection via binary +classifiers, which suffer from insufficient scalability and inability to +distinguish different types of uncertainty. In this paper, we propose a novel +framework, Adaptive Negative Evidential Deep Learning (ANEDL) to tackle these +limitations. Concretely, we first introduce evidential deep learning (EDL) as +an outlier detector to quantify different types of uncertainty, and design +different uncertainty metrics for self-training and inference. Furthermore, we +propose a novel adaptive negative optimization strategy, making EDL more +tailored to the unlabeled dataset containing both inliers and outliers. As +demonstrated empirically, our proposed method outperforms existing +state-of-the-art methods across four datasets. + +
+
+
+
+
+ + ♻ ☆ A Low Latency Adaptive Coding Spiking Framework for Deep Reinforcement + Learning + + +
+ In recent years, spiking neural networks (SNNs) have been used in +reinforcement learning (RL) due to their low power consumption and event-driven +features. However, spiking reinforcement learning (SRL), which suffers from +fixed coding methods, still faces the problems of high latency and poor +versatility. In this paper, we use learnable matrix multiplication to encode +and decode spikes, improving the flexibility of the coders and thus reducing +latency. Meanwhile, we train the SNNs using the direct training method and use +two different structures for online and offline RL algorithms, which gives our +model a wider range of applications. Extensive experiments have revealed that +our method achieves optimal performance with ultra-low latency (as low as 0.8% +of other SRL methods) and excellent energy efficiency (up to 5X the DNNs) in +different algorithms and different environments. + +
+
+
+
+
+ + ♻ ☆ Near-Optimal Nonconvex-Strongly-Convex Bilevel Optimization with Fully + First-Order Oracles + + +
+ Bilevel optimization has wide applications such as hyperparameter tuning, +neural architecture search, and meta-learning. Designing efficient algorithms +for bilevel optimization is challenging because the lower-level problem defines +a feasibility set implicitly via another optimization problem. In this work, we +consider one tractable case when the lower-level problem is strongly convex. +Recent works show that with a Hessian-vector product oracle, one can provably +find an $\epsilon$-first-order stationary point within +$\tilde{\mathcal{O}}(\epsilon^{-2})$ oracle calls. However, Hessian-vector +product may be inaccessible or expensive in practice. Kwon et al. (ICML 2023) +addressed this issue by proposing a first-order method that can achieve the +same goal at a slower rate of $\tilde{\mathcal{O}}(\epsilon^{-3})$. In this +work, we provide a tighter analysis demonstrating that this method can converge +at the near-optimal $\tilde {\mathcal{O}}(\epsilon^{-2})$ rate as second-order +methods. Our analysis further leads to simple first-order algorithms that +achieve similar convergence rates for finding second-order stationary points +and for distributed bilevel problems. + +
+
+ comment: slightly change the title +
+
+
+
+
+ + ♻ ☆ Federated Linear Bandit Learning via Over-the-Air Computation + + +
+ In this paper, we investigate federated contextual linear bandit learning +within a wireless system that comprises a server and multiple devices. Each +device interacts with the environment, selects an action based on the received +reward, and sends model updates to the server. The primary objective is to +minimize cumulative regret across all devices within a finite time horizon. To +reduce the communication overhead, devices communicate with the server via +over-the-air computation (AirComp) over noisy fading channels, where the +channel noise may distort the signals. In this context, we propose a customized +federated linear bandits scheme, where each device transmits an analog signal, +and the server receives a superposition of these signals distorted by channel +noise. A rigorous mathematical analysis is conducted to determine the regret +bound of the proposed scheme. Both theoretical analysis and numerical +experiments demonstrate the competitive performance of our proposed scheme in +terms of regret bounds in various settings. + +
+
+
+
+
+ + ♻ ☆ Incomplete Multi-View Weak-Label Learning + + +
+ A variety of modern applications exhibit multi-view multi-label learning, +where each sample has multi-view features, and multiple labels are correlated +via common views. Current methods usually fail to directly deal with the +setting where only a subset of features and labels are observed for each +sample, and ignore the presence of noisy views and imbalanced labels in +real-world problems. In this paper, we propose a novel method to overcome the +limitations. It jointly embeds incomplete views and weak labels into a +low-dimensional subspace with adaptive weights, and facilitates the difference +between embedding weight matrices via auto-weighted Hilbert-Schmidt +Independence Criterion (HSIC) to reduce the redundancy. Moreover, it adaptively +learns view-wise importance for embedding to detect noisy views, and mitigates +the label imbalance problem by focal loss. Experimental results on four +real-world multi-view multi-label datasets demonstrate the effectiveness of the +proposed method. + +
+
+ comment: 6 pages, 2 figures, conference +
+
+
+
+
+ + ♻ ☆ ProAgent: Building Proactive Cooperative AI with Large Language Models + + +
+ Building AIs with adaptive behaviors in human-AI cooperation stands as a +pivotal focus in AGI research. Current methods for developing cooperative +agents predominantly rely on learning-based methods, where policy +generalization heavily hinges on past interactions with specific teammates. +These approaches constrain the agent's capacity to recalibrate its strategy +when confronted with novel teammates. We propose \textbf{ProAgent}, a novel +framework that harnesses large language models (LLMs) to fashion a +\textit{pro}active \textit{agent} empowered with the ability to anticipate +teammates' forthcoming decisions and formulate enhanced plans for itself. +ProAgent excels at cooperative reasoning with the capacity to dynamically adapt +its behavior to enhance collaborative efforts with teammates. Moreover, the +ProAgent framework exhibits a high degree of modularity and interpretability, +facilitating seamless integration to address a wide array of coordination +scenarios. Experimental evaluations conducted within the framework of +\textit{Overcook-AI} unveil the remarkable performance superiority of ProAgent, +outperforming five methods based on self-play and population-based training in +cooperation with AI agents. Further, when cooperating with human proxy models, +its performance exhibits an average improvement exceeding 10\% compared to the +current state-of-the-art, COLE. The advancement was consistently observed +across diverse scenarios involving interactions with both AI agents of varying +characteristics and human counterparts. These findings inspire future research +for human-robot collaborations. For a hands-on demonstration, please visit +\url{https://pku-proagent.github.io}. + +
+
+
+
+
+ + ♻ ☆ BarlowRL: Barlow Twins for Data-Efficient Reinforcement Learning + + +
+ This paper introduces BarlowRL, a data-efficient reinforcement learning agent +that combines the Barlow Twins self-supervised learning framework with DER +(Data-Efficient Rainbow) algorithm. BarlowRL outperforms both DER and its +contrastive counterpart CURL on the Atari 100k benchmark. BarlowRL avoids +dimensional collapse by enforcing information spread to the whole space. This +helps RL algorithms to utilize uniformly spread state representation that +eventually results in a remarkable performance. The integration of Barlow Twins +with DER enhances data efficiency and achieves superior performance in the RL +tasks. BarlowRL demonstrates the potential of incorporating self-supervised +learning techniques to improve RL algorithms. + +
+
+
+
+
+ + ♻ ☆ QNet: A Quantum-native Sequence Encoder Architecture + + +
+ This work proposes QNet, a novel sequence encoder model that entirely +inferences on the quantum computer using a minimum number of qubits. Let $n$ +and $d$ represent the length of the sequence and the embedding size, +respectively. The dot-product attention mechanism requires a time complexity of +$O(n^2 \cdot d)$, while QNet has merely $O(n+d)$ quantum circuit depth. In +addition, we introduce ResQNet, a quantum-classical hybrid model composed of +several QNet blocks linked by residual connections, as an isomorph Transformer +Encoder. We evaluated our work on various natural language processing tasks, +including text classification, rating score prediction, and named entity +recognition. Our models exhibit compelling performance over classical +state-of-the-art models with a thousand times fewer parameters. In summary, +this work investigates the advantage of machine learning on near-term quantum +computers in sequential data by experimenting with natural language processing +tasks. + +
+
+ comment: QCE23: 2023 IEEE International Conference on Quantum Computing & + Engineering +
+
+
+
+
+ + ♻ ☆ Bayesian low-rank adaptation for large language models + + +
+ Parameter-efficient fine-tuning (PEFT) has emerged as a new paradigm for +cost-efficient fine-tuning of large language models (LLMs), with low-rank +adaptation (LoRA) being a widely adopted choice. However, fine-tuned LLMs often +become overconfident especially when fine-tuned on small datasets. Bayesian +methods, with their inherent ability to estimate uncertainty, serve as potent +tools to mitigate overconfidence and enhance calibration. In this work, we +introduce Laplace-LoRA, a straightforward yet effective Bayesian method, which +applies the Laplace approximation to the LoRA parameters and, considerably +boosts the calibration of fine-tuned LLMs. + +
+
+
+
+
+ + ♻ ☆ Energy Management of Multi-mode Plug-in Hybrid Electric Vehicle using + Multi-agent Deep Reinforcement Learning + + +
+ The recently emerging multi-mode plug-in hybrid electric vehicle (PHEV) +technology is one of the pathways making contributions to decarbonization, and +its energy management requires multiple-input and multipleoutput (MIMO) +control. At the present, the existing methods usually decouple the MIMO control +into singleoutput (MISO) control and can only achieve its local optimal +performance. To optimize the multi-mode vehicle globally, this paper studies a +MIMO control method for energy management of the multi-mode PHEV based on +multi-agent deep reinforcement learning (MADRL). By introducing a relevance +ratio, a hand-shaking strategy is proposed to enable two learning agents to +work collaboratively under the MADRL framework using the deep deterministic +policy gradient (DDPG) algorithm. Unified settings for the DDPG agents are +obtained through a sensitivity analysis of the influencing factors to the +learning performance. The optimal working mode for the hand-shaking strategy is +attained through a parametric study on the relevance ratio. The advantage of +the proposed energy management method is demonstrated on a software-in-the-loop +testing platform. The result of the study indicates that the learning rate of +the DDPG agents is the greatest influencing factor for learning performance. +Using the unified DDPG settings and a relevance ratio of 0.2, the proposed +MADRL system can save up to 4% energy compared to the single-agent learning +system and up to 23.54% energy compared to the conventional rule-based system. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Sampling from Rashomon Sets, and the Universality + of Langevin Diffusion for Convex Optimization COLT 2023 + + +
+ In this paper we provide an algorithmic framework based on Langevin diffusion +(LD) and its corresponding discretizations that allow us to simultaneously +obtain: i) An algorithm for sampling from the exponential mechanism, whose +privacy analysis does not depend on convexity and which can be stopped at +anytime without compromising privacy, and ii) tight uniform stability +guarantees for the exponential mechanism. As a direct consequence, we obtain +optimal excess empirical and population risk guarantees for (strongly) convex +losses under both pure and approximate differential privacy (DP). The framework +allows us to design a DP uniform sampler from the Rashomon set. Rashomon sets +are widely used in interpretable and robust machine learning, understanding +variable importance, and characterizing fairness. + +
+
+ comment: Appeared in COLT 2023. For ease of presentation, some results appear + in the previous version of this paper on arXiv (v3) that do not appear in + this version, nor are subsumed by results in this version. Please see Section + 1.4 for more details +
+
+
+
+
+ + ♻ ☆ Principles and Guidelines for Evaluating Social Robot Navigation + Algorithms + + +
+ A major challenge to deploying robots widely is navigation in human-populated +environments, commonly referred to as social robot navigation. While the field +of social navigation has advanced tremendously in recent years, the fair +evaluation of algorithms that tackle social navigation remains hard because it +involves not just robotic agents moving in static environments but also dynamic +human agents and their perceptions of the appropriateness of robot behavior. In +contrast, clear, repeatable, and accessible benchmarks have accelerated +progress in fields like computer vision, natural language processing and +traditional robot navigation by enabling researchers to fairly compare +algorithms, revealing limitations of existing solutions and illuminating +promising new directions. We believe the same approach can benefit social +navigation. In this paper, we pave the road towards common, widely accessible, +and repeatable benchmarking criteria to evaluate social robot navigation. Our +contributions include (a) a definition of a socially navigating robot as one +that respects the principles of safety, comfort, legibility, politeness, social +competency, agent understanding, proactivity, and responsiveness to context, +(b) guidelines for the use of metrics, development of scenarios, benchmarks, +datasets, and simulators to evaluate social navigation, and (c) a design of a +social navigation metrics framework to make it easier to compare results from +different simulators, robots and datasets. + +
+
+ comment: 42 pages, 11 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Scissorhands: Exploiting the Persistence of Importance Hypothesis for + LLM KV Cache Compression at Test Time + + +
+ Large language models(LLMs) have sparked a new wave of exciting AI +applications. Hosting these models at scale requires significant memory +resources. One crucial memory bottleneck for the deployment stems from the +context window. It is commonly recognized that model weights are memory hungry; +however, the size of key-value embedding stored during the generation process +(KV cache) can easily surpass the model size. The enormous size of the KV cache +puts constraints on the inference batch size, which is crucial for high +throughput inference workload. Inspired by an interesting observation of the +attention scores, we hypothesize the persistence of importance: only pivotal +tokens, which had a substantial influence at one step, will significantly +influence future generations. Based on our empirical verification and +theoretical analysis around this hypothesis, we propose Scissorhands, a system +that maintains the memory usage of the KV cache at a fixed budget without +finetuning the model. In essence, Scissorhands manages the KV cache by storing +the pivotal tokens with a higher probability. We validate that Scissorhands +reduces the inference memory usage of the KV cache by up to 5X without +compromising model quality. We further demonstrate that Scissorhands can be +combined with 4-bit quantization, traditionally used to compress model weights, +to achieve up to 20X compression. + +
+
+
+
+
+ + ♻ ☆ PRANC: Pseudo RAndom Networks for Compacting deep models + + +
+ We demonstrate that a deep model can be reparametrized as a linear +combination of several randomly initialized and frozen deep models in the +weight space. During training, we seek local minima that reside within the +subspace spanned by these random models (i.e., `basis' networks). Our +framework, PRANC, enables significant compaction of a deep model. The model can +be reconstructed using a single scalar `seed,' employed to generate the +pseudo-random `basis' networks, together with the learned linear mixture +coefficients. + In practical applications, PRANC addresses the challenge of efficiently +storing and communicating deep models, a common bottleneck in several +scenarios, including multi-agent learning, continual learners, federated +systems, and edge devices, among others. In this study, we employ PRANC to +condense image classification models and compress images by compacting their +associated implicit neural networks. PRANC outperforms baselines with a large +margin on image classification when compressing a deep model almost $100$ +times. Moreover, we show that PRANC enables memory-efficient inference by +generating layer-wise weights on the fly. The source code of PRANC is here: +\url{https://github.com/UCDvision/PRANC} + +
+
+
+
+
+ + ♻ ☆ Discovery and Exploitation of Generalized Network Effects + + +
+ Given a large graph with few node labels, how can we (a) identify whether +there is generalized network-effects (GNE) of the graph or not, (b) estimate +GNE to explain the interrelations among node classes, and (c) exploit GNE to +improve downstream tasks such as predicting the unknown labels accurately and +efficiently? The knowledge of GNE is valuable for various tasks like node +classification and targeted advertising. However, identifying and understanding +GNE such as homophily, heterophily or their combination is challenging in +real-world graphs due to limited availability of node labels and noisy edges. +We propose NetEffect, a graph mining approach to address the above issues, +enjoying the following properties: (i) Principled: a statistical test to +determine the presence of GNE in a graph with few node labels; (ii) General and +Explainable: a closed-form solution to estimate the specific type of GNE +observed; and (iii) Accurate and Scalable: the integration of GNE for accurate +and fast node classification. Applied on public, real-world graphs, NetEffect +discovers the unexpected absence of GNE in numerous graphs, which previously +thought to exhibit heterophily. Further, we show that incorporating GNE is +effective on node classification. On a large real-world graph with 1.6M nodes +and 22.3M edges, NetEffect achieves over 7 times speedup (14 minutes vs. 2 +hours) compared to most competitors. + +
+
+ comment: Under Submission +
+
+
+
+
+ + ♻ ☆ Mobilizing Personalized Federated Learning in Infrastructure-Less and + Heterogeneous Environments via Random Walk Stochastic ADMM + + +
+ This paper explores the challenges of implementing Federated Learning (FL) in +practical scenarios featuring isolated nodes with data heterogeneity, which can +only be connected to the server through wireless links in an +infrastructure-less environment. To overcome these challenges, we propose a +novel mobilizing personalized FL approach, which aims to facilitate mobility +and resilience. Specifically, we develop a novel optimization algorithm called +Random Walk Stochastic Alternating Direction Method of Multipliers (RWSADMM). +RWSADMM capitalizes on the server's random movement toward clients and +formulates local proximity among their adjacent clients based on hard +inequality constraints rather than requiring consensus updates or introducing +bias via regularization methods. To mitigate the computational burden on the +clients, an efficient stochastic solver of the approximated optimization +problem is designed in RWSADMM, which provably converges to the stationary +point almost surely in expectation. Our theoretical and empirical results +demonstrate the provable fast convergence and substantial accuracy improvements +achieved by RWSADMM compared to baseline methods, along with its benefits of +reduced communication costs and enhanced scalability. + +
+
+ comment: 28 pages, 7 figures, 3 tables, 1 algorithm. Proof details are + provided in the main body of the paper +
+
+
+
+
+ + ♻ ☆ When Do Annotator Demographics Matter? Measuring the Influence of + Annotator Demographics with the POPQUORN Dataset + + +
+ Annotators are not fungible. Their demographics, life experiences, and +backgrounds all contribute to how they label data. However, NLP has only +recently considered how annotator identity might influence their decisions. +Here, we present POPQUORN (the POtato-Prolific dataset for QUestion-Answering, +Offensiveness, text Rewriting, and politeness rating with demographic Nuance). +POPQUORN contains 45,000 annotations from 1,484 annotators, drawn from a +representative sample regarding sex, age, and race as the US population. +Through a series of analyses, we show that annotators' background plays a +significant role in their judgments. Further, our work shows that backgrounds +not previously considered in NLP (e.g., education), are meaningful and should +be considered. Our study suggests that understanding the background of +annotators and collecting labels from a demographically balanced pool of crowd +workers is important to reduce the bias of datasets. The dataset, annotator +background, and annotation interface are available at +https://github.com/Jiaxin-Pei/potato-prolific-dataset . + +
+
+
+
+
+ + ♻ ☆ Second-order Conditional Gradient Sliding + + +
+ Constrained second-order convex optimization algorithms are the method of +choice when a high accuracy solution to a problem is needed, due to their local +quadratic convergence. These algorithms require the solution of a constrained +quadratic subproblem at every iteration. We present the \emph{Second-Order +Conditional Gradient Sliding} (SOCGS) algorithm, which uses a projection-free +algorithm to solve the constrained quadratic subproblems inexactly. When the +feasible region is a polytope the algorithm converges quadratically in primal +gap after a finite number of linearly convergent iterations. Once in the +quadratic regime the SOCGS algorithm requires $\mathcal{O}(\log(\log +1/\varepsilon))$ first-order and Hessian oracle calls and $\mathcal{O}(\log +(1/\varepsilon) \log(\log1/\varepsilon))$ linear minimization oracle calls to +achieve an $\varepsilon$-optimal solution. This algorithm is useful when the +feasible region can only be accessed efficiently through a linear optimization +oracle, and computing first-order information of the function, although +possible, is costly. + +
+
+
+
+
+ + ♻ ☆ Theoretical Guarantees of Learning Ensembling Strategies with + Applications to Time Series Forecasting ICML 2023 + + +
+ Ensembling is among the most popular tools in machine learning (ML) due to +its effectiveness in minimizing variance and thus improving generalization. +Most ensembling methods for black-box base learners fall under the umbrella of +"stacked generalization," namely training an ML algorithm that takes the +inferences from the base learners as input. While stacking has been widely +applied in practice, its theoretical properties are poorly understood. In this +paper, we prove a novel result, showing that choosing the best stacked +generalization from a (finite or finite-dimensional) family of stacked +generalizations based on cross-validated performance does not perform "much +worse" than the oracle best. Our result strengthens and significantly extends +the results in Van der Laan et al. (2007). Inspired by the theoretical +analysis, we further propose a particular family of stacked generalizations in +the context of probabilistic forecasting, each one with a different sensitivity +for how much the ensemble weights are allowed to vary across items, timestamps +in the forecast horizon, and quantiles. Experimental results demonstrate the +performance gain of the proposed method. + +
+
+ comment: ICML 2023 +
+
+
+
+
+ + ♻ ☆ When to Show a Suggestion? Integrating Human Feedback in AI-Assisted + Programming + + +
+ AI powered code-recommendation systems, such as Copilot and CodeWhisperer, +provide code suggestions inside a programmer's environment (e.g., an IDE) with +the aim to improve their productivity. Since, in these scenarios, programmers +accept and reject suggestions, ideally, such a system should use this feedback +in furtherance of this goal. In this work, we leverage prior data of +programmers interacting with GitHub Copilot, a system used by millions of +programmers, to develop interventions that can save programmer time. We propose +a utility theory framework, which models this interaction with programmers and +decides which suggestions to display. Our framework Conditional suggestion +Display from Human Feedback (CDHF), relies on a cascade of models that predict +suggestion acceptance to selectively hide suggestions reducing both latency and +programmer verification time. Using data from 535 programmers, we perform a +retrospective evaluation of CDHF and show that we can avoid displaying a +significant fraction of suggestions that would have been rejected doing so +without total knowledge of the suggestions themselves. We further demonstrate +the importance of incorporating the programmer's latent unobserved state in +deciding when to display suggestions through ablations on user study data. +Finally, we showcase that using suggestion acceptance as a reward signal to +know which suggestions to display leads to reduced quality suggestions +indicating an unexpected pitfall. + +
+
+ comment: Previous version of these results can be found in arXiv:2210.14306 +
+
+
+
+
+ + ♻ ☆ Blockwise Parallel Transformer for Large Context Models + + +
+ Transformers have emerged as the cornerstone of state-of-the-art natural +language processing models, showcasing exceptional performance across a wide +range of AI applications. However, the memory demands posed by the +self-attention mechanism and the large feedforward network in Transformers +limit their ability to handle long sequences, thereby creating challenges for +tasks involving multiple long sequences or long-term dependencies. We present a +distinct approach, Blockwise Parallel Transformer (BPT), that leverages +blockwise computation of self-attention and feedforward network fusion to +minimize memory costs. By processing longer input sequences while maintaining +memory efficiency, BPT enables training sequences 32 times longer than vanilla +Transformers and up to 4 times longer than previous memory-efficient methods. +Extensive experiments on language modeling and reinforcement learning tasks +demonstrate the effectiveness of BPT in reducing memory requirements and +improving performance. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Towards enabling reliable immersive teleoperation through Digital Twin: + A UAV command and control use case + + +
+ This paper addresses the challenging problem of enabling reliable immersive +teleoperation in scenarios where an Unmanned Aerial Vehicle (UAV) is remotely +controlled by an operator via a cellular network. Such scenarios can be quite +critical particularly when the UAV lacks advanced equipment (e.g., Lidar-based +auto stop) or when the network is subject to some performance constraints +(e.g., delay). To tackle these challenges, we propose a novel architecture +leveraging Digital Twin (DT) technology to create a virtual representation of +the physical environment. This virtual environment accurately mirrors the +physical world, accounting for 3D surroundings, weather constraints, and +network limitations. To enhance teleoperation, the UAV in the virtual +environment is equipped with advanced features that maybe absent in the real +UAV. Furthermore, the proposed architecture introduces an intelligent logic +that utilizes information from both virtual and physical environments to +approve, deny, or correct actions initiated by the UAV operator. This +anticipatory approach helps to mitigate potential risks. Through a series of +field trials, we demonstrate the effectiveness of the proposed architecture in +significantly improving the reliability of UAV teleoperation. + +
+
+ comment: Accepted by IEEE Globecom 2023 +
+
+
+
+
+ + ☆ Priority-Centric Human Motion Generation in Discrete Latent Space ICCV2023 + + +
+ Text-to-motion generation is a formidable task, aiming to produce human +motions that align with the input text while also adhering to human +capabilities and physical laws. While there have been advancements in diffusion +models, their application in discrete spaces remains underexplored. Current +methods often overlook the varying significance of different motions, treating +them uniformly. It is essential to recognize that not all motions hold the same +relevance to a particular textual description. Some motions, being more salient +and informative, should be given precedence during generation. In response, we +introduce a Priority-Centric Motion Discrete Diffusion Model (M2DM), which +utilizes a Transformer-based VQ-VAE to derive a concise, discrete motion +representation, incorporating a global self-attention mechanism and a +regularization term to counteract code collapse. We also present a motion +discrete diffusion model that employs an innovative noise schedule, determined +by the significance of each motion token within the entire motion sequence. +This approach retains the most salient motions during the reverse diffusion +process, leading to more semantically rich and varied motions. Additionally, we +formulate two strategies to gauge the importance of motion tokens, drawing from +both textual and visual indicators. Comprehensive experiments on the HumanML3D +and KIT-ML datasets confirm that our model surpasses existing techniques in +fidelity and diversity, particularly for intricate textual descriptions. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ UMMAFormer: A Universal Multimodal-adaptive Transformer Framework for + Temporal Forgery Localization ACM MM 2023 + + +
+ The emergence of artificial intelligence-generated content (AIGC) has raised +concerns about the authenticity of multimedia content in various fields. +However, existing research for forgery content detection has focused mainly on +binary classification tasks of complete videos, which has limited applicability +in industrial settings. To address this gap, we propose UMMAFormer, a novel +universal transformer framework for temporal forgery localization (TFL) that +predicts forgery segments with multimodal adaptation. Our approach introduces a +Temporal Feature Abnormal Attention (TFAA) module based on temporal feature +reconstruction to enhance the detection of temporal differences. We also design +a Parallel Cross-Attention Feature Pyramid Network (PCA-FPN) to optimize the +Feature Pyramid Network (FPN) for subtle feature enhancement. To evaluate the +proposed method, we contribute a novel Temporal Video Inpainting Localization +(TVIL) dataset specifically tailored for video inpainting scenes. Our +experiments show that our approach achieves state-of-the-art performance on +benchmark datasets, including Lav-DF, TVIL, and Psynd, significantly +outperforming previous methods. The code and data are available at +https://github.com/ymhzyj/UMMAFormer/. + +
+
+ comment: 11 pages, 8 figures, 66 references. This paper has been accepted for + ACM MM 2023 +
+
+
+
+
+ + ☆ UniPT: Universal Parallel Tuning for Transfer Learning with Efficient + Parameter and Memory + + +
+ Fine-tuning pre-trained models has emerged as a powerful technique in +numerous domains, owing to its ability to leverage enormous pre-existing +knowledge and achieve remarkable performance on downstream tasks. However, +updating the parameters of entire networks is computationally intensive. +Although state-of-the-art parameter-efficient transfer learning (PETL) methods +significantly reduce the trainable parameters and storage demand, almost all of +them still need to back-propagate the gradients through large pre-trained +networks. This memory-extensive characteristic extremely limits the +applicability of PETL methods in real-world scenarios. To this end, we propose +a new memory-efficient PETL strategy, dubbed Universal Parallel Tuning (UniPT). +Specifically, we facilitate the transfer process via a lightweight learnable +parallel network, which consists of two modules: 1) A parallel interaction +module that decouples the inherently sequential connections and processes the +intermediate activations detachedly of the pre-trained network. 2) A confidence +aggregation module that learns optimal strategies adaptively for integrating +cross-layer features. We evaluate UniPT with different backbones (e.g., +VSE$\infty$, CLIP4Clip, Clip-ViL, and MDETR) on five challenging +vision-and-language tasks (i.e., image-text retrieval, video-text retrieval, +visual question answering, compositional question answering, and visual +grounding). Extensive ablations on ten datasets have validated that our UniPT +can not only dramatically reduce memory consumption and outperform the best +memory-efficient competitor, but also achieve higher performance than existing +PETL methods in a low-memory scenario on different architectures. Our code is +publicly available at: https://github.com/Paranioar/UniPT. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Parameter-Efficient Transfer Learning for Audio-Visual-Language Tasks + + +
+ The pretrain-then-finetune paradigm has been widely used in various unimodal +and multimodal tasks. However, finetuning all the parameters of a pre-trained +model becomes prohibitive as the model size grows exponentially. To address +this issue, the adapter mechanism that freezes the pre-trained model and only +finetunes a few extra parameters is introduced and delivers promising results. +Most studies on adapter architectures are dedicated to unimodal or bimodal +tasks, while the adapter architectures for trimodal tasks have not been +investigated yet. This paper introduces a novel Long Short-Term Trimodal +Adapter (LSTTA) approach for video understanding tasks involving audio, visual, +and language modalities. Based on the pre-trained from the three modalities, +the designed adapter module is inserted between the sequential blocks to model +the dense interactions across the three modalities. Specifically, LSTTA +consists of two types of complementary adapter modules, namely the long-term +semantic filtering module and the short-term semantic interaction module. The +long-term semantic filtering aims to characterize the temporal importance of +the video frames and the short-term semantic interaction module models local +interactions within short periods. Compared to previous state-of-the-art +trimodal learning methods pre-trained on a large-scale trimodal corpus, LSTTA +is more flexible and can inherit any powerful unimodal or bimodal models. +Experimental results on four typical trimodal learning tasks show the +effectiveness of LSTTA over existing state-of-the-art methods. + +
+
+
+
+
+ + ☆ Cross-Modal Retrieval: A Systematic Review of Methods and Future + Directions + + +
+ With the exponential surge in diverse multi-modal data, traditional uni-modal +retrieval methods struggle to meet the needs of users demanding access to data +from various modalities. To address this, cross-modal retrieval has emerged, +enabling interaction across modalities, facilitating semantic matching, and +leveraging complementarity and consistency between different modal data. +Although prior literature undertook a review of the cross-modal retrieval +field, it exhibits numerous deficiencies pertaining to timeliness, taxonomy, +and comprehensiveness. This paper conducts a comprehensive review of +cross-modal retrieval's evolution, spanning from shallow statistical analysis +techniques to vision-language pre-training models. Commencing with a +comprehensive taxonomy grounded in machine learning paradigms, mechanisms, and +models, the paper then delves deeply into the principles and architectures +underpinning existing cross-modal retrieval methods. Furthermore, it offers an +overview of widely used benchmarks, metrics, and performances. Lastly, the +paper probes the prospects and challenges that confront contemporary +cross-modal retrieval, while engaging in a discourse on potential directions +for further progress in the field. To facilitate the research on cross-modal +retrieval, we develop an open-source code repository at +https://github.com/BMC-SDNU/Cross-Modal-Retrieval. + +
+
+
+
+
+ + ♻ ☆ CLE Diffusion: Controllable Light Enhancement Diffusion Model + + +
+ Low light enhancement has gained increasing importance with the rapid +development of visual creation and editing. However, most existing enhancement +algorithms are designed to homogeneously increase the brightness of images to a +pre-defined extent, limiting the user experience. To address this issue, we +propose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a +novel diffusion framework to provide users with rich controllability. Built +with a conditional diffusion model, we introduce an illumination embedding to +let users control their desired brightness level. Additionally, we incorporate +the Segment-Anything Model (SAM) to enable user-friendly region +controllability, where users can click on objects to specify the regions they +wish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves +competitive performance regarding quantitative metrics, qualitative results, +and versatile controllability. Project page: +https://yuyangyin.github.io/CLEDiffusion/ + +
+
+ comment: Accepted In Proceedings of the 31st ACM International Conference on + Multimedia (MM' 23) +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 19 + +
+
+
+ + ☆ Generations of Knowledge Graphs: The Crazy Ideas and the Business Impact + + +
+ Knowledge Graphs (KGs) have been used to support a wide range of +applications, from web search to personal assistant. In this paper, we describe +three generations of knowledge graphs: entity-based KGs, which have been +supporting general search and question answering (e.g., at Google and Bing); +text-rich KGs, which have been supporting search and recommendations for +products, bio-informatics, etc. (e.g., at Amazon and Alibaba); and the emerging +integration of KGs and LLMs, which we call dual neural KGs. We describe the +characteristics of each generation of KGs, the crazy ideas behind the scenes in +constructing such KGs, and the techniques developed over time to enable +industry impact. In addition, we use KGs as examples to demonstrate a recipe to +evolve research ideas from innovations to production practice, and then to the +next level of innovations, to advance both science and business. + +
+
+
+
+
+ + ☆ Symbolic and Language Agnostic Large Language Models + + +
+ We argue that the relative success of large language models (LLMs) is not a +reflection on the symbolic vs. subsymbolic debate but a reflection on employing +an appropriate strategy of bottom-up reverse engineering of language at scale. +However, due to the subsymbolic nature of these models whatever knowledge these +systems acquire about language will always be buried in millions of +microfeatures (weights) none of which is meaningful on its own. Moreover, and +due to their stochastic nature, these models will often fail in capturing +various inferential aspects that are prevalent in natural language. What we +suggest here is employing the successful bottom-up strategy in a symbolic +setting, producing symbolic, language agnostic and ontologically grounded large +language models. + +
+
+ comment: 4 pages - draft. arXiv admin note: substantial text overlap with + arXiv:2306.00017 +
+
+
+
+
+ + ☆ Empowering Cross-lingual Abilities of Instruction-tuned Large Language + Models by Translation-following demonstrations + + +
+ The language ability of Large Language Models (LLMs) is often unbalanced +towards English because of the imbalance in the distribution of the +pre-training data. This disparity is demanded in further fine-tuning and +affecting the cross-lingual abilities of LLMs. In this paper, we propose to +empower Instructiontuned LLMs (It-LLMs) in languages other than English by +building semantic alignment between them. Hence, we propose CrossAlpaca, an +It-LLM with cross-lingual instruction-following and Translation-following +demonstrations to improve semantic alignment between languages. We validate our +approach on the multilingual Question Answering (QA) benchmarks XQUAD and MLQA +and adapted versions of MMLU and BBH. Our models, tested over six different +languages, outperform the It-LLMs tuned on monolingual data. The final results +show that instruction tuning on non-English data is not enough and that +semantic alignment can be further improved by Translation-following +demonstrations. + +
+
+
+
+
+ + ☆ Generative AI for Business Strategy: Using Foundation Models to Create + Business Strategy Tools + + +
+ Generative models (foundation models) such as LLMs (large language models) +are having a large impact on multiple fields. In this work, we propose the use +of such models for business decision making. In particular, we combine +unstructured textual data sources (e.g., news data) with multiple foundation +models (namely, GPT4, transformer-based Named Entity Recognition (NER) models +and Entailment-based Zero-shot Classifiers (ZSC)) to derive IT (information +technology) artifacts in the form of a (sequence of) signed business networks. +We posit that such artifacts can inform business stakeholders about the state +of the market and their own positioning as well as provide quantitative +insights into improving their future outlook. + +
+
+
+
+
+ + ☆ Towards Vision-Language Mechanistic Interpretability: A Causal Tracing + Tool for BLIP ICCV 2023 + + +
+ Mechanistic interpretability seeks to understand the neural mechanisms that +enable specific behaviors in Large Language Models (LLMs) by leveraging +causality-based methods. While these approaches have identified neural circuits +that copy spans of text, capture factual knowledge, and more, they remain +unusable for multimodal models since adapting these tools to the +vision-language domain requires considerable architectural changes. In this +work, we adapt a unimodal causal tracing tool to BLIP to enable the study of +the neural mechanisms underlying image-conditioned text generation. We +demonstrate our approach on a visual question answering dataset, highlighting +the causal relevance of later layer representations for all tokens. +Furthermore, we release our BLIP causal tracing tool as open source to enable +further experimentation in vision-language mechanistic interpretability by the +community. Our code is available at +https://github.com/vedantpalit/Towards-Vision-Language-Mechanistic-Interpretability. + +
+
+ comment: Final version for 5th Workshop on Closing the Loop Between Vision and + Language (CLVL) @ ICCV 2023. 4 pages, 5 figures +
+
+
+
+
+ + ☆ Examining User-Friendly and Open-Sourced Large GPT Models: A Survey on + Language, Multimodal, and Scientific GPT Models + + +
+ Generative pre-trained transformer (GPT) models have revolutionized the field +of natural language processing (NLP) with remarkable performance in various +tasks and also extend their power to multimodal domains. Despite their success, +large GPT models like GPT-4 face inherent limitations such as considerable +size, high computational requirements, complex deployment processes, and closed +development loops. These constraints restrict their widespread adoption and +raise concerns regarding their responsible development and usage. The need for +user-friendly, relatively small, and open-sourced alternative GPT models arises +from the desire to overcome these limitations while retaining high performance. +In this survey paper, we provide an examination of alternative open-sourced +models of large GPTs, focusing on user-friendly and relatively small models +that facilitate easier deployment and accessibility. Through this extensive +survey, we aim to equip researchers, practitioners, and enthusiasts with a +thorough understanding of user-friendly and relatively small open-sourced +models of large GPTs, their current state, challenges, and future research +directions, inspiring the development of more efficient, accessible, and +versatile GPT models that cater to the broader scientific community and advance +the field of general artificial intelligence. The source contents are +continuously updating in https://github.com/GPT-Alternatives/gpt_alternatives. + +
+
+
+
+
+ + ☆ Detecting Language Model Attacks with Perplexity + + +
+ A novel hack involving Large Language Models (LLMs) has emerged, leveraging +adversarial suffixes to trick models into generating perilous responses. This +method has garnered considerable attention from reputable media outlets such as +the New York Times and Wired, thereby influencing public perception regarding +the security and safety of LLMs. In this study, we advocate the utilization of +perplexity as one of the means to recognize such potential attacks. The +underlying concept behind these hacks revolves around appending an unusually +constructed string of text to a harmful query that would otherwise be blocked. +This maneuver confuses the protective mechanisms and tricks the model into +generating a forbidden response. Such scenarios could result in providing +detailed instructions to a malicious user for constructing explosives or +orchestrating a bank heist. Our investigation demonstrates the feasibility of +employing perplexity, a prevalent natural language processing metric, to detect +these adversarial tactics before generating a forbidden response. By evaluating +the perplexity of queries with and without such adversarial suffixes using an +open-source LLM, we discovered that nearly 90 percent were above a perplexity +of 1000. This contrast underscores the efficacy of perplexity for detecting +this type of exploit. + +
+
+
+
+
+ + ☆ Empowering Clinicians and Democratizing Data Science: Large Language + Models Automate Machine Learning for Clinical Studies + + +
+ A knowledge gap persists between Machine Learning (ML) developers (e.g., data +scientists) and practitioners (e.g., clinicians), hampering the full +utilization of ML for clinical data analysis. We investigated the potential of +the chatGPT Code Interpreter (CI), an extension of GPT-4, to bridge this gap +and perform ML analyses efficiently. Real-world clinical datasets and study +details from large trials across various medical specialties were presented to +chatGPT CI without specific guidance. ChatGPT CI autonomously developed +state-of-the-art ML models based on the original study's training data to +predict clinical outcomes such as cancer development, cancer progression, +disease complications, or biomarkers such as pathogenic gene sequences. +Strikingly, these ML models matched or outperformed their published +counterparts. We conclude that chatGPT CI offers a promising avenue to +democratize ML in medicine, making advanced analytics accessible to non-ML +experts and promoting broader applications in medical research and practice. + +
+
+
+
+
+ + ☆ Situated Natural Language Explanations ACL 2023 + + +
+ Natural language is among the most accessible tools for explaining decisions +to humans, and large pretrained language models (PLMs) have demonstrated +impressive abilities to generate coherent natural language explanations (NLE). +The existing NLE research perspectives do not take the audience into account. +An NLE can have high textual quality, but it might not accommodate audiences' +needs and preference. To address this limitation, we propose an alternative +perspective, situated NLE, including a situated generation framework and a +situated evaluation framework. On the generation side, we propose simple prompt +engineering methods that adapt the NLEs to situations. In human studies, the +annotators preferred the situated NLEs. On the evaluation side, we set up +automated evaluation scores in lexical, semantic, and pragmatic categories. The +scores can be used to select the most suitable prompts to generate NLEs. +Situated NLE provides a perspective to conduct further research on automatic +NLE generations. + +
+
+ comment: A previous version was presented in ACL 2023 NLRSE workshop +
+
+
+
+
+ + ☆ MedAlign: A Clinician-Generated Dataset for Instruction Following with + Electronic Medical Records + + +
+ The ability of large language models (LLMs) to follow natural language +instructions with human-level fluency suggests many opportunities in healthcare +to reduce administrative burden and improve quality of care. However, +evaluating LLMs on realistic text generation tasks for healthcare remains +challenging. Existing question answering datasets for electronic health record +(EHR) data fail to capture the complexity of information needs and +documentation burdens experienced by clinicians. To address these challenges, +we introduce MedAlign, a benchmark dataset of 983 natural language instructions +for EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes +clinician-written reference responses for 303 instructions, and provides 276 +longitudinal EHRs for grounding instruction-response pairs. We used MedAlign to +evaluate 6 general domain LLMs, having clinicians rank the accuracy and quality +of each LLM response. We found high error rates, ranging from 35% (GPT-4) to +68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k +context lengths for GPT-4. Finally, we report correlations between clinician +rankings and automated natural language generation metrics as a way to rank +LLMs without human review. We make MedAlign available under a research data use +agreement to enable LLM evaluations on tasks aligned with clinician needs and +preferences. + +
+
+
+
+
+ + ☆ An Analysis of On-the-fly Determinization of Finite-state Automata + + +
+ In this paper we establish an abstraction of on-the-fly determinization of +finite-state automata using transition monoids and demonstrate how it can be +applied to bound the asymptotics. We present algebraic and combinatorial +properties that are sufficient for a polynomial state complexity of the +deterministic automaton constructed on-the-fly. A special case of our findings +is that automata with many non-deterministic transitions almost always admit a +determinization of polynomial complexity. Furthermore, we extend our ideas to +weighted finite-state automata. + +
+
+
+
+
+ + ☆ Confucius: Iterative Tool Learning from Introspection Feedback by + Easy-to-Difficult Curriculum + + +
+ Augmenting large language models (LLMs) with external tools has emerged as a +promising approach to extending the capability of LLMs. Although some works +employ open-source LLMs for the tool learning task, most of them are trained in +a controlled environment in which LLMs only learn to execute the human-provided +tools. However, selecting proper tools from the large toolset is also a crucial +ability for the tool learning model to be applied in real-world applications. +Existing methods usually directly employ self-instruction methods to train the +model, which ignores differences in tool complexity. In this paper, we propose +the Confucius, a novel tool learning framework to train LLM to use complicated +tools in real-world scenarios, which contains two main phases: (1) We first +propose a multi-stage learning method to teach the LLM to use various tools +from an easy-to-difficult curriculum; (2) thenceforth, we propose the Iterative +Self-instruct from Introspective Feedback (ISIF) to dynamically construct the +dataset to improve the ability to use the complicated tool. Extensive +experiments conducted on both controlled and real-world settings demonstrate +the superiority of our tool learning framework in the real-world application +scenarios compared to both tuning-free (e.g. ChatGPT, Claude) and tuning-based +baselines (e.g. GPT4Tools). + +
+
+
+
+
+ + ☆ VoiceBank-2023: A Multi-Speaker Mandarin Speech Corpus for Constructing + Personalized TTS Systems for the Speech Impaired + + +
+ Services of personalized TTS systems for the Mandarin-speaking speech +impaired are rarely mentioned. Taiwan started the VoiceBanking project in 2020, +aiming to build a complete set of services to deliver personalized Mandarin TTS +systems to amyotrophic lateral sclerosis patients. This paper reports the +corpus design, corpus recording, data purging and correction for the corpus, +and evaluations of the developed personalized TTS systems, for the VoiceBanking +project. The developed corpus is named after the VoiceBank-2023 speech corpus +because of its release year. The corpus contains 29.78 hours of utterances with +prompts of short paragraphs and common phrases spoken by 111 native Mandarin +speakers. The corpus is labeled with information about gender, degree of speech +impairment, types of users, transcription, SNRs, and speaking rates. The +VoiceBank-2023 is available by request for non-commercial use and welcomes all +parties to join the VoiceBanking project to improve the services for the speech +impaired. + +
+
+ comment: submitted to 26th International Conference of the ORIENTAL-COCOSDA +
+
+
+
+
+ + ♻ ☆ A Study on Robustness and Reliability of Large Language Model Code + Generation + + +
+ Recently, the large language models (LLMs) have shown extraordinary ability +in understanding natural language and generating programming code. It has been +a common practice of software engineers to consult LLMs when encountering +coding questions. Although efforts have been made to avoid syntax errors and +align the code with the intended semantics, the reliability and robustness of +the code generationfrom LLMs have not yet been thoroughly studied. The +executable code is not equivalent to the reliable and robust code, especially +in the context of real-world software development. The misuse of APIs in the +generated code could lead to severe problem, such as resource leaks, program +crashes. To make things worse, the users of LLM code generation services are +actually the developers that are most vulnerable to these code that seems right +-- They are always novice developers that are not familiar with the APIs that +LLMs generate code for them. Therefore, they could hardly tell the misuse in +the code generated by LLMs, which further facilitates the incorrect code +applied in real-world software. Existing code evaluation benchmark and datasets +focus on crafting small tasks such as programming questions in coding +interviews, which however deviates from the problem that developers would ask +LLM for real-world coding help. To fill the missing piece, in this work, we +propose a dataset RobustAPI for evaluating the reliability and robustness of +code generated by LLMs. We collect 1208 coding questions from StackOverflow on +24 representative Java APIs. We summarize thecommon misuse patterns of these +APIs and evaluate them oncurrent popular LLMs. The evaluation results show that +evenfor GPT-4, 62% of the generated code contains API misuses,which would cause +unexpected consequences if the code isintroduced into real-world software. + +
+
+
+
+
+ + ♻ ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential +requirement for satisfying user experience. Different from traditional +e-commerce platforms that offer products, users search on life service +platforms such as Meituan mainly for product providers, which usually have +abundant structured information, e.g. name, address, category, thousands of +products. Modeling search relevance with these rich structured contents is +challenging due to the following issues: (1) there is language distribution +discrepancy among different fields of structured document, making it difficult +to directly adopt off-the-shelf pretrained language model based methods like +BERT. (2) different fields usually have different importance and their length +vary greatly, making it difficult to extract document information helpful for +relevance matching. + To tackle these issues, in this paper we propose a novel two-stage +pretraining and matching architecture for relevance matching with rich +structured documents. At pretraining stage, we propose an effective pretraining +method that employs both query and multiple fields of document as inputs, +including an effective information compression method for lengthy fields. At +relevance matching stage, a novel matching method is proposed by leveraging +domain knowledge in search query to generate more effective document +representations for relevance scoring. Extensive offline experiments and online +A/B tests on millions of users verify that the proposed architectures +effectively improve the performance of relevance modeling. The model has +already been deployed online, serving the search traffic of Meituan for over a +year. + +
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ♻ ☆ RestGPT: Connecting Large Language Models with Real-World RESTful APIs + + +
+ Tool-augmented large language models (LLMs) have achieved remarkable progress +in tackling a broad range of tasks. However, existing methods are mainly +restricted to specifically designed tools and fail to fulfill complex +instructions, having great limitations when confronted with real-world +scenarios. In this paper, we explore a more realistic scenario by connecting +LLMs with RESTful APIs, which adhere to the widely adopted REST software +architectural style for web service development. To address the practical +challenges of tackling complex instructions, we propose RestGPT, which exploits +the power of LLMs and conducts a coarse-to-fine online planning mechanism to +enhance the abilities of task decomposition and API selection. RestGPT also +contains an API executor tailored for calling RESTful APIs, which can +meticulously formulate parameters and parse API responses. To fully evaluate +the performance of RestGPT, we propose RestBench, a high-quality benchmark +which consists of two real-world scenarios and human-annotated instructions +with gold solution paths. Experiments show that RestGPT is able to achieve +impressive results in complex tasks and has strong robustness, which paves a +new way towards AGI. RestGPT and RestBench is publicly available at +https://restgpt.github.io/. + +
+
+ comment: Add RestBench to evaluate RestGPT +
+
+
+
+
+ + ♻ ☆ TwHIN-BERT: A Socially-Enriched Pre-trained Language Model for + Multilingual Tweet Representations at Twitter + + +
+ Pre-trained language models (PLMs) are fundamental for natural language +processing applications. Most existing PLMs are not tailored to the noisy +user-generated text on social media, and the pre-training does not factor in +the valuable social engagement logs available in a social network. We present +TwHIN-BERT, a multilingual language model productionized at Twitter, trained on +in-domain data from the popular social network. TwHIN-BERT differs from prior +pre-trained language models as it is trained with not only text-based +self-supervision, but also with a social objective based on the rich social +engagements within a Twitter heterogeneous information network (TwHIN). Our +model is trained on 7 billion tweets covering over 100 distinct languages, +providing a valuable representation to model short, noisy, user-generated text. +We evaluate our model on various multilingual social recommendation and +semantic understanding tasks and demonstrate significant metric improvement +over established pre-trained language models. We open-source TwHIN-BERT and our +curated hashtag prediction and social engagement benchmark datasets to the +research community. + +
+
+
+
+
+ + ♻ ☆ Event knowledge in large language models: the gap between the impossible + and the unlikely + + +
+ Word co-occurrence patterns in language corpora contain a surprising amount +of conceptual knowledge. Large language models (LLMs), trained to predict words +in context, leverage these patterns to achieve impressive performance on +diverse semantic tasks requiring world knowledge. An important but understudied +question about LLMs' semantic abilities is whether they acquire generalized +knowledge of common events. Here, we test whether five pre-trained LLMs (from +2018's BERT to 2023's MPT) assign higher likelihood to plausible descriptions +of agent-patient interactions than to minimally different implausible versions +of the same event. Using three curated sets of minimal sentence pairs (total +n=1,215), we found that pre-trained LLMs possess substantial event knowledge, +outperforming other distributional language models. In particular, they almost +always assign higher likelihood to possible vs. impossible events (The teacher +bought the laptop vs. The laptop bought the teacher). However, LLMs show less +consistent preferences for likely vs. unlikely events (The nanny tutored the +boy vs. The boy tutored the nanny). In follow-up analyses, we show that (i) LLM +scores are driven by both plausibility and surface-level sentence features, +(ii) LLM scores generalize well across syntactic variants (active vs. passive +constructions) but less well across semantic variants (synonymous sentences), +(iii) some LLM errors mirror human judgment ambiguity, and (iv) sentence +plausibility serves as an organizing dimension in internal LLM representations. +Overall, our results show that important aspects of event knowledge naturally +emerge from distributional linguistic patterns, but also highlight a gap +between representations of possible/impossible and likely/unlikely events. + +
+
+ comment: The two lead authors have contributed equally to this work +
+
+
+
+
+ + ♻ ☆ Protecting Language Generation Models via Invisible Watermarking ICML 2023 + + +
+ Language generation models have been an increasingly powerful enabler for +many applications. Many such models offer free or affordable API access, which +makes them potentially vulnerable to model extraction attacks through +distillation. To protect intellectual property (IP) and ensure fair use of +these models, various techniques such as lexical watermarking and synonym +replacement have been proposed. However, these methods can be nullified by +obvious countermeasures such as "synonym randomization". To address this issue, +we propose GINSEW, a novel method to protect text generation models from being +stolen through distillation. The key idea of our method is to inject secret +signals into the probability vector of the decoding steps for each target +token. We can then detect the secret message by probing a suspect model to tell +if it is distilled from the protected one. Experimental results show that +GINSEW can effectively identify instances of IP infringement with minimal +impact on the generation quality of protected APIs. Our method demonstrates an +absolute improvement of 19 to 29 points on mean average precision (mAP) in +detecting suspects compared to previous methods against watermark removal +attacks. + +
+
+ comment: ICML 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 48 + +
+
+
+ + ☆ High-Resolution Document Shadow Removal via A Large-Scale Real-World + Dataset and A Frequency-Aware Shadow Erasing Net + + +
+ Shadows often occur when we capture the documents with casual equipment, +which influences the visual quality and readability of the digital copies. +Different from the algorithms for natural shadow removal, the algorithms in +document shadow removal need to preserve the details of fonts and figures in +high-resolution input. Previous works ignore this problem and remove the +shadows via approximate attention and small datasets, which might not work in +real-world situations. We handle high-resolution document shadow removal +directly via a larger-scale real-world dataset and a carefully designed +frequency-aware network. As for the dataset, we acquire over 7k couples of +high-resolution (2462 x 3699) images of real-world document pairs with various +samples under different lighting circumstances, which is 10 times larger than +existing datasets. As for the design of the network, we decouple the +high-resolution images in the frequency domain, where the low-frequency details +and high-frequency boundaries can be effectively learned via the carefully +designed network structure. Powered by our network and dataset, the proposed +method clearly shows a better performance than previous methods in terms of +visual quality and numerical results. The code, models, and dataset are +available at: https://github.com/CXH-Research/DocShadow-SD7K + +
+
+
+
+
+ + ☆ Post-Hoc Explainability of BI-RADS Descriptors in a Multi-task Framework + for Breast Cancer Detection and Segmentation SP + + +
+ Despite recent medical advancements, breast cancer remains one of the most +prevalent and deadly diseases among women. Although machine learning-based +Computer-Aided Diagnosis (CAD) systems have shown potential to assist +radiologists in analyzing medical images, the opaque nature of the +best-performing CAD systems has raised concerns about their trustworthiness and +interpretability. This paper proposes MT-BI-RADS, a novel explainable deep +learning approach for tumor detection in Breast Ultrasound (BUS) images. The +approach offers three levels of explanations to enable radiologists to +comprehend the decision-making process in predicting tumor malignancy. Firstly, +the proposed model outputs the BI-RADS categories used for BUS image analysis +by radiologists. Secondly, the model employs multi-task learning to +concurrently segment regions in images that correspond to tumors. Thirdly, the +proposed approach outputs quantified contributions of each BI-RADS descriptor +toward predicting the benign or malignant class using post-hoc explanations +with Shapley Values. + +
+
+ comment: 11 pages, 5 figures. Published at 2023 IEEE Workshop on MLSP +
+
+
+
+
+ + ☆ Exploring the Transfer Learning Capabilities of CLIP in Domain + Generalization for Diabetic Retinopathy + + +
+ Diabetic Retinopathy (DR), a leading cause of vision impairment, requires +early detection and treatment. Developing robust AI models for DR +classification holds substantial potential, but a key challenge is ensuring +their generalization in unfamiliar domains with varying data distributions. To +address this, our paper investigates cross-domain generalization, also known as +domain generalization (DG), within the context of DR classification. DG, a +challenging problem in the medical domain, is complicated by the difficulty of +gathering labeled data across different domains, such as patient demographics +and disease stages. Some recent studies have shown the effectiveness of using +CLIP to handle the DG problem in natural images. In this study, we investigate +CLIP's transfer learning capabilities and its potential for cross-domain +generalization in diabetic retinopathy (DR) classification. We carry out +comprehensive experiments to assess the efficacy and potential of CLIP in +addressing DG for DR classification. Further, we introduce a multi-modal +fine-tuning strategy named Context Optimization with Learnable Visual Tokens +(CoOpLVT), which enhances context optimization by conditioning on visual +features. Our findings demonstrate that the proposed method increases the +F1-score by 1.8% over the baseline, thus underlining its promise for effective +DG in DR classification. Our code is publicly available at +https://github.com/Sanoojan/CLIP-DRDG. + +
+
+
+
+
+ + ☆ SketchDreamer: Interactive Text-Augmented Creative Sketch Ideation BMVC 2023 + + +
+ Artificial Intelligence Generated Content (AIGC) has shown remarkable +progress in generating realistic images. However, in this paper, we take a step +"backward" and address AIGC for the most rudimentary visual modality of human +sketches. Our objective is on the creative nature of sketches, and that +creative sketching should take the form of an interactive process. We further +enable text to drive the sketch ideation process, allowing creativity to be +freely defined, while simultaneously tackling the challenge of "I can't +sketch". We present a method to generate controlled sketches using a +text-conditioned diffusion model trained on pixel representations of images. +Our proposed approach, referred to as SketchDreamer, integrates a +differentiable rasteriser of Bezier curves that optimises an initial input to +distil abstract semantic knowledge from a pretrained diffusion model. We +utilise Score Distillation Sampling to learn a sketch that aligns with a given +caption, which importantly enable both text and sketch to interact with the +ideation process. Our objective is to empower non-professional users to create +sketches and, through a series of optimisation processes, transform a narrative +into a storyboard by expanding the text prompt while making minor adjustments +to the sketch input. Through this work, we hope to aspire the way we create +visual content, democratise the creative process, and inspire further research +in enhancing human creativity in AIGC. The code is available at +\url{https://github.com/WinKawaks/SketchDreamer}. + +
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ☆ Score-Based Generative Models for PET Image Reconstruction + + +
+ Score-based generative models have demonstrated highly promising results for +medical image reconstruction tasks in magnetic resonance imaging or computed +tomography. However, their application to Positron Emission Tomography (PET) is +still largely unexplored. PET image reconstruction involves a variety of +challenges, including Poisson noise with high variance and a wide dynamic +range. To address these challenges, we propose several PET-specific adaptations +of score-based generative models. The proposed framework is developed for both +2D and 3D PET. In addition, we provide an extension to guided reconstruction +using magnetic resonance images. We validate the approach through extensive 2D +and 3D $\textit{in-silico}$ experiments with a model trained on +patient-realistic data without lesions, and evaluate on data without lesions as +well as out-of-distribution data with lesions. This demonstrates the proposed +method's robustness and significant potential for improved PET reconstruction. + +
+
+ comment: 35 pages, 16 figures, submitted to Journal of Machine Learning for + Biomedical Imaging (MELBA) +
+
+
+
+
+ + ☆ Towards Vision-Language Mechanistic Interpretability: A Causal Tracing + Tool for BLIP ICCV 2023 + + +
+ Mechanistic interpretability seeks to understand the neural mechanisms that +enable specific behaviors in Large Language Models (LLMs) by leveraging +causality-based methods. While these approaches have identified neural circuits +that copy spans of text, capture factual knowledge, and more, they remain +unusable for multimodal models since adapting these tools to the +vision-language domain requires considerable architectural changes. In this +work, we adapt a unimodal causal tracing tool to BLIP to enable the study of +the neural mechanisms underlying image-conditioned text generation. We +demonstrate our approach on a visual question answering dataset, highlighting +the causal relevance of later layer representations for all tokens. +Furthermore, we release our BLIP causal tracing tool as open source to enable +further experimentation in vision-language mechanistic interpretability by the +community. Our code is available at +https://github.com/vedantpalit/Towards-Vision-Language-Mechanistic-Interpretability. + +
+
+ comment: Final version for 5th Workshop on Closing the Loop Between Vision and + Language (CLVL) @ ICCV 2023. 4 pages, 5 figures +
+
+
+
+
+ + ☆ AIGC for Various Data Modalities: A Survey + + +
+ AI-generated content (AIGC) methods aim to produce text, images, videos, 3D +assets, and other media using AI algorithms. Due to its wide range of +applications and the demonstrated potential of recent works, AIGC developments +have been attracting a lot of attention recently, and AIGC methods have been +developed for various data modalities, such as image, video, text, 3D shape (as +voxels, point clouds, meshes, and neural implicit fields), 3D scene, 3D human +avatar (body and head), 3D motion, and audio -- each presenting different +characteristics and challenges. Furthermore, there have also been many +significant developments in cross-modality AIGC methods, where generative +methods can receive conditioning input in one modality and produce outputs in +another. Examples include going from various modalities to image, video, 3D +shape, 3D scene, 3D avatar (body and head), 3D motion (skeleton and avatar), +and audio modalities. In this paper, we provide a comprehensive review of AIGC +methods across different data modalities, including both single-modal and +cross-modality methods, highlighting the various challenges, representative +works, and recent technical directions in each setting. We also present +comparative results on several benchmark datasets in various modalities. +Moreover, we also discuss the challenges and potential future research +directions. + +
+
+
+
+
+ + ☆ Intergrated Segmentation and Detection Models for Dentex Challenge 2023 + + +
+ Dental panoramic x-rays are commonly used in dental diagnosing. With the +development of deep learning, auto detection of diseases from dental panoramic +x-rays can help dentists to diagnose diseases more efficiently.The Dentex +Challenge 2023 is a competition for automatic detection of abnormal teeth along +with their enumeration ids from dental panoramic x-rays. In this paper, we +propose a method integrating segmentation and detection models to detect +abnormal teeth as well as obtain their enumeration ids.Our codes are available +at https://github.com/xyzlancehe/DentexSegAndDet. + +
+
+
+
+
+ + ☆ A Unified Transformer-based Network for multimodal Emotion Recognition + + +
+ The development of transformer-based models has resulted in significant +advances in addressing various vision and NLP-based research challenges. +However, the progress made in transformer-based methods has not been +effectively applied to biosensing research. This paper presents a novel Unified +Biosensor-Vision Multi-modal Transformer-based (UBVMT) method to classify +emotions in an arousal-valence space by combining a 2D representation of an +ECG/PPG signal with the face information. To achieve this goal, we first +investigate and compare the unimodal emotion recognition performance of three +image-based representations of the ECG/PPG signal. We then present our UBVMT +network which is trained to perform emotion recognition by combining the 2D +image-based representation of the ECG/PPG signal and the facial expression +features. Our unified transformer model consists of homogeneous transformer +blocks that take as an input the 2D representation of the ECG/PPG signal and +the corresponding face frame for emotion representation learning with minimal +modality-specific design. Our UBVMT model is trained by reconstructing masked +patches of video frames and 2D images of ECG/PPG signals, and contrastive +modeling to align face and ECG/PPG data. Extensive experiments on the +MAHNOB-HCI and DEAP datasets show that our Unified UBVMT-based model produces +comparable results to the state-of-the-art techniques. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Sparse Sampling Transformer with Uncertainty-Driven Ranking for Unified + Removal of Raindrops and Rain Streaks ICCV'23 + + +
+ In the real world, image degradations caused by rain often exhibit a +combination of rain streaks and raindrops, thereby increasing the challenges of +recovering the underlying clean image. Note that the rain streaks and raindrops +have diverse shapes, sizes, and locations in the captured image, and thus +modeling the correlation relationship between irregular degradations caused by +rain artifacts is a necessary prerequisite for image deraining. This paper aims +to present an efficient and flexible mechanism to learn and model degradation +relationships in a global view, thereby achieving a unified removal of +intricate rain scenes. To do so, we propose a Sparse Sampling Transformer based +on Uncertainty-Driven Ranking, dubbed UDR-S2Former. Compared to previous +methods, our UDR-S2Former has three merits. First, it can adaptively sample +relevant image degradation information to model underlying degradation +relationships. Second, explicit application of the uncertainty-driven ranking +strategy can facilitate the network to attend to degradation features and +understand the reconstruction process. Finally, experimental results show that +our UDR-S2Former clearly outperforms state-of-the-art methods for all +benchmarks. + +
+
+ comment: Accepted by ICCV'23 +
+
+
+
+
+ + ☆ Unaligned 2D to 3D Translation with Conditional Vector-Quantized Code + Diffusion using Transformers ICCV 2023 + + +
+ Generating 3D images of complex objects conditionally from a few 2D views is +a difficult synthesis problem, compounded by issues such as domain gap and +geometric misalignment. For instance, a unified framework such as Generative +Adversarial Networks cannot achieve this unless they explicitly define both a +domain-invariant and geometric-invariant joint latent distribution, whereas +Neural Radiance Fields are generally unable to handle both issues as they +optimize at the pixel level. By contrast, we propose a simple and novel 2D to +3D synthesis approach based on conditional diffusion with vector-quantized +codes. Operating in an information-rich code space enables high-resolution 3D +synthesis via full-coverage attention across the views. Specifically, we +generate the 3D codes (e.g. for CT images) conditional on previously generated +3D codes and the entire codebook of two 2D views (e.g. 2D X-rays). Qualitative +and quantitative results demonstrate state-of-the-art performance over +specialized methods across varied evaluation criteria, including fidelity +metrics such as density, coverage, and distortion metrics for two complex +volumetric imagery datasets from in real-world scenarios. + +
+
+ comment: Camera-ready version for ICCV 2023 +
+
+
+
+
+ + ☆ Cheap Lunch for Medical Image Segmentation by Fine-tuning SAM on Few + Exemplars MICCAI + + +
+ The Segment Anything Model (SAM) has demonstrated remarkable capabilities of +scaled-up segmentation models, enabling zero-shot generalization across a +variety of domains. By leveraging large-scale foundational models as +pre-trained models, it is a natural progression to fine-tune SAM for specific +domains to further enhance performances. However, the adoption of foundational +models in the medical domain presents a challenge due to the difficulty and +expense of labeling sufficient data for adaptation within hospital systems. In +this paper, we introduce an efficient and practical approach for fine-tuning +SAM using a limited number of exemplars, making it suitable for such scenarios. +Our approach combines two established techniques from the literature: an +exemplar-guided synthesis module and the widely recognized Low-Rank Adaptation +(LoRA) fine-tuning strategy, serving as data-level and model-level attempts +respectively. Interestingly, our empirical findings suggest that SAM can be +effectively aligned within the medical domain even with few labeled data. We +validate our approach through experiments on brain tumor segmentation (BraTS) +and multi-organ CT segmentation (Synapse). The comprehensive results underscore +the feasibility and effectiveness of such an approach, paving the way for the +practical application of SAM in the medical domain. + +
+
+ comment: Accepted by Brain Lesion (BrainLes) workshop of International + Conference on Medical Image Computing and Computer Assisted Intervention + (MICCAI BrainLes 2023). 10 pages, 3 figures +
+
+
+
+
+ + ☆ Synergizing Contrastive Learning and Optimal Transport for 3D Point + Cloud Domain Adaptation + + +
+ Recently, the fundamental problem of unsupervised domain adaptation (UDA) on +3D point clouds has been motivated by a wide variety of applications in +robotics, virtual reality, and scene understanding, to name a few. The point +cloud data acquisition procedures manifest themselves as significant domain +discrepancies and geometric variations among both similar and dissimilar +classes. The standard domain adaptation methods developed for images do not +directly translate to point cloud data because of their complex geometric +nature. To address this challenge, we leverage the idea of multimodality and +alignment between distributions. We propose a new UDA architecture for point +cloud classification that benefits from multimodal contrastive learning to get +better class separation in both domains individually. Further, the use of +optimal transport (OT) aims at learning source and target data distributions +jointly to reduce the cross-domain shift and provide a better alignment. We +conduct a comprehensive empirical study on PointDA-10 and GraspNetPC-10 and +show that our method achieves state-of-the-art performance on GraspNetPC-10 +(with approx 4-12% margin) and best average performance on PointDA-10. Our +ablation studies and decision boundary analysis also validate the significance +of our contrastive learning module and OT alignment. + +
+
+
+
+
+ + ☆ Semi-Supervised Learning in the Few-Shot Zero-Shot Scenario + + +
+ Semi-Supervised Learning (SSL) leverages both labeled and unlabeled data to +improve model performance. Traditional SSL methods assume that labeled and +unlabeled data share the same label space. However, in real-world applications, +especially when the labeled training set is small, there may be classes that +are missing from the labeled set. Existing frameworks aim to either reject all +unseen classes (open-set SSL) or to discover unseen classes by partitioning an +unlabeled set during training (open-world SSL). In our work, we construct a +classifier for points from both seen and unseen classes. Our approach is based +on extending an existing SSL method, such as FlexMatch, by incorporating an +additional entropy loss. This enhancement allows our method to improve the +performance of any existing SSL method in the classification of both seen and +unseen classes. We demonstrate large improvement gains over state-of-the-art +SSL, open-set SSL, and open-world SSL methods, on two benchmark image +classification data sets, CIFAR-100 and STL-10. The gains are most pronounced +when the labeled data is severely limited (1-25 labeled examples per class). + +
+
+
+
+
+ + ☆ Semantic-aware Consistency Network for Cloth-changing Person + Re-Identification ACM MM 2023 + + +
+ Cloth-changing Person Re-Identification (CC-ReID) is a challenging task that +aims to retrieve the target person across multiple surveillance cameras when +clothing changes might happen. Despite recent progress in CC-ReID, existing +approaches are still hindered by the interference of clothing variations since +they lack effective constraints to keep the model consistently focused on +clothing-irrelevant regions. To address this issue, we present a Semantic-aware +Consistency Network (SCNet) to learn identity-related semantic features by +proposing effective consistency constraints. Specifically, we generate the +black-clothing image by erasing pixels in the clothing area, which explicitly +mitigates the interference from clothing variations. In addition, to fully +exploit the fine-grained identity information, a head-enhanced attention module +is introduced, which learns soft attention maps by utilizing the proposed +part-based matching loss to highlight head information. We further design a +semantic consistency loss to facilitate the learning of high-level +identity-related semantic features, forcing the model to focus on semantically +consistent cloth-irrelevant regions. By using the consistency constraint, our +model does not require any extra auxiliary segmentation module to generate the +black-clothing image or locate the head region during the inference stage. +Extensive experiments on four cloth-changing person Re-ID datasets (LTCC, PRCC, +Vc-Clothes, and DeepChange) demonstrate that our proposed SCNet makes +significant improvements over prior state-of-the-art approaches. Our code is +available at: https://github.com/Gpn-star/SCNet. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Depth self-supervision for single image novel view synthesis + + +
+ In this paper, we tackle the problem of generating a novel image from an +arbitrary viewpoint given a single frame as input. While existing methods +operating in this setup aim at predicting the target view depth map to guide +the synthesis, without explicit supervision over such a task, we jointly +optimize our framework for both novel view synthesis and depth estimation to +unleash the synergy between the two at its best. Specifically, a shared depth +decoder is trained in a self-supervised manner to predict depth maps that are +consistent across the source and target views. Our results demonstrate the +effectiveness of our approach in addressing the challenges of both tasks +allowing for higher-quality generated images, as well as more accurate depth +for the target viewpoint. + +
+
+
+
+
+ + ☆ Unified and Dynamic Graph for Temporal Character Grouping in Long Videos + + +
+ Video temporal character grouping locates appearing moments of major +characters within a video according to their identities. To this end, recent +works have evolved from unsupervised clustering to graph-based supervised +clustering. However, graph methods are built upon the premise of fixed affinity +graphs, bringing many inexact connections. Besides, they extract multi-modal +features with kinds of models, which are unfriendly to deployment. In this +paper, we present a unified and dynamic graph (UniDG) framework for temporal +character grouping. This is accomplished firstly by a unified representation +network that learns representations of multiple modalities within the same +space and still preserves the modality's uniqueness simultaneously. Secondly, +we present a dynamic graph clustering where the neighbors of different +quantities are dynamically constructed for each node via a cyclic matching +strategy, leading to a more reliable affinity graph. Thirdly, a progressive +association method is introduced to exploit spatial and temporal contexts among +different modalities, allowing multi-modal clustering results to be well fused. +As current datasets only provide pre-extracted features, we evaluate our UniDG +method on a collected dataset named MTCG, which contains each character's +appearing clips of face and body and speaking voice tracks. We also evaluate +our key components on existing clustering and retrieval datasets to verify the +generalization ability. Experimental results manifest that our method can +achieve promising results and outperform several state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Towards Unified Token Learning for Vision-Language Tracking + + +
+ In this paper, we present a simple, flexible and effective vision-language +(VL) tracking pipeline, termed \textbf{MMTrack}, which casts VL tracking as a +token generation task. Traditional paradigms address VL tracking task +indirectly with sophisticated prior designs, making them over-specialize on the +features of specific architectures or mechanisms. In contrast, our proposed +framework serializes language description and bounding box into a sequence of +discrete tokens. In this new design paradigm, all token queries are required to +perceive the desired target and directly predict spatial coordinates of the +target in an auto-regressive manner. The design without other prior modules +avoids multiple sub-tasks learning and hand-designed loss functions, +significantly reducing the complexity of VL tracking modeling and allowing our +tracker to use a simple cross-entropy loss as unified optimization objective +for VL tracking task. Extensive experiments on TNL2K, LaSOT, LaSOT$_{\rm{ext}}$ +and OTB99-Lang benchmarks show that our approach achieves promising results, +compared to other state-of-the-arts. + +
+
+
+
+
+ + ☆ Superpixels algorithms through network community detection + + +
+ Community detection is a powerful tool from complex networks analysis that +finds applications in various research areas. Several image segmentation +methods rely for instance on community detection algorithms as a black box in +order to compute undersegmentations, i.e. a small number of regions that +represent areas of interest of the image. However, to the best of our +knowledge, the efficiency of such an approach w.r.t. superpixels, that aim at +representing the image at a smaller level while preserving as much as possible +original information, has been neglected so far. The only related work seems to +be the one by Liu et. al. (IET Image Processing, 2022) that developed a +superpixels algorithm using a so-called modularity maximization approach, +leading to relevant results. We follow this line of research by studying the +efficiency of superpixels computed by state-of-the-art community detection +algorithms on a 4-connected pixel graph, so-called pixel-grid. We first detect +communities on such a graph and then apply a simple merging procedure that +allows to obtain the desired number of superpixels. As we shall see, such +methods result in the computation of relevant superpixels as emphasized by both +qualitative and quantitative experiments, according to different widely-used +metrics based on ground-truth comparison or on superpixels only. We observe +that the choice of the community detection algorithm has a great impact on the +number of communities and hence on the merging procedure. Similarly, small +variations on the pixel-grid may provide different results from both +qualitative and quantitative viewpoints. For the sake of completeness, we +compare our results with those of several state-of-the-art superpixels +algorithms as computed by Stutz et al. (Computer Vision and Image +Understanding, 2018). + +
+
+
+
+
+ + ☆ Rethinking Exemplars for Continual Semantic Segmentation in Endoscopy + Scenes: Entropy-based Mini-Batch Pseudo-Replay + + +
+ Endoscopy is a widely used technique for the early detection of diseases or +robotic-assisted minimally invasive surgery (RMIS). Numerous deep learning +(DL)-based research works have been developed for automated diagnosis or +processing of endoscopic view. However, existing DL models may suffer from +catastrophic forgetting. When new target classes are introduced over time or +cross institutions, the performance of old classes may suffer severe +degradation. More seriously, data privacy and storage issues may lead to the +unavailability of old data when updating the model. Therefore, it is necessary +to develop a continual learning (CL) methodology to solve the problem of +catastrophic forgetting in endoscopic image segmentation. To tackle this, we +propose a Endoscopy Continual Semantic Segmentation (EndoCSS) framework that +does not involve the storage and privacy issues of exemplar data. The framework +includes a mini-batch pseudo-replay (MB-PR) mechanism and a self-adaptive noisy +cross-entropy (SAN-CE) loss. The MB-PR strategy circumvents privacy and storage +issues by generating pseudo-replay images through a generative model. +Meanwhile, the MB-PR strategy can also correct the model deviation to the +replay data and current training data, which is aroused by the significant +difference in the amount of current and replay images. Therefore, the model can +perform effective representation learning on both new and old tasks. SAN-CE +loss can help model fitting by adjusting the model's output logits, and also +improve the robustness of training. Extensive continual semantic segmentation +(CSS) experiments on public datasets demonstrate that our method can robustly +and effectively address the catastrophic forgetting brought by class increment +in endoscopy scenes. The results show that our framework holds excellent +potential for real-world deployment in a streaming learning manner. + +
+
+ comment: Accepted by Computers in Biology and Medicine +
+
+
+
+
+ + ☆ A comprehensive review on Plant Leaf Disease detection using Deep + learning + + +
+ Leaf disease is a common fatal disease for plants. Early diagnosis and +detection is necessary in order to improve the prognosis of leaf diseases +affecting plant. For predicting leaf disease, several automated systems have +already been developed using different plant pathology imaging modalities. This +paper provides a systematic review of the literature on leaf disease-based +models for the diagnosis of various plant leaf diseases via deep learning. The +advantages and limitations of different deep learning models including Vision +Transformer (ViT), Deep convolutional neural network (DCNN), Convolutional +neural network (CNN), Residual Skip Network-based Super-Resolution for Leaf +Disease Detection (RSNSR-LDD), Disease Detection Network (DDN), and YOLO (You +only look once) are described in this review. The review also shows that the +studies related to leaf disease detection applied different deep learning +models to a number of publicly available datasets. For comparing the +performance of the models, different metrics such as accuracy, precision, +recall, etc. were used in the existing studies. + +
+
+
+
+
+ + ☆ Practical Edge Detection via Robust Collaborative Learning + + +
+ Edge detection, as a core component in a wide range of visionoriented tasks, +is to identify object boundaries and prominent edges in natural images. An edge +detector is desired to be both efficient and accurate for practical use. To +achieve the goal, two key issues should be concerned: 1) How to liberate deep +edge models from inefficient pre-trained backbones that are leveraged by most +existing deep learning methods, for saving the computational cost and cutting +the model size; and 2) How to mitigate the negative influence from noisy or +even wrong labels in training data, which widely exist in edge detection due to +the subjectivity and ambiguity of annotators, for the robustness and accuracy. +In this paper, we attempt to simultaneously address the above problems via +developing a collaborative learning based model, termed PEdger. The principle +behind our PEdger is that, the information learned from different training +moments and heterogeneous (recurrent and non recurrent in this work) +architectures, can be assembled to explore robust knowledge against noisy +annotations, even without the help of pre-training on extra data. Extensive +ablation studies together with quantitative and qualitative experimental +comparisons on the BSDS500 and NYUD datasets are conducted to verify the +effectiveness of our design, and demonstrate its superiority over other +competitors in terms of accuracy, speed, and model size. Codes can be found at +https://github.co/ForawardStar/PEdger. + +
+
+
+
+
+ + ☆ 4D Myocardium Reconstruction with Decoupled Motion and Shape Model ICCV2023 + + +
+ Estimating the shape and motion state of the myocardium is essential in +diagnosing cardiovascular diseases.However, cine magnetic resonance (CMR) +imaging is dominated by 2D slices, whose large slice spacing challenges +inter-slice shape reconstruction and motion acquisition.To address this +problem, we propose a 4D reconstruction method that decouples motion and shape, +which can predict the inter-/intra- shape and motion estimation from a given +sparse point cloud sequence obtained from limited slices. Our framework +comprises a neural motion model and an end-diastolic (ED) shape model. The +implicit ED shape model can learn a continuous boundary and encourage the +motion model to predict without the supervision of ground truth deformation, +and the motion model enables canonical input of the shape model by deforming +any point from any phase to the ED phase. Additionally, the constructed +ED-space enables pre-training of the shape model, thereby guiding the motion +model and addressing the issue of data scarcity. We propose the first 4D +myocardial dataset as we know and verify our method on the proposed, public, +and cross-modal datasets, showing superior reconstruction performance and +enabling various clinical applications. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Reconstructing Interacting Hands with Interaction Prior from Monocular + Images ICCV2023 + + +
+ Reconstructing interacting hands from monocular images is indispensable in +AR/VR applications. Most existing solutions rely on the accurate localization +of each skeleton joint. However, these methods tend to be unreliable due to the +severe occlusion and confusing similarity among adjacent hand parts. This also +defies human perception because humans can quickly imitate an interaction +pattern without localizing all joints. Our key idea is to first construct a +two-hand interaction prior and recast the interaction reconstruction task as +the conditional sampling from the prior. To expand more interaction states, a +large-scale multimodal dataset with physical plausibility is proposed. Then a +VAE is trained to further condense these interaction patterns as latent codes +in a prior distribution. When looking for image cues that contribute to +interaction prior sampling, we propose the interaction adjacency heatmap (IAH). +Compared with a joint-wise heatmap for localization, IAH assigns denser visible +features to those invisible joints. Compared with an all-in-one visible +heatmap, it provides more fine-grained local interaction information in each +interaction region. Finally, the correlations between the extracted features +and corresponding interaction codes are linked by the ViT module. Comprehensive +evaluations on benchmark datasets have verified the effectiveness of this +framework. The code and dataset are publicly available at +https://github.com/binghui-z/InterPrior_pytorch + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ U-SEANNet: A Simple, Efficient and Applied U-Shaped Network for + Diagnosing Nasal Diseases from Nasal Endoscopic Images + + +
+ Utilizing deep learning (DL) models to improve the early diagnosis of nasal +diseases from nasal endoscopic images holds paramount importance. However, the +lack of available datasets stymies advancements in this field. Furthermore, +existing models fail to strike a good trade-off between model diagnosis +performance, model complexity and parameter size, rendering them unsuitable for +practical application. To bridge these gaps, we created the first large-scale +nasal endoscopy dataset, named 7-NasEID, comprising 11,352 images that span six +nasal diseases and normal samples. Building on this, we proposed U-SEANNet, an +innovative architecture, underpinned by depth-wise separable convolutions. +Additionally, to augment its discernment capabilities for subtle variations in +input images, we further proposed the Global-Local Channel Feature Fusion +Module, enabling the U-SEANNet to focus salient channel features from both +global and local contexts. Notably, U-SEANNet's parameter size and GFLOPs are +only 0.78M and 0.21, respectively. Employing the 7-NasalEID, we conducted the +five-fold cross-validation on U-SEANNet, juxtaposing its performance against +seventeen renowned architectures. The experimental results suggest U-SEANNet as +the state-of-the-art (SOTA) model, achieves an accuracy of 93.58%, sensitivity +of 90.17%, and specificity of 91.27%. These findings demonstrate U-SEANNet's +prodigious potential for diagnosing nasal diseases in practical use, providing +the development of efficacy nasal diseases diagnosis tools with a new insight. + +
+
+
+
+
+ + ☆ Sparse3D: Distilling Multiview-Consistent Diffusion for Object + Reconstruction from Sparse Views + + +
+ Reconstructing 3D objects from extremely sparse views is a long-standing and +challenging problem. While recent techniques employ image diffusion models for +generating plausible images at novel viewpoints or for distilling pre-trained +diffusion priors into 3D representations using score distillation sampling +(SDS), these methods often struggle to simultaneously achieve high-quality, +consistent, and detailed results for both novel-view synthesis (NVS) and +geometry. In this work, we present Sparse3D, a novel 3D reconstruction method +tailored for sparse view inputs. Our approach distills robust priors from a +multiview-consistent diffusion model to refine a neural radiance field. +Specifically, we employ a controller that harnesses epipolar features from +input views, guiding a pre-trained diffusion model, such as Stable Diffusion, +to produce novel-view images that maintain 3D consistency with the input. By +tapping into 2D priors from powerful image diffusion models, our integrated +model consistently delivers high-quality results, even when faced with +open-world objects. To address the blurriness introduced by conventional SDS, +we introduce the category-score distillation sampling (C-SDS) to enhance +detail. We conduct experiments on CO3DV2 which is a multi-view dataset of +real-world objects. Both quantitative and qualitative evaluations demonstrate +that our approach outperforms previous state-of-the-art works on the metrics +regarding NVS and geometry reconstruction. + +
+
+
+
+
+ + ☆ A Novel Multi-scale Attention Feature Extraction Block for Aerial Remote + Sensing Image Classification + + +
+ Classification of very high-resolution (VHR) aerial remote sensing (RS) +images is a well-established research area in the remote sensing community as +it provides valuable spatial information for decision-making. Existing works on +VHR aerial RS image classification produce an excellent classification +performance; nevertheless, they have a limited capability to well-represent VHR +RS images having complex and small objects, thereby leading to performance +instability. As such, we propose a novel plug-and-play multi-scale attention +feature extraction block (MSAFEB) based on multi-scale convolution at two +levels with skip connection, producing discriminative/salient information at a +deeper/finer level. The experimental study on two benchmark VHR aerial RS image +datasets (AID and NWPU) demonstrates that our proposal achieves a +stable/consistent performance (minimum standard deviation of $0.002$) and +competent overall classification performance (AID: 95.85\% and NWPU: 94.09\%). + +
+
+ comment: The paper is under review in IEEE Geoscience and Remote Sensing + Letters Journal (IEEE-GRSL). This version may be deleted and/or updated based + on the journal's policy +
+
+
+
+
+ + ☆ FaceCoresetNet: Differentiable Coresets for Face Set Recognition + + +
+ In set-based face recognition, we aim to compute the most discriminative +descriptor from an unbounded set of images and videos showing a single person. +A discriminative descriptor balances two policies when aggregating information +from a given set. The first is a quality-based policy: emphasizing high-quality +and down-weighting low-quality images. The second is a diversity-based policy: +emphasizing unique images in the set and down-weighting multiple occurrences of +similar images as found in video clips which can overwhelm the set +representation. This work frames face-set representation as a differentiable +coreset selection problem. Our model learns how to select a small coreset of +the input set that balances quality and diversity policies using a learned +metric parameterized by the face quality, optimized end-to-end. The selection +process is a differentiable farthest-point sampling (FPS) realized by +approximating the non-differentiable Argmax operation with differentiable +sampling from the Gumbel-Softmax distribution of distances. The small coreset +is later used as queries in a self and cross-attention architecture to enrich +the descriptor with information from the whole set. Our model is +order-invariant and linear in the input set size. We set a new SOTA to set face +verification on the IJB-B and IJB-C datasets. Our code is publicly available. + +
+
+
+
+
+ + ☆ Nonrigid Object Contact Estimation With Regional Unwrapping Transformer ICCV2023 + + +
+ Acquiring contact patterns between hands and nonrigid objects is a common +concern in the vision and robotics community. However, existing learning-based +methods focus more on contact with rigid ones from monocular images. When +adopting them for nonrigid contact, a major problem is that the existing +contact representation is restricted by the geometry of the object. +Consequently, contact neighborhoods are stored in an unordered manner and +contact features are difficult to align with image cues. At the core of our +approach lies a novel hand-object contact representation called RUPs (Region +Unwrapping Profiles), which unwrap the roughly estimated hand-object surfaces +as multiple high-resolution 2D regional profiles. The region grouping strategy +is consistent with the hand kinematic bone division because they are the +primitive initiators for a composite contact pattern. Based on this +representation, our Regional Unwrapping Transformer (RUFormer) learns the +correlation priors across regions from monocular inputs and predicts +corresponding contact and deformed transformations. Our experiments demonstrate +that the proposed framework can robustly estimate the deformed degrees and +deformed transformations, which makes it suitable for both nonrigid and rigid +contact. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ DETDet: Dual Ensemble Teeth Detection + + +
+ The field of dentistry is in the era of digital transformation. Particularly, +artificial intelligence is anticipated to play a significant role in digital +dentistry. AI holds the potential to significantly assist dental practitioners +and elevate diagnostic accuracy. In alignment with this vision, the 2023 MICCAI +DENTEX challenge aims to enhance the performance of dental panoramic X-ray +diagnosis and enumeration through technological advancement. In response, we +introduce DETDet, a Dual Ensemble Teeth Detection network. DETDet encompasses +two distinct modules dedicated to enumeration and diagnosis. Leveraging the +advantages of teeth mask data, we employ Mask-RCNN for the enumeration module. +For the diagnosis module, we adopt an ensemble model comprising DiffusionDet +and DINO. To further enhance precision scores, we integrate a complementary +module to harness the potential of unlabeled data. The code for our approach +will be made accessible at https://github.com/Bestever-choi/Evident + +
+
+
+
+
+ + ☆ Bi-Modality Medical Image Synthesis Using Semi-Supervised Sequential + Generative Adversarial Networks + + +
+ In this paper, we propose a bi-modality medical image synthesis approach +based on sequential generative adversarial network (GAN) and semi-supervised +learning. Our approach consists of two generative modules that synthesize +images of the two modalities in a sequential order. A method for measuring the +synthesis complexity is proposed to automatically determine the synthesis order +in our sequential GAN. Images of the modality with a lower complexity are +synthesized first, and the counterparts with a higher complexity are generated +later. Our sequential GAN is trained end-to-end in a semi-supervised manner. In +supervised training, the joint distribution of bi-modality images are learned +from real paired images of the two modalities by explicitly minimizing the +reconstruction losses between the real and synthetic images. To avoid +overfitting limited training images, in unsupervised training, the marginal +distribution of each modality is learned based on unpaired images by minimizing +the Wasserstein distance between the distributions of real and fake images. We +comprehensively evaluate the proposed model using two synthesis tasks based on +three types of evaluate metrics and user studies. Visual and quantitative +results demonstrate the superiority of our method to the state-of-the-art +methods, and reasonable visual quality and clinical significance. Code is made +publicly available at +https://github.com/hustlinyi/Multimodal-Medical-Image-Synthesis. + +
+
+
+
+
+ + ☆ Multi-model fusion for Aerial Vision and Dialog Navigation based on + human attention aids + + +
+ Drones have been widely used in many areas of our daily lives. It relieves +people of the burden of holding a controller all the time and makes drone +control easier to use for people with disabilities or occupied hands. However, +the control of aerial robots is more complicated compared to normal robots due +to factors such as uncontrollable height. Therefore, it is crucial to develop +an intelligent UAV that has the ability to talk to humans and follow natural +language commands. In this report, we present an aerial navigation task for the +2023 ICCV Conversation History. Based on the AVDN dataset containing more than +3k recorded navigation trajectories and asynchronous human-robot conversations, +we propose an effective method of fusion training of Human Attention Aided +Transformer model (HAA-Transformer) and Human Attention Aided LSTM (HAA-LSTM) +model, which achieves the prediction of the navigation routing points and human +attention. The method not only achieves high SR and SPL metrics, but also shows +a 7% improvement in GP metrics compared to the baseline model. + +
+
+ comment: 4 pages, 1 figures +
+
+
+
+
+ + ☆ Hierarchical Contrastive Learning for Pattern-Generalizable Image + Corruption Detection ICCV 2023 + + +
+ Effective image restoration with large-size corruptions, such as blind image +inpainting, entails precise detection of corruption region masks which remains +extremely challenging due to diverse shapes and patterns of corruptions. In +this work, we present a novel method for automatic corruption detection, which +allows for blind corruption restoration without known corruption masks. +Specifically, we develop a hierarchical contrastive learning framework to +detect corrupted regions by capturing the intrinsic semantic distinctions +between corrupted and uncorrupted regions. In particular, our model detects the +corrupted mask in a coarse-to-fine manner by first predicting a coarse mask by +contrastive learning in low-resolution feature space and then refines the +uncertain area of the mask by high-resolution contrastive learning. A +specialized hierarchical interaction mechanism is designed to facilitate the +knowledge propagation of contrastive learning in different scales, boosting the +modeling performance substantially. The detected multi-scale corruption masks +are then leveraged to guide the corruption restoration. Detecting corrupted +regions by learning the contrastive distinctions rather than the semantic +patterns of corruptions, our model has well generalization ability across +different corruption patterns. Extensive experiments demonstrate following +merits of our model: 1) the superior performance over other methods on both +corruption detection and various image restoration tasks including blind +inpainting and watermark removal, and 2) strong generalization across different +corruption patterns such as graffiti, random noise or other image content. +Codes and trained weights are available at https://github.com/xyfJASON/HCL . + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Pruning the Unlabeled Data to Improve Semi-Supervised Learning + + +
+ In the domain of semi-supervised learning (SSL), the conventional approach +involves training a learner with a limited amount of labeled data alongside a +substantial volume of unlabeled data, both drawn from the same underlying +distribution. However, for deep learning models, this standard practice may not +yield optimal results. In this research, we propose an alternative perspective, +suggesting that distributions that are more readily separable could offer +superior benefits to the learner as compared to the original distribution. To +achieve this, we present PruneSSL, a practical technique for selectively +removing examples from the original unlabeled dataset to enhance its +separability. We present an empirical study, showing that although PruneSSL +reduces the quantity of available training data for the learner, it +significantly improves the performance of various competitive SSL algorithms, +thereby achieving state-of-the-art results across several image classification +tasks. + +
+
+
+
+
+ + ♻ ☆ Masked Diffusion as Self-supervised Representation Learner + + +
+ Denoising diffusion probabilistic models have recently demonstrated +state-of-the-art generative performance and been used as strong pixel-level +representation learners. This paper decomposes the interrelation between the +generative capability and representation learning ability inherent in diffusion +models. We present masked diffusion model (MDM), a scalable self-supervised +representation learner that substitutes the conventional additive Gaussian +noise of traditional diffusion with a masking mechanism. Our proposed approach +convincingly surpasses prior benchmarks, demonstrating remarkable advancements +in both medical and natural image semantic segmentation tasks, particularly +within the context of few-shot scenario. + +
+
+
+
+
+ + ♻ ☆ Learning Melanocytic Cell Masks from Adjacent Stained Tissue MICCAI + 2022 + + +
+ Melanoma is one of the most aggressive forms of skin cancer, causing a large +proportion of skin cancer deaths. However, melanoma diagnoses by pathologists +shows low interrater reliability. As melanoma is a cancer of the melanocyte, +there is a clear need to develop a melanocytic cell segmentation tool that is +agnostic to pathologist variability and automates pixel-level annotation. +Gigapixel-level pathologist labeling, however, is impractical. Herein, we +propose a means to train deep neural networks for melanocytic cell segmentation +from hematoxylin and eosin (H&E) stained slides using paired +immunohistochemical (IHC) slides of adjacent tissue sections, achieving a mean +IOU of 0.64 despite imperfect ground-truth labels. + +
+
+ comment: {Medical Image Learning with Limited & Noisy Data Workshop at MICCAI + 2022 +
+
+
+
+
+ + ♻ ☆ Day2Dark: Pseudo-Supervised Activity Recognition beyond Silent Daylight + + +
+ This paper strives to recognize activities in the dark, as well as in the +day. We first establish that state-of-the-art activity recognizers are +effective during the day, but not trustworthy in the dark. The main causes are +the limited availability of labeled dark videos to learn from, as well as the +distribution shift towards the lower color contrast at test-time. To compensate +for the lack of labeled dark videos, we introduce a pseudo-supervised learning +scheme, which utilizes easy to obtain unlabeled and task-irrelevant dark videos +to improve an activity recognizer in low light. As the lower color contrast +results in visual information loss, we further propose to incorporate the +complementary activity information within audio, which is invariant to +illumination. Since the usefulness of audio and visual features differs +depending on the amount of illumination, we introduce our `darkness-adaptive' +audio-visual recognizer. Experiments on EPIC-Kitchens, Kinetics-Sound, and +Charades demonstrate our proposals are superior to image enhancement, domain +adaptation and alternative audio-visual fusion methods, and can even improve +robustness to local darkness caused by occlusions. Project page: +https://xiaobai1217.github.io/Day2Dark/ + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Artificial Intelligence for Automatic Detection and Classification + Disease on the X-Ray Images + + +
+ Detecting and classifying diseases using X-ray images is one of the more +challenging core tasks in the medical and research world. Due to the recent +high interest in radiological images and AI, early detection of diseases in +X-ray images has become notably more essential to prevent further spreading and +flatten the curve. Innovations and revolutions of Computer Vision with Deep +learning methods offer great promise for fast and accurate diagnosis of +screening and detection from chest X-ray images (CXR). This work presents rapid +detection of diseases in the lung using the efficient Deep learning pre-trained +RepVGG algorithm for deep feature extraction and classification. We used X-ray +images as an example to show the model's efficiency. To perform this task, we +classify X-Ray images into Covid-19, Pneumonia, and Normal X-Ray images. Employ +ROI object to improve the detection accuracy for lung extraction, followed by +data pre-processing and augmentation. We are applying Artificial Intelligence +technology for automatic highlighted detection of affected areas of people's +lungs. Based on the X-Ray images, an algorithm was developed that classifies +X-Ray images with height accuracy and power faster thanks to the architecture +transformation of the model. We compared deep learning frameworks' accuracy and +detection of disease. The study shows the high power of deep learning methods +for X-ray images based on COVID-19 detection utilizing chest X-rays. The +proposed framework offers better diagnostic accuracy by comparing popular deep +learning models, i.e., VGG, ResNet50, inceptionV3, DenseNet, and +InceptionResnetV2. + +
+
+
+
+
+ + ♻ ☆ Implicit Autoencoder for Point-Cloud Self-Supervised Representation + Learning ICCV 2023 + + +
+ This paper advocates the use of implicit surface representation in +autoencoder-based self-supervised 3D representation learning. The most popular +and accessible 3D representation, i.e., point clouds, involves discrete samples +of the underlying continuous 3D surface. This discretization process introduces +sampling variations on the 3D shape, making it challenging to develop +transferable knowledge of the true 3D geometry. In the standard autoencoding +paradigm, the encoder is compelled to encode not only the 3D geometry but also +information on the specific discrete sampling of the 3D shape into the latent +code. This is because the point cloud reconstructed by the decoder is +considered unacceptable unless there is a perfect mapping between the original +and the reconstructed point clouds. This paper introduces the Implicit +AutoEncoder (IAE), a simple yet effective method that addresses the sampling +variation issue by replacing the commonly-used point-cloud decoder with an +implicit decoder. The implicit decoder reconstructs a continuous representation +of the 3D shape, independent of the imperfections in the discrete samples. +Extensive experiments demonstrate that the proposed IAE achieves +state-of-the-art performance across various self-supervised learning +benchmarks. + +
+
+ comment: Published in ICCV 2023. The code is available at + https://github.com/SimingYan/IAE +
+
+
+
+
+ + ♻ ☆ Few-shot Forgery Detection via Guided Adversarial Interpolation + + +
+ The increase in face manipulation models has led to a critical issue in +society - the synthesis of realistic visual media. With the emergence of new +forgery approaches at an unprecedented rate, existing forgery detection methods +suffer from significant performance drops when applied to unseen novel forgery +approaches. In this work, we address the few-shot forgery detection problem by +1) designing a comprehensive benchmark based on coverage analysis among various +forgery approaches, and 2) proposing Guided Adversarial Interpolation (GAI). +Our key insight is that there exist transferable distribution characteristics +between majority and minority forgery classes1. Specifically, we enhance the +discriminative ability against novel forgery approaches via adversarially +interpolating the forgery artifacts of the minority samples to the majority +samples under the guidance of a teacher network. Unlike the standard +re-balancing method which usually results in over-fitting to minority classes, +our method simultaneously takes account of the diversity of majority +information as well as the significance of minority information. Extensive +experiments demonstrate that our GAI achieves state-of-the-art performances on +the established few-shot forgery detection benchmark. Notably, our method is +also validated to be robust to choices of majority and minority forgery +approaches. The formal publication version is available in Pattern Recognition. + +
+
+
+
+
+ + ♻ ☆ Local Context-Aware Active Domain Adaptation ICCV 2023 + + +
+ Active Domain Adaptation (ADA) queries the labels of a small number of +selected target samples to help adapting a model from a source domain to a +target domain. The local context of queried data is important, especially when +the domain gap is large. However, this has not been fully explored by existing +ADA works. In this paper, we propose a Local context-aware ADA framework, named +LADA, to address this issue. To select informative target samples, we devise a +novel criterion based on the local inconsistency of model predictions. Since +the labeling budget is usually small, fine-tuning model on only queried data +can be inefficient. We progressively augment labeled target data with the +confident neighbors in a class-balanced manner. Experiments validate that the +proposed criterion chooses more informative target samples than existing active +selection strategies. Furthermore, our full method clearly surpasses recent ADA +arts on various benchmarks. Code is available at https://github.com/tsun/LADA. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SurroundOcc: Multi-Camera 3D Occupancy Prediction for Autonomous Driving ICCV 2023 + + +
+ 3D scene understanding plays a vital role in vision-based autonomous driving. +While most existing methods focus on 3D object detection, they have difficulty +describing real-world objects of arbitrary shapes and infinite classes. Towards +a more comprehensive perception of a 3D scene, in this paper, we propose a +SurroundOcc method to predict the 3D occupancy with multi-camera images. We +first extract multi-scale features for each image and adopt spatial 2D-3D +attention to lift them to the 3D volume space. Then we apply 3D convolutions to +progressively upsample the volume features and impose supervision on multiple +levels. To obtain dense occupancy prediction, we design a pipeline to generate +dense occupancy ground truth without expansive occupancy annotations. +Specifically, we fuse multi-frame LiDAR scans of dynamic objects and static +scenes separately. Then we adopt Poisson Reconstruction to fill the holes and +voxelize the mesh to get dense occupancy labels. Extensive experiments on +nuScenes and SemanticKITTI datasets demonstrate the superiority of our method. +Code and dataset are available at https://github.com/weiyithu/SurroundOcc + +
+
+ comment: Accepted to ICCV 2023. Code is available at + https://github.com/weiyithu/SurroundOcc +
+
+
+
+
+ + ♻ ☆ VDD: Varied Drone Dataset for Semantic Segmentation + + +
+ Semantic segmentation of drone images is critical to many aerial vision tasks +as it provides essential semantic details that can compensate for the lack of +depth information from monocular cameras. However, maintaining high accuracy of +semantic segmentation models for drones requires diverse, large-scale, and +high-resolution datasets, which are rare in the field of aerial image +processing. Existing datasets are typically small and focus primarily on urban +scenes, neglecting rural and industrial areas. Models trained on such datasets +are not sufficiently equipped to handle the variety of inputs seen in drone +imagery. In the VDD-Varied Drone Dataset, we offer a large-scale and densely +labeled dataset comprising 400 high-resolution images that feature carefully +chosen scenes, camera angles, and varied light and weather conditions. +Furthermore, we have adapted existing drone datasets to conform to our +annotation standards and integrated them with VDD to create a dataset 1.5 times +the size of fine annotation of Cityscapes. We have developed a novel DeepLabT +model, which combines CNN and Transformer backbones, to provide a reliable +baseline for semantic segmentation in drone imagery. Our experiments indicate +that DeepLabT performs admirably on VDD and other drone datasets. We expect +that our dataset will generate considerable interest in drone image +segmentation and serve as a foundation for other drone vision tasks. VDD is +freely available on our website at https://vddvdd.com . + +
+
+
+
+
+ + ♻ ☆ VMA: Divide-and-Conquer Vectorized Map Annotation System for Large-Scale + Driving Scene + + +
+ High-definition (HD) map serves as the essential infrastructure of autonomous +driving. In this work, we build up a systematic vectorized map annotation +framework (termed VMA) for efficiently generating HD map of large-scale driving +scene. We design a divide-and-conquer annotation scheme to solve the spatial +extensibility problem of HD map generation, and abstract map elements with a +variety of geometric patterns as unified point sequence representation, which +can be extended to most map elements in the driving scene. VMA is highly +efficient and extensible, requiring negligible human effort, and flexible in +terms of spatial scale and element type. We quantitatively and qualitatively +validate the annotation performance on real-world urban and highway scenes, as +well as NYC Planimetric Database. VMA can significantly improve map generation +efficiency and require little human effort. On average VMA takes 160min for +annotating a scene with a range of hundreds of meters, and reduces 52.3% of the +human cost, showing great application value. Code: +https://github.com/hustvl/VMA. + +
+
+ comment: https://github.com/hustvl/VMA +
+
+
+
+
+ + ♻ ☆ Single image reflection removal via learning with multi-image + constraints + + +
+ Reflections are very common phenomena in our daily photography, which +distract people's attention from the scene behind the glass. The problem of +removing reflection artifacts is important but challenging due to its ill-posed +nature. The traditional approaches solve an optimization problem over the +constraints induced from multiple images, at the expense of large computation +costs. Recent learning-based approaches have demonstrated a significant +improvement in both performance and running time for single image reflection +removal, but are limited as they require a large number of synthetic +reflection/clean image pairs for direct supervision to approximate the ground +truth, at the risk of overfitting in the synthetic image domain and degrading +in the real image domain. In this paper, we propose a novel learning-based +solution that combines the advantages of the aforementioned approaches and +overcomes their drawbacks. Our algorithm works by learning a deep neural +network to optimize the target with joint constraints enhanced among multiple +input images during the training phase, but is able to eliminate reflections +only from a single input for evaluation. Our algorithm runs in real-time and +achieves state-of-the-art reflection removal performance on real images. We +further propose a strong network backbone that disentangles the background and +reflection information into separate latent codes, which are embedded into a +shared one-branch deep neural network for both background and reflection +predictions. The proposed backbone experimentally performs better than the +other common network implementations, and provides insightful knowledge to +understand the reflection removal task. + +
+
+
+
+
+ + ♻ ☆ LXL: LiDAR Excluded Lean 3D Object Detection with 4D Imaging Radar and + Camera Fusion + + +
+ As an emerging technology and a relatively affordable device, the 4D imaging +radar has already been confirmed effective in performing 3D object detection in +autonomous driving. Nevertheless, the sparsity and noisiness of 4D radar point +clouds hinder further performance improvement, and in-depth studies about its +fusion with other modalities are lacking. On the other hand, as a new image +view transformation strategy, "sampling" has been applied in a few image-based +detectors and shown to outperform the widely applied "depth-based splatting" +proposed in Lift-Splat-Shoot (LSS), even without image depth prediction. +However, the potential of "sampling" is not fully unleashed. In this paper, we +investigate the "sampling" view transformation strategy on the camera and 4D +imaging radar fusion-based 3D object detection. In the proposed LiDAR Excluded +Lean (LXL) model, predicted image depth distribution maps and radar 3D +occupancy grids are generated from image perspective view (PV) features and +radar bird's eye view (BEV) features, respectively. They are sent to the core +of LXL, called "radar occupancy-assisted depth-based sampling", to aid image +view transformation. Introducing image depths and radar information enhances +the "sampling" strategy and leads to more accurate view transformation. +Experiments on VoD and TJ4DRadSet datasets show that the proposed method +outperforms the state-of-the-art 3D object detection methods by a significant +margin without bells and whistles. Ablation studies demonstrate that our method +performs the best among different enhancement settings. + +
+
+
+
+
+ + ♻ ☆ SimpleMapping: Real-Time Visual-Inertial Dense Mapping with Deep + Multi-View Stereo + + +
+ We present a real-time visual-inertial dense mapping method capable of +performing incremental 3D mesh reconstruction with high quality using only +sequential monocular images and inertial measurement unit (IMU) readings. 6-DoF +camera poses are estimated by a robust feature-based visual-inertial odometry +(VIO), which also generates noisy sparse 3D map points as a by-product. We +propose a sparse point aided multi-view stereo neural network (SPA-MVSNet) that +can effectively leverage the informative but noisy sparse points from the VIO +system. The sparse depth from VIO is firstly completed by a single-view depth +completion network. This dense depth map, although naturally limited in +accuracy, is then used as a prior to guide our MVS network in the cost volume +generation and regularization for accurate dense depth prediction. Predicted +depth maps of keyframe images by the MVS network are incrementally fused into a +global map using TSDF-Fusion. We extensively evaluate both the proposed +SPA-MVSNet and the entire visual-inertial dense mapping system on several +public datasets as well as our own dataset, demonstrating the system's +impressive generalization capabilities and its ability to deliver high-quality +3D mesh reconstruction online. Our proposed dense mapping system achieves a +39.7% improvement in F-score over existing systems when evaluated on the +challenging scenarios of the EuRoC dataset. + +
+
+
+
+
+ + ♻ ☆ EEP-3DQA: Efficient and Effective Projection-based 3D Model Quality + Assessment + + +
+ Currently, great numbers of efforts have been put into improving the +effectiveness of 3D model quality assessment (3DQA) methods. However, little +attention has been paid to the computational costs and inference time, which is +also important for practical applications. Unlike 2D media, 3D models are +represented by more complicated and irregular digital formats, such as point +cloud and mesh. Thus it is normally difficult to perform an efficient module to +extract quality-aware features of 3D models. In this paper, we address this +problem from the aspect of projection-based 3DQA and develop a no-reference +(NR) \underline{E}fficient and \underline{E}ffective +\underline{P}rojection-based \underline{3D} Model \underline{Q}uality +\underline{A}ssessment (\textbf{EEP-3DQA}) method. The input projection images +of EEP-3DQA are randomly sampled from the six perpendicular viewpoints of the +3D model and are further spatially downsampled by the grid-mini patch sampling +strategy. Further, the lightweight Swin-Transformer tiny is utilized as the +backbone to extract the quality-aware features. Finally, the proposed EEP-3DQA +and EEP-3DQA-t (tiny version) achieve the best performance than the existing +state-of-the-art NR-3DQA methods and even outperforms most full-reference (FR) +3DQA methods on the point cloud and mesh quality assessment databases while +consuming less inference time than the compared 3DQA methods. + +
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Distributional Off-Policy Evaluation for Slate Recommendations + + +
+ Recommendation strategies are typically evaluated by using previously logged +data, employing off-policy evaluation methods to estimate their expected +performance. However, for strategies that present users with slates of multiple +items, the resulting combinatorial action space renders many of these methods +impractical. Prior work has developed estimators that leverage the structure in +slates to estimate the expected off-policy performance, but the estimation of +the entire performance distribution remains elusive. Estimating the complete +distribution allows for a more comprehensive evaluation of recommendation +strategies, particularly along the axes of risk and fairness that employ +metrics computable from the distribution. In this paper, we propose an +estimator for the complete off-policy performance distribution for slates and +establish conditions under which the estimator is unbiased and consistent. This +builds upon prior work on off-policy evaluation for slates and off-policy +distribution estimation in reinforcement learning. We validate the efficacy of +our method empirically on synthetic data as well as on a slate recommendation +simulator constructed from real-world data (MovieLens-20M). Our results show a +significant reduction in estimation variance and improved sample efficiency +over prior work across a range of slate structures. + +
+
+
+
+
+ + ☆ Only Encode Once: Making Content-based News Recommender Greener + + +
+ Large pretrained language models (PLM) have become de facto news encoders in +modern news recommender systems, due to their strong ability in comprehending +textual content. These huge Transformer-based architectures, when finetuned on +recommendation tasks, can greatly improve news recommendation performance. +However, the PLM-based pretrain-finetune framework incurs high computational +cost and energy consumption, primarily due to the extensive redundant +processing of news encoding during each training epoch. In this paper, we +propose the ``Only Encode Once'' framework for news recommendation (OLEO), by +decoupling news representation learning from downstream recommendation task +learning. The decoupled design makes content-based news recommender as green +and efficient as id-based ones, leading to great reduction in computational +cost and training resources. Extensive experiments show that our OLEO framework +can reduce carbon emissions by up to 13 times compared with the +state-of-the-art pretrain-finetune framework and maintain a competitive or even +superior performance level. The source code is released for reproducibility. + +
+
+
+
+
+ + ☆ CTR is not Enough: a Novel Reinforcement Learning based Ranking Approach + for Optimizing Session Clicks + + +
+ Ranking is a crucial module using in the recommender system. In particular, +the ranking module using in our YoungTao recommendation scenario is to provide +an ordered list of items to users, to maximize the click number throughout the +recommendation session for each user. However, we found that the traditional +ranking method for optimizing Click-Through rate(CTR) cannot address our +ranking scenario well, since it completely ignores user leaving, and CTR is the +optimization goal for the one-step recommendation. To effectively undertake the +purpose of our ranking module, we propose a long-term optimization goal, named +as CTE (Click-Through quantity expectation), for explicitly taking the behavior +of user leaving into account. Based on CTE, we propose an effective model +trained by reinforcement learning. Moreover, we build a simulation environment +from offline log data for estimating PBR and CTR. We conduct extensive +experiments on offline datasets and an online e-commerce scenario in TaoBao. +Experimental results show that our method can boost performance effectively + +
+
+
+
+
+ + ☆ Text Matching Improves Sequential Recommendation by Reducing Popularity + Biases CIKM 2023 + + +
+ This paper proposes Text mAtching based SequenTial rEcommendation model +(TASTE), which maps items and users in an embedding space and recommends items +by matching their text representations. TASTE verbalizes items and user-item +interactions using identifiers and attributes of items. To better characterize +user behaviors, TASTE additionally proposes an attention sparsity method, which +enables TASTE to model longer user-item interactions by reducing the +self-attention computations during encoding. Our experiments show that TASTE +outperforms the state-of-the-art methods on widely used sequential +recommendation datasets. TASTE alleviates the cold start problem by +representing long-tail items using full-text modeling and bringing the benefits +of pretrained language models to recommendation systems. Our further analyses +illustrate that TASTE significantly improves the recommendation accuracy by +reducing the popularity bias of previous item id based recommendation models +and returning more appropriate and text-relevant items to satisfy users. All +codes are available at https://github.com/OpenMatch/TASTE. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Analyzing and visualizing polarization and balance with signed networks: + the U.S. Congress case study + + +
+ Signed networks and balance theory provide a natural setting for real-world +scenarios that show polarization dynamics, positive/negative relationships, and +political partisanship. For example, they have been proven effective in +studying the increasing polarization of the votes in the two chambers of the +U.S. Congress from World War II on. + To provide further insights into this particular case study, we propose the +application of a pipeline to analyze and visualize a signed graph's +configuration based on the exploitation of the corresponding Laplacian matrix' +spectral properties. The overall methodology is comparable with others based on +the frustration index, but it has at least two main advantages: first, it +requires a much lower computational cost; second, it allows for a quantitative +and visual assessment of how arbitrarily small subgraphs (even single nodes) +contribute to the overall balance (or unbalance) of the network. + The proposed pipeline allows the exploration of polarization dynamics shown +by the U.S. Congress from 1945 to 2020 at different resolution scales. In fact, +we are able to spot and point out the influence of some (groups of) congressmen +in the overall balance, as well as to observe and explore polarization's +evolution of both chambers across the years. + +
+
+
+
+
+ + ♻ ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential +requirement for satisfying user experience. Different from traditional +e-commerce platforms that offer products, users search on life service +platforms such as Meituan mainly for product providers, which usually have +abundant structured information, e.g. name, address, category, thousands of +products. Modeling search relevance with these rich structured contents is +challenging due to the following issues: (1) there is language distribution +discrepancy among different fields of structured document, making it difficult +to directly adopt off-the-shelf pretrained language model based methods like +BERT. (2) different fields usually have different importance and their length +vary greatly, making it difficult to extract document information helpful for +relevance matching. + To tackle these issues, in this paper we propose a novel two-stage +pretraining and matching architecture for relevance matching with rich +structured documents. At pretraining stage, we propose an effective pretraining +method that employs both query and multiple fields of document as inputs, +including an effective information compression method for lengthy fields. At +relevance matching stage, a novel matching method is proposed by leveraging +domain knowledge in search query to generate more effective document +representations for relevance scoring. Extensive offline experiments and online +A/B tests on millions of users verify that the proposed architectures +effectively improve the performance of relevance modeling. The model has +already been deployed online, serving the search traffic of Meituan for over a +year. + +
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ♻ ☆ Causal Decision Transformer for Recommender Systems via Offline + Reinforcement Learning SIGIR'23 + + +
+ Reinforcement learning-based recommender systems have recently gained +popularity. However, the design of the reward function, on which the agent +relies to optimize its recommendation policy, is often not straightforward. +Exploring the causality underlying users' behavior can take the place of the +reward function in guiding the agent to capture the dynamic interests of users. +Moreover, due to the typical limitations of simulation environments (e.g., data +inefficiency), most of the work cannot be broadly applied in large-scale +situations. Although some works attempt to convert the offline dataset into a +simulator, data inefficiency makes the learning process even slower. Because of +the nature of reinforcement learning (i.e., learning by interaction), it cannot +collect enough data to train during a single interaction. Furthermore, +traditional reinforcement learning algorithms do not have a solid capability +like supervised learning methods to learn from offline datasets directly. In +this paper, we propose a new model named the causal decision transformer for +recommender systems (CDT4Rec). CDT4Rec is an offline reinforcement learning +system that can learn from a dataset rather than from online interaction. +Moreover, CDT4Rec employs the transformer architecture, which is capable of +processing large offline datasets and capturing both short-term and long-term +dependencies within the data to estimate the causal relationship between +action, state, and reward. To demonstrate the feasibility and superiority of +our model, we have conducted experiments on six real-world offline datasets and +one online simulator. + +
+
+ comment: Accepted by SIGIR'23, please check the camera-ready version for more + details such as the implementation +
+
+
+
+
+
+
+
+ + Machine Learning 33 + +
+
+
+ + ☆ Modeling Player Personality Factors from In-Game Behavior and Affective + Expression + + +
+ Developing a thorough understanding of the target audience (and/or single +individuals) is a key factor for success - which is exceptionally important and +powerful for the domain of video games that can not only benefit from informed +decision making during development, but ideally even tailor game content, +difficulty and player experience while playing. The granular assessment of +individual personality and differences across players is a particularly +difficult endeavor, given the highly variant human nature, disagreement in +psychological background models and because of the effortful data collection +that most often builds upon long, time-consuming and deterrent questionnaires. +In this work, we explore possibilities to predict a series of player +personality questionnaire metrics from recorded in-game behavior and extend +related work by explicitly adding affective dialog decisions to the game +environment which could elevate the model's accuracy. Using random forest +regression, we predicted a wide variety of personality metrics from seven +established questionnaires across 62 players over 60 minute gameplay of a +customized version of the role-playing game Fallout: New Vegas. While some +personality variables could already be identified from reasonable underlying +in-game actions and affective expressions, we did not find ways to predict +others or encountered questionable correlations that could not be justified by +theoretical background literature. Yet, building on the initial opportunities +of this explorative study, we are striving to massively enlarge our data set to +players from an ecologically valid industrial game environment and investigate +the performance of more sophisticated machine learning approaches. + +
+
+
+
+
+ + ☆ On Active Learning for Gaussian Process-based Global Sensitivity + Analysis + + +
+ This paper explores the application of active learning strategies to +adaptively learn Sobol indices for global sensitivity analysis. We demonstrate +that active learning for Sobol indices poses unique challenges due to the +definition of the Sobol index as a ratio of variances estimated from Gaussian +process surrogates. Consequently, learning strategies must either focus on +convergence in the numerator or the denominator of this ratio. However, rapid +convergence in either one does not guarantee convergence in the Sobol index. We +propose a novel strategy for active learning that focuses on resolving the main +effects of the Gaussian process (associated with the numerator of the Sobol +index) and compare this with existing strategies based on convergence in the +total variance (the denominator of the Sobol index). The new strategy, +implemented through a new learning function termed the MUSIC (minimize +uncertainty in Sobol index convergence), generally converges in Sobol index +error more rapidly than the existing strategies based on the Expected +Improvement for Global Fit (EIGF) and the Variance Improvement for Global Fit +(VIGF). Both strategies are compared with simple sequential random sampling and +the MUSIC learning function generally converges most rapidly for +low-dimensional problems. However, for high-dimensional problems, the +performance is comparable to random sampling. The new learning strategy is +demonstrated for a practical case of adaptive experimental design for +large-scale Boundary Layer Wind Tunnel experiments. + +
+
+ comment: 31 pages, 16 figures +
+
+
+
+
+ + ☆ Machine Learning for Administrative Health Records: A Systematic Review + of Techniques and Applications + + +
+ Machine learning provides many powerful and effective techniques for +analysing heterogeneous electronic health records (EHR). Administrative Health +Records (AHR) are a subset of EHR collected for administrative purposes, and +the use of machine learning on AHRs is a growing subfield of EHR analytics. +Existing reviews of EHR analytics emphasise that the data-modality of the EHR +limits the breadth of suitable machine learning techniques, and pursuable +healthcare applications. Despite emphasising the importance of data modality, +the literature fails to analyse which techniques and applications are relevant +to AHRs. AHRs contain uniquely well-structured, categorically encoded records +which are distinct from other data-modalities captured by EHRs, and they can +provide valuable information pertaining to how patients interact with the +healthcare system. + This paper systematically reviews AHR-based research, analysing 70 relevant +studies and spanning multiple databases. We identify and analyse which machine +learning techniques are applied to AHRs and which health informatics +applications are pursued in AHR-based research. We also analyse how these +techniques are applied in pursuit of each application, and identify the +limitations of these approaches. We find that while AHR-based studies are +disconnected from each other, the use of AHRs in health informatics research is +substantial and accelerating. Our synthesis of these studies highlights the +utility of AHRs for pursuing increasingly complex and diverse research +objectives despite a number of pervading data- and technique-based limitations. +Finally, through our findings, we propose a set of future research directions +that can enhance the utility of AHR data and machine learning techniques for +health informatics research. + +
+
+
+
+
+ + ☆ TimeTrail: Unveiling Financial Fraud Patterns through Temporal + Correlation Analysis + + +
+ In the field of financial fraud detection, understanding the underlying +patterns and dynamics is important to ensure effective and reliable systems. +This research introduces a new technique, "TimeTrail," which employs advanced +temporal correlation analysis to explain complex financial fraud patterns. The +technique leverages time-related insights to provide transparent and +interpretable explanations for fraud detection decisions, enhancing +accountability and trust. + The "TimeTrail" methodology consists of three key phases: temporal data +enrichment, dynamic correlation analysis, and interpretable pattern +visualization. Initially, raw financial transaction data is enriched with +temporal attributes. Dynamic correlations between these attributes are then +quantified using innovative statistical measures. Finally, a unified +visualization framework presents these correlations in an interpretable manner. +To validate the effectiveness of "TimeTrail," a study is conducted on a diverse +financial dataset, surrounding various fraud scenarios. Results demonstrate the +technique's capability to uncover hidden temporal correlations and patterns, +performing better than conventional methods in both accuracy and +interpretability. Moreover, a case study showcasing the application of +"TimeTrail" in real-world scenarios highlights its utility for fraud detection. + +
+
+
+
+
+ + ☆ Predictive Sparse Manifold Transform ICML + + +
+ We present Predictive Sparse Manifold Transform (PSMT), a minimalistic, +interpretable and biologically plausible framework for learning and predicting +natural dynamics. PSMT incorporates two layers where the first sparse coding +layer represents the input sequence as sparse coefficients over an overcomplete +dictionary and the second manifold learning layer learns a geometric embedding +space that captures topological similarity and dynamic temporal linearity in +sparse coefficients. We apply PSMT on a natural video dataset and evaluate the +reconstruction performance with respect to contextual variability, the number +of sparse coding basis functions and training samples. We then interpret the +dynamic topological organization in the embedding space. We next utilize PSMT +to predict future frames compared with two baseline methods with a static +embedding space. We demonstrate that PSMT with a dynamic embedding space can +achieve better prediction performance compared to static baselines. Our work +establishes that PSMT is an efficient unsupervised generative framework for +prediction of future visual stimuli. + +
+
+ comment: Paper presented at the 1st Workshop on High-dimensional Learning + Dynamics (HLD) at the 40th International Conference on Machine Learning + (ICML) 2023, Honolulu, Hawaii, USA + (https://sites.google.com/view/hidimlearning), 10 pages +
+
+
+
+
+ + ☆ Score-Based Generative Models for PET Image Reconstruction + + +
+ Score-based generative models have demonstrated highly promising results for +medical image reconstruction tasks in magnetic resonance imaging or computed +tomography. However, their application to Positron Emission Tomography (PET) is +still largely unexplored. PET image reconstruction involves a variety of +challenges, including Poisson noise with high variance and a wide dynamic +range. To address these challenges, we propose several PET-specific adaptations +of score-based generative models. The proposed framework is developed for both +2D and 3D PET. In addition, we provide an extension to guided reconstruction +using magnetic resonance images. We validate the approach through extensive 2D +and 3D $\textit{in-silico}$ experiments with a model trained on +patient-realistic data without lesions, and evaluate on data without lesions as +well as out-of-distribution data with lesions. This demonstrates the proposed +method's robustness and significant potential for improved PET reconstruction. + +
+
+ comment: 35 pages, 16 figures, submitted to Journal of Machine Learning for + Biomedical Imaging (MELBA) +
+
+
+
+
+ + ☆ Topological Augmentation for Class-Imbalanced Node Classification + + +
+ Class imbalance is prevalent in real-world node classification tasks and +often biases graph learning models toward majority classes. Most existing +studies root from a node-centric perspective and aim to address the class +imbalance in training data by node/class-wise reweighting or resampling. In +this paper, we approach the source of the class-imbalance bias from an +under-explored topology-centric perspective. Our investigation reveals that +beyond the inherently skewed training class distribution, the graph topology +also plays an important role in the formation of predictive bias: we identify +two fundamental challenges, namely ambivalent and distant message-passing, that +can exacerbate the bias by aggravating majority-class over-generalization and +minority-class misclassification. In light of these findings, we devise a +lightweight topological augmentation method ToBA to dynamically rectify the +nodes influenced by ambivalent/distant message-passing during graph learning, +so as to mitigate the class-imbalance bias. We highlight that ToBA is a +model-agnostic, efficient, and versatile solution that can be seamlessly +combined with and further boost other imbalance-handling techniques. Systematic +experiments validate the superior performance of ToBA in both promoting +imbalanced node classification and mitigating the prediction bias between +different classes. + +
+
+ comment: 19 pages, 8 figures +
+
+
+
+
+ + ☆ Leveraging Linear Independence of Component Classifiers: Optimizing Size + and Prediction Accuracy for Online Ensembles + + +
+ Ensembles, which employ a set of classifiers to enhance classification +accuracy collectively, are crucial in the era of big data. However, although +there is general agreement that the relation between ensemble size and its +prediction accuracy, the exact nature of this relationship is still unknown. We +introduce a novel perspective, rooted in the linear independence of +classifier's votes, to analyze the interplay between ensemble size and +prediction accuracy. This framework reveals a theoretical link, consequently +proposing an ensemble size based on this relationship. Our study builds upon a +geometric framework and develops a series of theorems. These theorems clarify +the role of linear dependency in crafting ensembles. We present a method to +determine the minimum ensemble size required to ensure a target probability of +linearly independent votes among component classifiers. Incorporating real and +synthetic datasets, our empirical results demonstrate a trend: increasing the +number of classifiers enhances accuracy, as predicted by our theoretical +insights. However, we also identify a point of diminishing returns, beyond +which additional classifiers provide diminishing improvements in accuracy. +Surprisingly, the calculated ideal ensemble size deviates from empirical +results for certain datasets, emphasizing the influence of other factors. This +study opens avenues for deeper investigations into the complex dynamics +governing ensemble design and offers guidance for constructing efficient and +effective ensembles in practical scenarios. + +
+
+
+
+
+ + ☆ Integrated Approach of Gearbox Fault Diagnosis + + +
+ Gearbox fault diagnosis is one of the most important parts in any industrial +systems. Failure of components inside gearbox can lead to a catastrophic +failure, uneven breakdown, and financial losses in industrial organization. In +that case intelligent maintenance of the gearbox comes into context. This paper +presents an integrated gearbox fault diagnosis approach which can easily deploy +in online condition monitoring. This work introduces a nonparametric data +preprocessing technique i.e., calculus enhanced energy operator (CEEO) to +preserve the characteristics frequencies in the noisy and inferred vibrational +signal. A set of time domain and spectral domain features are calculated from +the raw and CEEO vibration signal and inputted to the multiclass support vector +machine (MCSVM) to diagnose the faults on the system. An effective comparison +between raw signal and CEEO signal are presented to show the impact of CEEO in +gearbox fault diagnosis. The obtained results of this work look very promising +and can be implemented in any type of industrial system due to its +nonparametric nature. + +
+
+
+
+
+ + ☆ Hypergraph Structure Inference From Data Under Smoothness Prior + + +
+ Hypergraphs are important for processing data with higher-order relationships +involving more than two entities. In scenarios where explicit hypergraphs are +not readily available, it is desirable to infer a meaningful hypergraph +structure from the node features to capture the intrinsic relations within the +data. However, existing methods either adopt simple pre-defined rules that fail +to precisely capture the distribution of the potential hypergraph structure, or +learn a mapping between hypergraph structures and node features but require a +large amount of labelled data, i.e., pre-existing hypergraph structures, for +training. Both restrict their applications in practical scenarios. To fill this +gap, we propose a novel smoothness prior that enables us to design a method to +infer the probability for each potential hyperedge without labelled data as +supervision. The proposed prior indicates features of nodes in a hyperedge are +highly correlated by the features of the hyperedge containing them. We use this +prior to derive the relation between the hypergraph structure and the node +features via probabilistic modelling. This allows us to develop an unsupervised +inference method to estimate the probability for each potential hyperedge via +solving an optimisation problem that has an analytical solution. Experiments on +both synthetic and real-world data demonstrate that our method can learn +meaningful hypergraph structures from data more efficiently than existing +hypergraph structure inference methods. + +
+
+
+
+
+ + ☆ Distributional Off-Policy Evaluation for Slate Recommendations + + +
+ Recommendation strategies are typically evaluated by using previously logged +data, employing off-policy evaluation methods to estimate their expected +performance. However, for strategies that present users with slates of multiple +items, the resulting combinatorial action space renders many of these methods +impractical. Prior work has developed estimators that leverage the structure in +slates to estimate the expected off-policy performance, but the estimation of +the entire performance distribution remains elusive. Estimating the complete +distribution allows for a more comprehensive evaluation of recommendation +strategies, particularly along the axes of risk and fairness that employ +metrics computable from the distribution. In this paper, we propose an +estimator for the complete off-policy performance distribution for slates and +establish conditions under which the estimator is unbiased and consistent. This +builds upon prior work on off-policy evaluation for slates and off-policy +distribution estimation in reinforcement learning. We validate the efficacy of +our method empirically on synthetic data as well as on a slate recommendation +simulator constructed from real-world data (MovieLens-20M). Our results show a +significant reduction in estimation variance and improved sample efficiency +over prior work across a range of slate structures. + +
+
+
+
+
+ + ☆ Explaining with Attribute-based and Relational Near Misses: An + Interpretable Approach to Distinguishing Facial Expressions of Pain and + Disgust + + +
+ Explaining concepts by contrasting examples is an efficient and convenient +way of giving insights into the reasons behind a classification decision. This +is of particular interest in decision-critical domains, such as medical +diagnostics. One particular challenging use case is to distinguish facial +expressions of pain and other states, such as disgust, due to high similarity +of manifestation. In this paper, we present an approach for generating +contrastive explanations to explain facial expressions of pain and disgust +shown in video sequences. We implement and compare two approaches for +contrastive explanation generation. The first approach explains a specific pain +instance in contrast to the most similar disgust instance(s) based on the +occurrence of facial expressions (attributes). The second approach takes into +account which temporal relations hold between intervals of facial expressions +within a sequence (relations). The input to our explanation generation approach +is the output of an interpretable rule-based classifier for pain and disgust.We +utilize two different similarity metrics to determine near misses and far +misses as contrasting instances. Our results show that near miss explanations +are shorter than far miss explanations, independent from the applied similarity +metric. The outcome of our evaluation indicates that pain and disgust can be +distinguished with the help of temporal relations. We currently plan +experiments to evaluate how the explanations help in teaching concepts and how +they could be enhanced by further modalities and interaction. + +
+
+
+
+
+ + ☆ Learning end-to-end inversion of circular Radon transforms in the + partial radial setup + + +
+ We present a deep learning-based computational algorithm for inversion of +circular Radon transforms in the partial radial setup, arising in photoacoustic +tomography. We first demonstrate that the truncated singular value +decomposition-based method, which is the only traditional algorithm available +to solve this problem, leads to severe artifacts which renders the +reconstructed field as unusable. With the objective of overcoming this +computational bottleneck, we train a ResBlock based U-Net to recover the +inferred field that directly operates on the measured data. Numerical results +with augmented Shepp-Logan phantoms, in the presence of noisy full and limited +view data, demonstrate the superiority of the proposed algorithm. + +
+
+
+
+
+ + ☆ Integrated Variational Fourier Features for Fast Spatial Modelling with + Gaussian Processes + + +
+ Sparse variational approximations are popular methods for scaling up +inference and learning in Gaussian processes to larger datasets. For $N$ +training points, exact inference has $O(N^3)$ cost; with $M \ll N$ features, +state of the art sparse variational methods have $O(NM^2)$ cost. Recently, +methods have been proposed using more sophisticated features; these promise +$O(M^3)$ cost, with good performance in low dimensional tasks such as spatial +modelling, but they only work with a very limited class of kernels, excluding +some of the most commonly used. In this work, we propose integrated Fourier +features, which extends these performance benefits to a very broad class of +stationary covariance functions. We motivate the method and choice of +parameters from a convergence analysis and empirical exploration, and show +practical speedup in synthetic and real world spatial regression tasks. + +
+
+
+
+
+ + ☆ Detecting Language Model Attacks with Perplexity + + +
+ A novel hack involving Large Language Models (LLMs) has emerged, leveraging +adversarial suffixes to trick models into generating perilous responses. This +method has garnered considerable attention from reputable media outlets such as +the New York Times and Wired, thereby influencing public perception regarding +the security and safety of LLMs. In this study, we advocate the utilization of +perplexity as one of the means to recognize such potential attacks. The +underlying concept behind these hacks revolves around appending an unusually +constructed string of text to a harmful query that would otherwise be blocked. +This maneuver confuses the protective mechanisms and tricks the model into +generating a forbidden response. Such scenarios could result in providing +detailed instructions to a malicious user for constructing explosives or +orchestrating a bank heist. Our investigation demonstrates the feasibility of +employing perplexity, a prevalent natural language processing metric, to detect +these adversarial tactics before generating a forbidden response. By evaluating +the perplexity of queries with and without such adversarial suffixes using an +open-source LLM, we discovered that nearly 90 percent were above a perplexity +of 1000. This contrast underscores the efficacy of perplexity for detecting +this type of exploit. + +
+
+
+
+
+ + ☆ SPEED: Streaming Partition and Parallel Acceleration for Temporal + Interaction Graph Embedding + + +
+ Temporal Interaction Graphs (TIGs) are widely employed to model intricate +real-world systems such as financial systems and social networks. To capture +the dynamism and interdependencies of nodes, existing TIG embedding models need +to process edges sequentially and chronologically. However, this requirement +prevents it from being processed in parallel and struggle to accommodate +burgeoning data volumes to GPU. Consequently, many large-scale temporal +interaction graphs are confined to CPU processing. Furthermore, a generalized +GPU scaling and acceleration approach remains unavailable. To facilitate +large-scale TIGs' implementation on GPUs for acceleration, we introduce a novel +training approach namely Streaming Edge Partitioning and Parallel Acceleration +for Temporal Interaction Graph Embedding (SPEED). The SPEED is comprised of a +Streaming Edge Partitioning Component (SEP) which addresses space overhead +issue by assigning fewer nodes to each GPU, and a Parallel Acceleration +Component (PAC) which enables simultaneous training of different sub-graphs, +addressing time overhead issue. Our method can achieve a good balance in +computing resources, computing time, and downstream task performance. Empirical +validation across 7 real-world datasets demonstrates the potential to expedite +training speeds by a factor of up to 19.29x. Simultaneously, resource +consumption of a single-GPU can be diminished by up to 69%, thus enabling the +multiple GPU-based training and acceleration encompassing millions of nodes and +billions of edges. Furthermore, our approach also maintains its competitiveness +in downstream tasks. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Empowering Clinicians and Democratizing Data Science: Large Language + Models Automate Machine Learning for Clinical Studies + + +
+ A knowledge gap persists between Machine Learning (ML) developers (e.g., data +scientists) and practitioners (e.g., clinicians), hampering the full +utilization of ML for clinical data analysis. We investigated the potential of +the chatGPT Code Interpreter (CI), an extension of GPT-4, to bridge this gap +and perform ML analyses efficiently. Real-world clinical datasets and study +details from large trials across various medical specialties were presented to +chatGPT CI without specific guidance. ChatGPT CI autonomously developed +state-of-the-art ML models based on the original study's training data to +predict clinical outcomes such as cancer development, cancer progression, +disease complications, or biomarkers such as pathogenic gene sequences. +Strikingly, these ML models matched or outperformed their published +counterparts. We conclude that chatGPT CI offers a promising avenue to +democratize ML in medicine, making advanced analytics accessible to non-ML +experts and promoting broader applications in medical research and practice. + +
+
+
+
+
+ + ☆ Semi-Supervised Learning in the Few-Shot Zero-Shot Scenario + + +
+ Semi-Supervised Learning (SSL) leverages both labeled and unlabeled data to +improve model performance. Traditional SSL methods assume that labeled and +unlabeled data share the same label space. However, in real-world applications, +especially when the labeled training set is small, there may be classes that +are missing from the labeled set. Existing frameworks aim to either reject all +unseen classes (open-set SSL) or to discover unseen classes by partitioning an +unlabeled set during training (open-world SSL). In our work, we construct a +classifier for points from both seen and unseen classes. Our approach is based +on extending an existing SSL method, such as FlexMatch, by incorporating an +additional entropy loss. This enhancement allows our method to improve the +performance of any existing SSL method in the classification of both seen and +unseen classes. We demonstrate large improvement gains over state-of-the-art +SSL, open-set SSL, and open-world SSL methods, on two benchmark image +classification data sets, CIFAR-100 and STL-10. The gains are most pronounced +when the labeled data is severely limited (1-25 labeled examples per class). + +
+
+
+
+
+ + ☆ Hybrid Transformer-RNN Architecture for Household Occupancy Detection + Using Low-Resolution Smart Meter Data + + +
+ Residential occupancy detection has become an enabling technology in today's +urbanized world for various smart home applications, such as building +automation, energy management, and improved security and comfort. +Digitalization of the energy system provides smart meter data that can be used +for occupancy detection in a non-intrusive manner without causing concerns +regarding privacy and data security. In particular, deep learning techniques +make it possible to infer occupancy from low-resolution smart meter data, such +that the need for accurate occupancy detection with privacy preservation can be +achieved. Our work is thus motivated to develop a privacy-aware and effective +model for residential occupancy detection in contemporary living environments. +Our model aims to leverage the advantages of both recurrent neural networks +(RNNs), which are adept at capturing local temporal dependencies, and +transformers, which are effective at handling global temporal dependencies. Our +designed hybrid transformer-RNN model detects residential occupancy using +hourly smart meter data, achieving an accuracy of nearly 92\% across households +with diverse profiles. We validate the effectiveness of our method using a +publicly accessible dataset and demonstrate its performance by comparing it +with state-of-the-art models, including attention-based occupancy detection +methods. + +
+
+ comment: IEEE IECON 2023 (The 49th Annual Conference of the IEEE Industrial + Electronics Society) +
+
+
+
+
+ + ☆ Depth self-supervision for single image novel view synthesis + + +
+ In this paper, we tackle the problem of generating a novel image from an +arbitrary viewpoint given a single frame as input. While existing methods +operating in this setup aim at predicting the target view depth map to guide +the synthesis, without explicit supervision over such a task, we jointly +optimize our framework for both novel view synthesis and depth estimation to +unleash the synergy between the two at its best. Specifically, a shared depth +decoder is trained in a self-supervised manner to predict depth maps that are +consistent across the source and target views. Our results demonstrate the +effectiveness of our approach in addressing the challenges of both tasks +allowing for higher-quality generated images, as well as more accurate depth +for the target viewpoint. + +
+
+
+
+
+ + ☆ Towards Generalizable Neural Solvers for Vehicle Routing Problems via + Ensemble with Transferrable Local Policy + + +
+ Machine learning has been adapted to help solve NP-hard combinatorial +optimization problems. One prevalent way is learning to construct solutions by +deep neural networks, which has been receiving more and more attention due to +the high efficiency and less requirement for expert knowledge. However, many +neural construction methods for Vehicle Routing Problems (VRPs) focus on +synthetic problem instances with limited scales and specified node +distributions, leading to poor performance on real-world problems which usually +involve large scales together with complex and unknown node distributions. To +make neural VRP solvers more practical in real-world scenarios, we design an +auxiliary policy that learns from the local transferable topological features, +named local policy, and integrate it with a typical constructive policy (which +learns from the global information of VRP instances) to form an ensemble +policy. With joint training, the aggregated policies perform cooperatively and +complementarily to boost generalization. The experimental results on two +well-known benchmarks, TSPLIB and CVRPLIB, of travelling salesman problem and +capacitated VRP show that the ensemble policy consistently achieves better +generalization than state-of-the-art construction methods and even works well +on real-world problems with several thousand nodes. + +
+
+
+
+
+ + ☆ The inverse problem for neural networks + + +
+ We study the problem of computing the preimage of a set under a neural +network with piecewise-affine activation functions. We recall an old result +that the preimage of a polyhedral set is again a union of polyhedral sets and +can be effectively computed. We show several applications of computing the +preimage for analysis and interpretability of neural networks. + +
+
+
+
+
+ + ☆ MedAlign: A Clinician-Generated Dataset for Instruction Following with + Electronic Medical Records + + +
+ The ability of large language models (LLMs) to follow natural language +instructions with human-level fluency suggests many opportunities in healthcare +to reduce administrative burden and improve quality of care. However, +evaluating LLMs on realistic text generation tasks for healthcare remains +challenging. Existing question answering datasets for electronic health record +(EHR) data fail to capture the complexity of information needs and +documentation burdens experienced by clinicians. To address these challenges, +we introduce MedAlign, a benchmark dataset of 983 natural language instructions +for EHR data. MedAlign is curated by 15 clinicians (7 specialities), includes +clinician-written reference responses for 303 instructions, and provides 276 +longitudinal EHRs for grounding instruction-response pairs. We used MedAlign to +evaluate 6 general domain LLMs, having clinicians rank the accuracy and quality +of each LLM response. We found high error rates, ranging from 35% (GPT-4) to +68% (MPT-7B-Instruct), and an 8.3% drop in accuracy moving from 32k to 2k +context lengths for GPT-4. Finally, we report correlations between clinician +rankings and automated natural language generation metrics as a way to rank +LLMs without human review. We make MedAlign available under a research data use +agreement to enable LLM evaluations on tasks aligned with clinician needs and +preferences. + +
+
+
+
+
+ + ☆ Sampling with flows, diffusion and autoregressive neural networks: A + spin-glass perspective + + +
+ Recent years witnessed the development of powerful generative models based on +flows, diffusion or autoregressive neural networks, achieving remarkable +success in generating data from examples with applications in a broad range of +areas. A theoretical analysis of the performance and understanding of the +limitations of these methods remain, however, challenging. In this paper, we +undertake a step in this direction by analysing the efficiency of sampling by +these methods on a class of problems with a known probability distribution and +comparing it with the sampling performance of more traditional methods such as +the Monte Carlo Markov chain and Langevin dynamics. We focus on a class of +probability distribution widely studied in the statistical physics of +disordered systems that relate to spin glasses, statistical inference and +constraint satisfaction problems. + We leverage the fact that sampling via flow-based, diffusion-based or +autoregressive networks methods can be equivalently mapped to the analysis of a +Bayes optimal denoising of a modified probability measure. Our findings +demonstrate that these methods encounter difficulties in sampling stemming from +the presence of a first-order phase transition along the algorithm's denoising +path. Our conclusions go both ways: we identify regions of parameters where +these methods are unable to sample efficiently, while that is possible using +standard Monte Carlo or Langevin approaches. We also identify regions where the +opposite happens: standard approaches are inefficient while the discussed +generative methods work well. + +
+
+ comment: 39 pages, 12 figures +
+
+
+
+
+ + ☆ Pruning the Unlabeled Data to Improve Semi-Supervised Learning + + +
+ In the domain of semi-supervised learning (SSL), the conventional approach +involves training a learner with a limited amount of labeled data alongside a +substantial volume of unlabeled data, both drawn from the same underlying +distribution. However, for deep learning models, this standard practice may not +yield optimal results. In this research, we propose an alternative perspective, +suggesting that distributions that are more readily separable could offer +superior benefits to the learner as compared to the original distribution. To +achieve this, we present PruneSSL, a practical technique for selectively +removing examples from the original unlabeled dataset to enhance its +separability. We present an empirical study, showing that although PruneSSL +reduces the quantity of available training data for the learner, it +significantly improves the performance of various competitive SSL algorithms, +thereby achieving state-of-the-art results across several image classification +tasks. + +
+
+
+
+
+ + ♻ ☆ Learning Melanocytic Cell Masks from Adjacent Stained Tissue MICCAI + 2022 + + +
+ Melanoma is one of the most aggressive forms of skin cancer, causing a large +proportion of skin cancer deaths. However, melanoma diagnoses by pathologists +shows low interrater reliability. As melanoma is a cancer of the melanocyte, +there is a clear need to develop a melanocytic cell segmentation tool that is +agnostic to pathologist variability and automates pixel-level annotation. +Gigapixel-level pathologist labeling, however, is impractical. Herein, we +propose a means to train deep neural networks for melanocytic cell segmentation +from hematoxylin and eosin (H&E) stained slides using paired +immunohistochemical (IHC) slides of adjacent tissue sections, achieving a mean +IOU of 0.64 despite imperfect ground-truth labels. + +
+
+ comment: {Medical Image Learning with Limited & Noisy Data Workshop at MICCAI + 2022 +
+
+
+
+
+ + ♻ ☆ How to choose the most appropriate centrality measure? A decision tree + approach + + +
+ Centrality metrics play a crucial role in network analysis, while the choice +of specific measures significantly influences the accuracy of conclusions as +each measure represents a unique concept of node importance. Among over 400 +proposed indices, selecting the most suitable ones for specific applications +remains a challenge. Existing approaches -- model-based, data-driven, and +axiomatic -- have limitations, requiring association with models, training +datasets, or restrictive axioms for each specific application. To address this, +we introduce the culling method, which relies on the expert concept of +centrality behavior on simple graphs. The culling method involves forming a set +of candidate measures, generating a list of as small graphs as possible needed +to distinguish the measures from each other, constructing a decision-tree +survey, and identifying the measure consistent with the expert's concept. We +apply this approach to a diverse set of 40 centralities, including novel +kernel-based indices, and combine it with the axiomatic approach. Remarkably, +only 13 small 1-trees are sufficient to separate all 40 measures, even for +pairs of closely related ones. By adopting simple ordinal axioms like +Self-consistency or Bridge axiom, the set of measures can be drastically +reduced making the culling survey short. Applying the culling method provides +insightful findings on some centrality indices, such as PageRank, Bridging, and +dissimilarity-based Eigencentrality measures, among others. The proposed +approach offers a cost-effective solution in terms of labor and time, +complementing existing methods for measure selection, and providing deeper +insights into the underlying mechanisms of centrality measures. + +
+
+ comment: 12 pages, 2 tables, 1 algorithm, 8 figures. Presentation has been + improved +
+
+
+
+
+ + ♻ ☆ Practical Batch Bayesian Sampling Algorithms for Online Adaptive Traffic + Experimentation + + +
+ Online controlled experiments have emerged as industry gold standard for +assessing new web features. As new web algorithms proliferate, experimentation +platform faces an increasing demand on the velocity of online experiments, +which encourages adaptive traffic testing methods to speed up identifying best +variant by efficiently allocating traffic. This paper proposed four Bayesian +batch bandit algorithms (NB-TS, WB-TS, NB-TTTS, WB-TTTS) for eBay's +experimentation platform, using summary batch statistics of a goal metric +without incurring new engineering technical debts. The novel WB-TTTS, in +particular, demonstrates as an efficient, trustworthy and robust alternative to +fixed horizon A/B testing. Another novel contribution is to bring +trustworthiness of best arm identification algorithms into evaluation criterion +and highlight the existence of severe false positive inflation with equivalent +best arms. To gain the trust of experimenters, the experimentation platform +must consider both efficiency and trustworthiness; However, to the best of +authors' knowledge, trustworthiness as an important topic is rarely discussed +in literatures of either best arm identification or multi-armed bandit. This +paper shows that Bayesian bandits without neutral posterior reshaping, +particularly naive Thompson sampling (NB-TS), are untrustworthy because they +can always identify an arm as best from equivalent best arms. To restore +trustworthiness, a novel finding uncovers connections between convergence +distribution of posterior optimal probabilities of equivalent best arms and +neutral posterior reshaping, which controls false positives. Lastly, this paper +presents lessons learned from eBay's experience, as well as evaluations of the +four algorithms. We hope our work is useful to other industrial practitioners +and inspire academic researchers interested in the trustworthiness of adaptive +traffic experimentation. + +
+
+
+
+
+ + ♻ ☆ Artificial Intelligence for Automatic Detection and Classification + Disease on the X-Ray Images + + +
+ Detecting and classifying diseases using X-ray images is one of the more +challenging core tasks in the medical and research world. Due to the recent +high interest in radiological images and AI, early detection of diseases in +X-ray images has become notably more essential to prevent further spreading and +flatten the curve. Innovations and revolutions of Computer Vision with Deep +learning methods offer great promise for fast and accurate diagnosis of +screening and detection from chest X-ray images (CXR). This work presents rapid +detection of diseases in the lung using the efficient Deep learning pre-trained +RepVGG algorithm for deep feature extraction and classification. We used X-ray +images as an example to show the model's efficiency. To perform this task, we +classify X-Ray images into Covid-19, Pneumonia, and Normal X-Ray images. Employ +ROI object to improve the detection accuracy for lung extraction, followed by +data pre-processing and augmentation. We are applying Artificial Intelligence +technology for automatic highlighted detection of affected areas of people's +lungs. Based on the X-Ray images, an algorithm was developed that classifies +X-Ray images with height accuracy and power faster thanks to the architecture +transformation of the model. We compared deep learning frameworks' accuracy and +detection of disease. The study shows the high power of deep learning methods +for X-ray images based on COVID-19 detection utilizing chest X-rays. The +proposed framework offers better diagnostic accuracy by comparing popular deep +learning models, i.e., VGG, ResNet50, inceptionV3, DenseNet, and +InceptionResnetV2. + +
+
+
+
+
+ + ♻ ☆ Local Context-Aware Active Domain Adaptation ICCV 2023 + + +
+ Active Domain Adaptation (ADA) queries the labels of a small number of +selected target samples to help adapting a model from a source domain to a +target domain. The local context of queried data is important, especially when +the domain gap is large. However, this has not been fully explored by existing +ADA works. In this paper, we propose a Local context-aware ADA framework, named +LADA, to address this issue. To select informative target samples, we devise a +novel criterion based on the local inconsistency of model predictions. Since +the labeling budget is usually small, fine-tuning model on only queried data +can be inefficient. We progressively augment labeled target data with the +confident neighbors in a class-balanced manner. Experiments validate that the +proposed criterion chooses more informative target samples than existing active +selection strategies. Furthermore, our full method clearly surpasses recent ADA +arts on various benchmarks. Code is available at https://github.com/tsun/LADA. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ ReCo: A Dataset for Residential Community Layout Planning + + +
+ Layout planning is centrally important in the field of architecture and urban +design. Among the various basic units carrying urban functions, residential +community plays a vital part for supporting human life. Therefore, the layout +planning of residential community has always been of concern, and has attracted +particular attention since the advent of deep learning that facilitates the +automated layout generation and spatial pattern recognition. However, the +research circles generally suffer from the insufficiency of residential +community layout benchmark or high-quality datasets, which hampers the future +exploration of data-driven methods for residential community layout planning. +The lack of datasets is largely due to the difficulties of large-scale +real-world residential data acquisition and long-term expert screening. In +order to address the issues and advance a benchmark dataset for various +intelligent spatial design and analysis applications in the development of +smart city, we introduce Residential Community Layout Planning (ReCo) Dataset, +which is the first and largest open-source vector dataset related to real-world +community to date. ReCo Dataset is presented in multiple data formats with +37,646 residential community layout plans, covering 598,728 residential +buildings with height information. ReCo can be conveniently adapted for +residential community layout related urban design tasks, e.g., generative +layout design, morphological pattern recognition and spatial evaluation. To +validate the utility of ReCo in automated residential community layout +planning, two Generative Adversarial Network (GAN) based generative models are +further applied to the dataset. We expect ReCo Dataset to inspire more creative +and practical work in intelligent design and beyond. The ReCo Dataset is +published at: https://www.kaggle.com/fdudsde/reco-dataset. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ A Survey of Safety and Trustworthiness of Large Language Models through + the Lens of Verification and Validation + + +
+ Large Language Models (LLMs) have exploded a new heatwave of AI for their +ability to engage end-users in human-level conversations with detailed and +articulate answers across many knowledge domains. In response to their fast +adoption in many industrial applications, this survey concerns their safety and +trustworthiness. First, we review known vulnerabilities and limitations of the +LLMs, categorising them into inherent issues, attacks, and unintended bugs. +Then, we consider if and how the Verification and Validation (V&V) techniques, +which have been widely developed for traditional software and deep learning +models such as convolutional neural networks as independent processes to check +the alignment of their implementations against the specifications, can be +integrated and further extended throughout the lifecycle of the LLMs to provide +rigorous analysis to the safety and trustworthiness of LLMs and their +applications. Specifically, we consider four complementary techniques: +falsification and evaluation, verification, runtime monitoring, and regulations +and ethical use. In total, 370+ references are considered to support the quick +understanding of the safety and trustworthiness issues from the perspective of +V&V. While intensive research has been conducted to identify the safety and +trustworthiness issues, rigorous yet practical methods are called for to ensure +the alignment of LLMs with safety and trustworthiness requirements. + +
+
+
+
+
+ + ♻ ☆ PMU measurements based short-term voltage stability assessment of power + systems via deep transfer learning + + +
+ Deep learning has emerged as an effective solution for addressing the +challenges of short-term voltage stability assessment (STVSA) in power systems. +However, existing deep learning-based STVSA approaches face limitations in +adapting to topological changes, sample labeling, and handling small datasets. +To overcome these challenges, this paper proposes a novel phasor measurement +unit (PMU) measurements-based STVSA method by using deep transfer learning. The +method leverages the real-time dynamic information captured by PMUs to create +an initial dataset. It employs temporal ensembling for sample labeling and +utilizes least squares generative adversarial networks (LSGAN) for data +augmentation, enabling effective deep learning on small-scale datasets. +Additionally, the method enhances adaptability to topological changes by +exploring connections between different faults. Experimental results on the +IEEE 39-bus test system demonstrate that the proposed method improves model +evaluation accuracy by approximately 20% through transfer learning, exhibiting +strong adaptability to topological changes. Leveraging the self-attention +mechanism of the Transformer model, this approach offers significant advantages +over shallow learning methods and other deep learning-based approaches. + +
+
+ comment: Accepted by IEEE Transactions on Instrumentation & Measurement +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Computation-efficient Deep Learning for Computer Vision: A Survey + + +
+ Over the past decade, deep learning models have exhibited considerable +advancements, reaching or even exceeding human-level performance in a range of +visual perception tasks. This remarkable progress has sparked interest in +applying deep networks to real-world applications, such as autonomous vehicles, +mobile devices, robotics, and edge computing. However, the challenge remains +that state-of-the-art models usually demand significant computational +resources, leading to impractical power consumption, latency, or carbon +emissions in real-world scenarios. This trade-off between effectiveness and +efficiency has catalyzed the emergence of a new research focus: computationally +efficient deep learning, which strives to achieve satisfactory performance +while minimizing the computational cost during inference. This review offers an +extensive analysis of this rapidly evolving field by examining four key areas: +1) the development of static or dynamic light-weighted backbone models for the +efficient extraction of discriminative deep representations; 2) the specialized +network architectures or algorithms tailored for specific computer vision +tasks; 3) the techniques employed for compressing deep learning models; and 4) +the strategies for deploying efficient deep networks on hardware platforms. +Additionally, we provide a systematic discussion on the critical challenges +faced in this domain, such as network architecture design, training schemes, +practical efficiency, and more realistic model compression approaches, as well +as potential future research directions. + +
+
+
+
+
+ + ♻ ☆ VATP360: Viewport Adaptive 360-Degree Video Streaming based on Tile + Priority + + +
+ 360-degree video becomes increasingly popular among users. In the current +network bandwidth, serving high resolution 360 degree video to users is quite +difficult. Most of the work has been devoted to the prediction of user +viewports or tile-based adaptive algorithms. However, it is difficult to +predict user viewports more accurately using only information such as user's +historical viewports or video saliency maps. In this paper, we propose a +viewport adaptive 360-degree video streaming method based on tile priority +(VATP360), which tries to balance between the performance and the overhead. The +proposed VATP360 consists of three main modules: viewport prediction, tile +priority classification and bitrate allocation. In the viewport prediction +module, object motion trajectory and predicted user's region-of-interest (ROI) +are used to achieve accurate prediction of the user's future viewport. Then, +the predicted viewport, along with the object motion trajectory, are fed into +the proposed tile priority classification algorithm to assign different +priorities to tiles, which would reduce the computational complexity of the +bitrate allocation module. Finally in the bitrate allocation stage, we +adaptively assign bitrates to tiles of different priority by reinforcement +learning. Experimental results on publicly available datasets have demonstrated +the effectiveness of the proposed method. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 22 + +
+
+
+ + ☆ Translate Meanings, Not Just Words: IdiomKB's Role in Optimizing + Idiomatic Translation with Language Models + + +
+ To translate well, machine translation (MT) systems and general-purposed +language models (LMs) need a deep understanding of both source and target +languages and cultures. Therefore, idioms, with their non-compositional nature, +pose particular challenges for Transformer-based systems, as literal +translations often miss the intended meaning. Traditional methods, which +replace idioms using existing knowledge bases (KBs), often lack scale and +context awareness. Addressing these challenges, our approach prioritizes +context awareness and scalability, allowing for offline storage of idioms in a +manageable KB size. This ensures efficient serving with smaller models and +provides a more comprehensive understanding of idiomatic expressions. We +introduce a multilingual idiom KB (IdiomKB) developed using large LMs to +address this. This KB facilitates better translation by smaller models, such as +BLOOMZ (7.1B), Alpaca (7B), and InstructGPT (6.7B), by retrieving idioms' +figurative meanings. We present a novel, GPT-4-powered metric for human-aligned +evaluation, demonstrating that IdiomKB considerably boosts model performance. +Human evaluations further validate our KB's quality. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Improving Knowledge Distillation for BERT Models: Loss Functions, + Mapping Methods, and Weight Tuning + + +
+ The use of large transformer-based models such as BERT, GPT, and T5 has led +to significant advancements in natural language processing. However, these +models are computationally expensive, necessitating model compression +techniques that reduce their size and complexity while maintaining accuracy. +This project investigates and applies knowledge distillation for BERT model +compression, specifically focusing on the TinyBERT student model. We explore +various techniques to improve knowledge distillation, including experimentation +with loss functions, transformer layer mapping methods, and tuning the weights +of attention and representation loss and evaluate our proposed techniques on a +selection of downstream tasks from the GLUE benchmark. The goal of this work is +to improve the efficiency and effectiveness of knowledge distillation, enabling +the development of more efficient and accurate models for a range of natural +language processing tasks. + +
+
+
+
+
+ + ☆ Exploring Large Language Models for Knowledge Graph Completion + + +
+ Knowledge graphs play a vital role in numerous artificial intelligence tasks, +yet they frequently face the issue of incompleteness. In this study, we explore +utilizing Large Language Models (LLM) for knowledge graph completion. We +consider triples in knowledge graphs as text sequences and introduce an +innovative framework called Knowledge Graph LLM (KG-LLM) to model these +triples. Our technique employs entity and relation descriptions of a triple as +prompts and utilizes the response for predictions. Experiments on various +benchmark knowledge graphs demonstrate that our method attains state-of-the-art +performance in tasks such as triple classification and relation prediction. We +also find that fine-tuning relatively smaller models (e.g., LLaMA-7B, +ChatGLM-6B) outperforms recent ChatGPT and GPT-4. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ A Wide Evaluation of ChatGPT on Affective Computing Tasks + + +
+ With the rise of foundation models, a new artificial intelligence paradigm +has emerged, by simply using general purpose foundation models with prompting +to solve problems instead of training a separate machine learning model for +each problem. Such models have been shown to have emergent properties of +solving problems that they were not initially trained on. The studies for the +effectiveness of such models are still quite limited. In this work, we widely +study the capabilities of the ChatGPT models, namely GPT-4 and GPT-3.5, on 13 +affective computing problems, namely aspect extraction, aspect polarity +classification, opinion extraction, sentiment analysis, sentiment intensity +ranking, emotions intensity ranking, suicide tendency detection, toxicity +detection, well-being assessment, engagement measurement, personality +assessment, sarcasm detection, and subjectivity detection. We introduce a +framework to evaluate the ChatGPT models on regression-based problems, such as +intensity ranking problems, by modelling them as pairwise ranking +classification. We compare ChatGPT against more traditional NLP methods, such +as end-to-end recurrent neural networks and transformers. The results +demonstrate the emergent abilities of the ChatGPT models on a wide range of +affective computing problems, where GPT-3.5 and especially GPT-4 have shown +strong performance on many problems, particularly the ones related to +sentiment, emotions, or toxicity. The ChatGPT models fell short for problems +with implicit signals, such as engagement measurement and subjectivity +detection. + +
+
+ comment: 8 pages with references, 2 tables +
+
+
+
+
+ + ☆ LMSanitator: Defending Prompt-Tuning Against Task-Agnostic Backdoors NDSS + + +
+ Prompt-tuning has emerged as an attractive paradigm for deploying large-scale +language models due to its strong downstream task performance and efficient +multitask serving ability. Despite its wide adoption, we empirically show that +prompt-tuning is vulnerable to downstream task-agnostic backdoors, which reside +in the pretrained models and can affect arbitrary downstream tasks. The +state-of-the-art backdoor detection approaches cannot defend against +task-agnostic backdoors since they hardly converge in reversing the backdoor +triggers. To address this issue, we propose LMSanitator, a novel approach for +detecting and removing task-agnostic backdoors on Transformer models. Instead +of directly inversing the triggers, LMSanitator aims to inverse the predefined +attack vectors (pretrained models' output when the input is embedded with +triggers) of the task-agnostic backdoors, which achieves much better +convergence performance and backdoor detection accuracy. LMSanitator further +leverages prompt-tuning's property of freezing the pretrained model to perform +accurate and fast output monitoring and input purging during the inference +phase. Extensive experiments on multiple language models and NLP tasks +illustrate the effectiveness of LMSanitator. For instance, LMSanitator achieves +92.8% backdoor detection accuracy on 960 models and decreases the attack +success rate to less than 1% in most scenarios. + +
+
+ comment: To Appear in the Network and Distributed System Security (NDSS) + Symposium 2024, 26 February - 1 March 2024, San Diego, CA, USA +
+
+
+
+
+ + ☆ Solving Math Word Problem with Problem Type Classification NLPCC2023 + + +
+ Math word problems (MWPs) require analyzing text descriptions and generating +mathematical equations to derive solutions. Existing works focus on solving +MWPs with two types of solvers: tree-based solver and large language model +(LLM) solver. However, these approaches always solve MWPs by a single solver, +which will bring the following problems: (1) Single type of solver is hard to +solve all types of MWPs well. (2) A single solver will result in poor +performance due to over-fitting. To address these challenges, this paper +utilizes multiple ensemble approaches to improve MWP-solving ability. Firstly, +We propose a problem type classifier that combines the strengths of the +tree-based solver and the LLM solver. This ensemble approach leverages their +respective advantages and broadens the range of MWPs that can be solved. +Furthermore, we also apply ensemble techniques to both tree-based solver and +LLM solver to improve their performance. For the tree-based solver, we propose +an ensemble learning framework based on ten-fold cross-validation and voting +mechanism. In the LLM solver, we adopt self-consistency (SC) method to improve +answer selection. Experimental results demonstrate the effectiveness of these +ensemble approaches in enhancing MWP-solving ability. The comprehensive +evaluation showcases improved performance, validating the advantages of our +proposed approach. Our code is available at this url: +https://github.com/zhouzihao501/NLPCC2023-Shared-Task3-ChineseMWP. + +
+
+ comment: Accpected by NLPCC2023 +
+
+
+
+
+ + ☆ Planning with Logical Graph-based Language Model for Instruction + Generation + + +
+ Despite the superior performance of large language models to generate natural +language texts, it is hard to generate texts with correct logic according to a +given task, due to the difficulties for neural models to capture implied rules +from free-form texts. In this paper, we propose a novel graph-based language +model, Logical-GLM, to infuse logic into language models for more valid text +generation and interpretability. Specifically, we first capture information +from natural language instructions and construct logical bayes graphs that +generally describe domains. Next, we generate logical skeletons to guide +language model training, infusing domain knowledge into language models. +Finally, we alternately optimize the searching policy of graphs and language +models until convergence. The experimental results show that Logical-GLM is +both effective and efficient compared with traditional language models, despite +using smaller-scale training data and fewer parameters. Our approach can +generate instructional texts with more correct logic owing to the internalized +domain knowledge. Moreover, the usage of logical graphs reflects the inner +mechanism of the language models, which improves the interpretability of +black-box models. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ EditSum: A Retrieve-and-Edit Framework for Source Code Summarization + + +
+ Existing studies show that code summaries help developers understand and +maintain source code. Unfortunately, these summaries are often missing or +outdated in software projects. Code summarization aims to generate natural +language descriptions automatically for source code. Code summaries are highly +structured and have repetitive patterns. Besides the patternized words, a code +summary also contains important keywords, which are the key to reflecting the +functionality of the code. However, the state-of-the-art approaches perform +poorly on predicting the keywords, which leads to the generated summaries +suffering a loss in informativeness. To alleviate this problem, this paper +proposes a novel retrieve-and-edit approach named EditSum for code +summarization. Specifically, EditSum first retrieves a similar code snippet +from a pre-defined corpus and treats its summary as a prototype summary to +learn the pattern. Then, EditSum edits the prototype automatically to combine +the pattern in the prototype with the semantic information of input code. Our +motivation is that the retrieved prototype provides a good start-point for +post-generation because the summaries of similar code snippets often have the +same pattern. The post-editing process further reuses the patternized words in +the prototype and generates keywords based on the semantic information of input +code. We conduct experiments on a large-scale Java corpus and experimental +results demonstrate that EditSum outperforms the state-of-the-art approaches by +a substantial margin. The human evaluation also proves the summaries generated +by EditSum are more informative and useful. We also verify that EditSum +performs well on predicting the patternized words and keywords. + +
+
+ comment: Accepted by the 36th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2021) +
+
+
+
+
+ + ☆ Adversarial Fine-Tuning of Language Models: An Iterative Optimisation + Approach for the Generation and Detection of Problematic Content + + +
+ In this paper, we tackle the emerging challenge of unintended harmful content +generation in Large Language Models (LLMs) with a novel dual-stage optimisation +technique using adversarial fine-tuning. Our two-pronged approach employs an +adversarial model, fine-tuned to generate potentially harmful prompts, and a +judge model, iteratively optimised to discern these prompts. In this +adversarial cycle, the two models seek to outperform each other in the +prompting phase, generating a dataset of rich examples which are then used for +fine-tuning. This iterative application of prompting and fine-tuning allows +continuous refinement and improved performance. The performance of our approach +is evaluated through classification accuracy on a dataset consisting of +problematic prompts not detected by GPT-4, as well as a selection of +contentious but unproblematic prompts. We show considerable increase in +classification accuracy of the judge model on this challenging dataset as it +undergoes the optimisation process. Furthermore, we show that a rudimentary +model \texttt{ada} can achieve 13\% higher accuracy on the hold-out test set +than GPT-4 after only a few rounds of this process, and that this fine-tuning +improves performance in parallel tasks such as toxic comment identification. + +
+
+
+
+
+ + ☆ How Can Context Help? Exploring Joint Retrieval of Passage and + Personalized Context + + +
+ The integration of external personalized context information into +document-grounded conversational systems has significant potential business +value, but has not been well-studied. Motivated by the concept of personalized +context-aware document-grounded conversational systems, we introduce the task +of context-aware passage retrieval. We also construct a dataset specifically +curated for this purpose. We describe multiple baseline systems to address this +task, and propose a novel approach, Personalized Context-Aware Search (PCAS), +that effectively harnesses contextual information during passage retrieval. +Experimental evaluations conducted on multiple popular dense retrieval systems +demonstrate that our proposed approach not only outperforms the baselines in +retrieving the most relevant passage but also excels at identifying the +pertinent context among all the available contexts. We envision that our +contributions will serve as a catalyst for inspiring future research endeavors +in this promising direction. + +
+
+
+
+
+ + ☆ ZC3: Zero-Shot Cross-Language Code Clone Detection + + +
+ Developers introduce code clones to improve programming productivity. Many +existing studies have achieved impressive performance in monolingual code clone +detection. However, during software development, more and more developers write +semantically equivalent programs with different languages to support different +platforms and help developers translate projects from one language to another. +Considering that collecting cross-language parallel data, especially for +low-resource languages, is expensive and time-consuming, how designing an +effective cross-language model that does not rely on any parallel data is a +significant problem. In this paper, we propose a novel method named ZC3 for +Zero-shot Cross-language Code Clone detection. ZC3 designs the contrastive +snippet prediction to form an isomorphic representation space among different +programming languages. Based on this, ZC3 exploits domain-aware learning and +cycle consistency learning to further constrain the model to generate +representations that are aligned among different languages meanwhile are +diacritical for different types of clones. To evaluate our approach, we conduct +extensive experiments on four representative cross-language clone detection +datasets. Experimental results show that ZC3 outperforms the state-of-the-art +baselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP score, respectively. +We further investigate the representational distribution of different languages +and discuss the effectiveness of our method. + +
+
+ comment: Accepted by the 38th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2023) +
+
+
+
+
+ + ☆ On Philomatics and Psychomatics for Combining Philosophy and Psychology + with Mathematics + + +
+ We propose the concepts of philomatics and psychomatics as hybrid +combinations of philosophy and psychology with mathematics. We explain four +motivations for this combination which are fulfilling the desire of analytical +philosophy, proposing science of philosophy, justifying mathematical algorithms +by philosophy, and abstraction in both philosophy and mathematics. We enumerate +various examples for philomatics and psychomatics, some of which are explained +in more depth. The first example is the analysis of relation between the +context principle, semantic holism, and the usage theory of meaning with the +attention mechanism in mathematics. The other example is on the relations of +Plato's theory of forms in philosophy with the holographic principle in string +theory, object-oriented programming, and machine learning. Finally, the +relation between Wittgenstein's family resemblance and clustering in +mathematics is explained. This paper opens the door of research for combining +philosophy and psychology with mathematics. + +
+
+
+
+
+ + ☆ A Computational Evaluation Framework for Singable Lyric Translation + + +
+ Lyric translation plays a pivotal role in amplifying the global resonance of +music, bridging cultural divides, and fostering universal connections. +Translating lyrics, unlike conventional translation tasks, requires a delicate +balance between singability and semantics. In this paper, we present a +computational framework for the quantitative evaluation of singable lyric +translation, which seamlessly integrates musical, linguistic, and cultural +dimensions of lyrics. Our comprehensive framework consists of four metrics that +measure syllable count distance, phoneme repetition similarity, musical +structure distance, and semantic similarity. To substantiate the efficacy of +our framework, we collected a singable lyrics dataset, which precisely aligns +English, Japanese, and Korean lyrics on a line-by-line and section-by-section +basis, and conducted a comparative analysis between singable and non-singable +lyrics. Our multidisciplinary approach provides insights into the key +components that underlie the art of lyric translation and establishes a solid +groundwork for the future of computational lyric translation assessment. + +
+
+ comment: ISMIR 2023 +
+
+
+
+
+ + ♻ ☆ External Reasoning: Towards Multi-Large-Language-Models Interchangeable + Assistance with Human Feedback + + +
+ Memory is identified as a crucial human faculty that allows for the retention +of visual and linguistic information within the hippocampus and neurons in the +brain, which can subsequently be retrieved to address real-world challenges +that arise through a lifetime of learning. The resolution of complex AI tasks +through the application of acquired knowledge represents a stride toward the +realization of artificial general intelligence. However, despite the prevalence +of Large Language Models (LLMs) like GPT-3.5 and GPT-4 \cite{brown2020language, +leiter2023chatgpt, zaitsu2023distinguishing, OpenAI2023GPT4TR} , which have +displayed remarkable capabilities in language comprehension, generation, +interaction, and reasoning, they are inhibited by constraints on context length +that preclude the processing of extensive, continually evolving knowledge +bases. This paper proposes that LLMs could be augmented through the selective +integration of knowledge from external repositories, and in doing so, +introduces a novel methodology for External Reasoning, exemplified by ChatPDF. +Central to this approach is the establishment of a tiered policy for +\textbf{External Reasoning based on Multiple LLM Interchange Assistance} in +\cref{fig:overall}, where the level of support rendered is modulated across +entry, intermediate, and advanced tiers based on the complexity of the query, +with adjustments made in response to human feedback. A comprehensive evaluation +of this methodology is conducted using multiple LLMs and the results indicate +state-of-the-art performance in \cref{comparison} , surpassing existing +solutions including ChatPDF.com. Moreover, the paper emphasizes that this +approach is more efficient compared to the direct processing of full text by +LLMs. The source code is publicly available at: +\url{https://github.com/AkideLiu/ANLP}. + +
+
+ comment: technical report, add code link. arXiv admin note: text overlap with + arXiv:2305.11206 by other authors +
+
+
+
+
+ + ♻ ☆ Exploring Linguistic Style Matching in Online Communities: The Role of + Social Context and Conversation Dynamics + + +
+ Linguistic style matching (LSM) in conversations can be reflective of several +aspects of social influence such as power or persuasion. However, how LSM +relates to the outcomes of online communication on platforms such as Reddit is +an unknown question. In this study, we analyze a large corpus of two-party +conversation threads in Reddit where we identify all occurrences of LSM using +two types of style: the use of function words and formality. Using this +framework, we examine how levels of LSM differ in conversations depending on +several social factors within Reddit: post and subreddit features, conversation +depth, user tenure, and the controversiality of a comment. Finally, we measure +the change of LSM following loss of status after community banning. Our +findings reveal the interplay of LSM in Reddit conversations with several +community metrics, suggesting the importance of understanding conversation +engagement when understanding community dynamics. + +
+
+ comment: Equal contributions from authors 1-9 (AA, HC, JY, KA, JP, AS, LD, MC, + BL) +
+
+
+
+
+ + ♻ ☆ Emoji Prediction in Tweets using BERT + + +
+ In recent years, the use of emojis in social media has increased +dramatically, making them an important element in understanding online +communication. However, predicting the meaning of emojis in a given text is a +challenging task due to their ambiguous nature. In this study, we propose a +transformer-based approach for emoji prediction using BERT, a widely-used +pre-trained language model. We fine-tuned BERT on a large corpus of text +(tweets) containing both text and emojis to predict the most appropriate emoji +for a given text. Our experimental results demonstrate that our approach +outperforms several state-of-the-art models in predicting emojis with an +accuracy of over 75 percent. This work has potential applications in natural +language processing, sentiment analysis, and social media marketing. + +
+
+ comment: This paper is focused on predicting emojis corresponding to tweets + using BERT +
+
+
+
+
+ + ♻ ☆ AspectCSE: Sentence Embeddings for Aspect-based Semantic Textual + Similarity Using Contrastive Learning and Structured Knowledge + + +
+ Generic sentence embeddings provide a coarse-grained approximation of +semantic textual similarity but ignore specific aspects that make texts +similar. Conversely, aspect-based sentence embeddings provide similarities +between texts based on certain predefined aspects. Thus, similarity predictions +of texts are more targeted to specific requirements and more easily +explainable. In this paper, we present AspectCSE, an approach for aspect-based +contrastive learning of sentence embeddings. Results indicate that AspectCSE +achieves an average improvement of 3.97% on information retrieval tasks across +multiple aspects compared to the previous best results. We also propose using +Wikidata knowledge graph properties to train models of multi-aspect sentence +embeddings in which multiple specific aspects are simultaneously considered +during similarity predictions. We demonstrate that multi-aspect embeddings +outperform single-aspect embeddings on aspect-specific information retrieval +tasks. Finally, we examine the aspect-based sentence embedding space and +demonstrate that embeddings of semantically similar aspect labels are often +close, even without explicit similarity training between different aspect +labels. + +
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ Multi-View Reasoning: Consistent Contrastive Learning for Math Word + Problem + + +
+ Math word problem solver requires both precise relation reasoning about +quantities in the text and reliable generation for the diverse equation. +Current sequence-to-tree or relation extraction methods regard this only from a +fixed view, struggling to simultaneously handle complex semantics and diverse +equations. However, human solving naturally involves two consistent reasoning +views: top-down and bottom-up, just as math equations also can be expressed in +multiple equivalent forms: pre-order and post-order. We propose a multi-view +consistent contrastive learning for a more complete semantics-to-equation +mapping. The entire process is decoupled into two independent but consistent +views: top-down decomposition and bottom-up construction, and the two reasoning +views are aligned in multi-granularity for consistency, enhancing global +generation and precise reasoning. Experiments on multiple datasets across two +languages show our approach significantly outperforms the existing baselines, +especially on complex problems. We also show after consistent alignment, +multi-view can absorb the merits of both views and generate more diverse +results consistent with the mathematical laws. + +
+
+ comment: 14 pages, 5 figures, 3 appendix figures +
+
+
+
+
+ + ♻ ☆ PGTask: Introducing the Task of Profile Generation from Dialogues SIGDIAL 2023 + + +
+ Recent approaches have attempted to personalize dialogue systems by +leveraging profile information into models. However, this knowledge is scarce +and difficult to obtain, which makes the extraction/generation of profile +information from dialogues a fundamental asset. To surpass this limitation, we +introduce the Profile Generation Task (PGTask). We contribute with a new +dataset for this problem, comprising profile sentences aligned with related +utterances, extracted from a corpus of dialogues. Furthermore, using +state-of-the-art methods, we provide a benchmark for profile generation on this +novel dataset. Our experiments disclose the challenges of profile generation, +and we hope that this introduces a new research direction. + +
+
+ comment: Accepted at SIGDIAL 2023, 4 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Domain Specialization as the Key to Make Large Language Models + Disruptive: A Comprehensive Survey + + +
+ Large language models (LLMs) have significantly advanced the field of natural +language processing (NLP), providing a highly useful, task-agnostic foundation +for a wide range of applications. However, directly applying LLMs to solve +sophisticated problems in specific domains meets many hurdles, caused by the +heterogeneity of domain data, the sophistication of domain knowledge, the +uniqueness of domain objectives, and the diversity of the constraints (e.g., +various social norms, cultural conformity, religious beliefs, and ethical +standards in the domain applications). Domain specification techniques are key +to make large language models disruptive in many applications. Specifically, to +solve these hurdles, there has been a notable increase in research and +practices conducted in recent years on the domain specialization of LLMs. This +emerging field of study, with its substantial potential for impact, +necessitates a comprehensive and systematic review to better summarize and +guide ongoing work in this area. In this article, we present a comprehensive +survey on domain specification techniques for large language models, an +emerging direction critical for large language model applications. First, we +propose a systematic taxonomy that categorizes the LLM domain-specialization +techniques based on the accessibility to LLMs and summarizes the framework for +all the subcategories as well as their relations and differences to each other. +Second, we present an extensive taxonomy of critical application domains that +can benefit dramatically from specialized LLMs, discussing their practical +significance and open challenges. Last, we offer our insights into the current +research status and future trends in this area. + +
+
+
+
+
+ + ♻ ☆ A Survey on Knowledge Graphs for Healthcare: Resources, Applications, + and Promises + + +
+ Healthcare knowledge graphs (HKGs) have emerged as a promising tool for +organizing medical knowledge in a structured and interpretable way, which +provides a comprehensive view of medical concepts and their relationships. +However, challenges such as data heterogeneity and limited coverage remain, +emphasizing the need for further research in the field of HKGs. This survey +paper serves as the first comprehensive overview of HKGs. We summarize the +pipeline and key techniques for HKG construction (i.e., from scratch and +through integration), as well as the common utilization approaches (i.e., +model-free and model-based). To provide researchers with valuable resources, we +organize existing HKGs (The resource is available at +https://github.com/lujiaying/Awesome-HealthCare-KnowledgeBase) based on the +data types they capture and application domains, supplemented with pertinent +statistical information. In the application section, we delve into the +transformative impact of HKGs across various healthcare domains, spanning from +fine-grained basic science research to high-level clinical decision support. +Lastly, we shed light on the opportunities for creating comprehensive and +accurate HKGs in the era of large language models, presenting the potential to +revolutionize healthcare delivery and enhance the interpretability and +reliability of clinical prediction. + +
+
+
+
+
+ + ♻ ☆ Language Model Behavior: A Comprehensive Survey + + +
+ Transformer language models have received widespread public attention, yet +their generated text is often surprising even to NLP researchers. In this +survey, we discuss over 250 recent studies of English language model behavior +before task-specific fine-tuning. Language models possess basic capabilities in +syntax, semantics, pragmatics, world knowledge, and reasoning, but these +capabilities are sensitive to specific inputs and surface features. Despite +dramatic increases in generated text quality as models scale to hundreds of +billions of parameters, the models are still prone to unfactual responses, +commonsense errors, memorized text, and social biases. Many of these weaknesses +can be framed as over-generalizations or under-generalizations of learned +patterns in text. We synthesize recent results to highlight what is currently +known about large language model capabilities, thus providing a resource for +applied work and for research in adjacent fields that use language models. + +
+
+ comment: 32 pages, accepted to Computational Linguistics +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ Video and Audio are Images: A Cross-Modal Mixer for Original Data on + Video-Audio Retrieval + + +
+ Cross-modal retrieval has become popular in recent years, particularly with +the rise of multimedia. Generally, the information from each modality exhibits +distinct representations and semantic information, which makes feature tends to +be in separate latent spaces encoded with dual-tower architecture and makes it +difficult to establish semantic relationships between modalities, resulting in +poor retrieval performance. To address this issue, we propose a novel framework +for cross-modal retrieval which consists of a cross-modal mixer, a masked +autoencoder for pre-training, and a cross-modal retriever for downstream +tasks.In specific, we first adopt cross-modal mixer and mask modeling to fuse +the original modality and eliminate redundancy. Then, an encoder-decoder +architecture is applied to achieve a fuse-then-separate task in the +pre-training phase.We feed masked fused representations into the encoder and +reconstruct them with the decoder, ultimately separating the original data of +two modalities. In downstream tasks, we use the pre-trained encoder to build +the cross-modal retrieval method. Extensive experiments on 2 real-world +datasets show that our approach outperforms previous state-of-the-art methods +in video-audio matching tasks, improving retrieval accuracy by up to 2 times. +Furthermore, we prove our model performance by transferring it to other +downstream tasks as a universal model. + +
+
+
+
+
+ + ☆ Central Similarity Multi-View Hashing for Multimedia Retrieval APWeb + + +
+ Hash representation learning of multi-view heterogeneous data is the key to +improving the accuracy of multimedia retrieval. However, existing methods +utilize local similarity and fall short of deeply fusing the multi-view +features, resulting in poor retrieval accuracy. Current methods only use local +similarity to train their model. These methods ignore global similarity. +Furthermore, most recent works fuse the multi-view features via a weighted sum +or concatenation. We contend that these fusion methods are insufficient for +capturing the interaction between various views. We present a novel Central +Similarity Multi-View Hashing (CSMVH) method to address the mentioned problems. +Central similarity learning is used for solving the local similarity problem, +which can utilize the global similarity between the hash center and samples. We +present copious empirical data demonstrating the superiority of gate-based +fusion over conventional approaches. On the MS COCO and NUS-WIDE, the proposed +CSMVH performs better than the state-of-the-art methods by a large margin (up +to 11.41% mean Average Precision (mAP) improvement). + +
+
+ comment: accepted by the Asia Pacific Web (APWeb) and Web-Age Information + Management (WAIM) Joint International Conference on Web and Big Data + (APWeb-WAIM2023) +
+
+
+
+
+ + ☆ How Can Context Help? Exploring Joint Retrieval of Passage and + Personalized Context + + +
+ The integration of external personalized context information into +document-grounded conversational systems has significant potential business +value, but has not been well-studied. Motivated by the concept of personalized +context-aware document-grounded conversational systems, we introduce the task +of context-aware passage retrieval. We also construct a dataset specifically +curated for this purpose. We describe multiple baseline systems to address this +task, and propose a novel approach, Personalized Context-Aware Search (PCAS), +that effectively harnesses contextual information during passage retrieval. +Experimental evaluations conducted on multiple popular dense retrieval systems +demonstrate that our proposed approach not only outperforms the baselines in +retrieving the most relevant passage but also excels at identifying the +pertinent context among all the available contexts. We envision that our +contributions will serve as a catalyst for inspiring future research endeavors +in this promising direction. + +
+
+
+
+
+ + ☆ ZC3: Zero-Shot Cross-Language Code Clone Detection + + +
+ Developers introduce code clones to improve programming productivity. Many +existing studies have achieved impressive performance in monolingual code clone +detection. However, during software development, more and more developers write +semantically equivalent programs with different languages to support different +platforms and help developers translate projects from one language to another. +Considering that collecting cross-language parallel data, especially for +low-resource languages, is expensive and time-consuming, how designing an +effective cross-language model that does not rely on any parallel data is a +significant problem. In this paper, we propose a novel method named ZC3 for +Zero-shot Cross-language Code Clone detection. ZC3 designs the contrastive +snippet prediction to form an isomorphic representation space among different +programming languages. Based on this, ZC3 exploits domain-aware learning and +cycle consistency learning to further constrain the model to generate +representations that are aligned among different languages meanwhile are +diacritical for different types of clones. To evaluate our approach, we conduct +extensive experiments on four representative cross-language clone detection +datasets. Experimental results show that ZC3 outperforms the state-of-the-art +baselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP score, respectively. +We further investigate the representational distribution of different languages +and discuss the effectiveness of our method. + +
+
+ comment: Accepted by the 38th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2023) +
+
+
+
+
+ + ♻ ☆ MUSE: Music Recommender System with Shuffle Play Recommendation + Enhancement CIKM 2023 + + +
+ Recommender systems have become indispensable in music streaming services, +enhancing user experiences by personalizing playlists and facilitating the +serendipitous discovery of new music. However, the existing recommender systems +overlook the unique challenges inherent in the music domain, specifically +shuffle play, which provides subsequent tracks in a random sequence. Based on +our observation that the shuffle play sessions hinder the overall training +process of music recommender systems mainly due to the high unique transition +rates of shuffle play sessions, we propose a Music Recommender System with +Shuffle Play Recommendation Enhancement (MUSE). MUSE employs the +self-supervised learning framework that maximizes the agreement between the +original session and the augmented session, which is augmented by our novel +session augmentation method, called transition-based augmentation. To further +facilitate the alignment of the representations between the two views, we +devise two fine-grained matching strategies, i.e., item- and similarity-based +matching strategies. Through rigorous experiments conducted across diverse +environments, we demonstrate MUSE's efficacy over 12 baseline models on a +large-scale Music Streaming Sessions Dataset (MSSD) from Spotify. The source +code of MUSE is available at \url{https://github.com/yunhak0/MUSE}. + +
+
+ comment: CIKM 2023 +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ The DiffuseStyleGesture+ entry to the GENEA Challenge 2023 + + +
+ In this paper, we introduce the DiffuseStyleGesture+, our solution for the +Generation and Evaluation of Non-verbal Behavior for Embodied Agents (GENEA) +Challenge 2023, which aims to foster the development of realistic, automated +systems for generating conversational gestures. Participants are provided with +a pre-processed dataset and their systems are evaluated through crowdsourced +scoring. Our proposed model, DiffuseStyleGesture+, leverages a diffusion model +to generate gestures automatically. It incorporates a variety of modalities, +including audio, text, speaker ID, and seed gestures. These diverse modalities +are mapped to a hidden space and processed by a modified diffusion model to +produce the corresponding gesture for a given speech input. Upon evaluation, +the DiffuseStyleGesture+ demonstrated performance on par with the top-tier +models in the challenge, showing no significant differences with those models +in human-likeness, appropriateness for the interlocutor, and achieving +competitive performance with the best model on appropriateness for agent +speech. This indicates that our model is competitive and effective in +generating realistic and appropriate gestures for given speech. The code, +pre-trained models, and demos are available at +https://github.com/YoungSeng/DiffuseStyleGesture/tree/DiffuseStyleGesturePlus/BEAT-TWH-main. + +
+
+ comment: 7 pages, 8 figures, ICMI 2023 +
+
+
+
+
+ + ☆ Reinforcement Learning Based Multi-modal Feature Fusion Network for + Novel Class Discovery + + +
+ With the development of deep learning techniques, supervised learning has +achieved performances surpassing those of humans. Researchers have designed +numerous corresponding models for different data modalities, achieving +excellent results in supervised tasks. However, with the exponential increase +of data in multiple fields, the recognition and classification of unlabeled +data have gradually become a hot topic. In this paper, we employed a +Reinforcement Learning framework to simulate the cognitive processes of humans +for effectively addressing novel class discovery in the Open-set domain. We +deployed a Member-to-Leader Multi-Agent framework to extract and fuse features +from multi-modal information, aiming to acquire a more comprehensive +understanding of the feature space. Furthermore, this approach facilitated the +incorporation of self-supervised learning to enhance model training. We +employed a clustering method with varying constraint conditions, ranging from +strict to loose, allowing for the generation of dependable labels for a subset +of unlabeled data during the training phase. This iterative process is similar +to human exploratory learning of unknown data. These mechanisms collectively +update the network parameters based on rewards received from environmental +feedback. This process enables effective control over the extent of exploration +learning, ensuring the accuracy of learning in unknown data categories. We +demonstrate the performance of our approach in both the 3D and 2D domains by +employing the OS-MN40, OS-MN40-Miss, and Cifar10 datasets. Our approach +achieves competitive competitive results. + +
+
+
+
+
+ + ☆ Central Similarity Multi-View Hashing for Multimedia Retrieval APWeb + + +
+ Hash representation learning of multi-view heterogeneous data is the key to +improving the accuracy of multimedia retrieval. However, existing methods +utilize local similarity and fall short of deeply fusing the multi-view +features, resulting in poor retrieval accuracy. Current methods only use local +similarity to train their model. These methods ignore global similarity. +Furthermore, most recent works fuse the multi-view features via a weighted sum +or concatenation. We contend that these fusion methods are insufficient for +capturing the interaction between various views. We present a novel Central +Similarity Multi-View Hashing (CSMVH) method to address the mentioned problems. +Central similarity learning is used for solving the local similarity problem, +which can utilize the global similarity between the hash center and samples. We +present copious empirical data demonstrating the superiority of gate-based +fusion over conventional approaches. On the MS COCO and NUS-WIDE, the proposed +CSMVH performs better than the state-of-the-art methods by a large margin (up +to 11.41% mean Average Precision (mAP) improvement). + +
+
+ comment: accepted by the Asia Pacific Web (APWeb) and Web-Age Information + Management (WAIM) Joint International Conference on Web and Big Data + (APWeb-WAIM2023) +
+
+
+
+
+ + ♻ ☆ Towards Top-Down Stereoscopic Image Quality Assessment via Stereo + Attention + + +
+ Stereoscopic image quality assessment (SIQA) plays a crucial role in +evaluating and improving the visual experience of 3D content. Existing +binocular properties and attention-based methods for SIQA have achieved +promising performance. However, these bottom-up approaches are inadequate in +exploiting the inherent characteristics of the human visual system (HVS). This +paper presents a novel network for SIQA via stereo attention, employing a +top-down perspective to guide the quality assessment process. Our proposed +method realizes the guidance from high-level binocular signals down to +low-level monocular signals, while the binocular and monocular information can +be calibrated progressively throughout the processing pipeline. We design a +generalized Stereo AttenTion (SAT) block to implement the top-down philosophy +in stereo perception. This block utilizes the fusion-generated attention map as +a high-level binocular modulator, influencing the representation of two +low-level monocular features. Additionally, we introduce an Energy Coefficient +(EC) to account for recent findings indicating that binocular responses in the +primate primary visual cortex are less than the sum of monocular responses. The +adaptive EC can tune the magnitude of binocular response flexibly, thus +enhancing the formation of robust binocular features within our framework. To +extract the most discriminative quality information from the summation and +subtraction of the two branches of monocular features, we utilize a +dual-pooling strategy that applies min-pooling and max-pooling operations to +the respective branches. Experimental results highlight the superiority of our +top-down method in simulating the property of visual perception and advancing +the state-of-the-art in the SIQA field. The code of this work is available at +https://github.com/Fanning-Zhang/SATNet. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 56 + +
+
+
+ + ☆ ChatGPT as Data Augmentation for Compositional Generalization: A Case + Study in Open Intent Detection + + +
+ Open intent detection, a crucial aspect of natural language understanding, +involves the identification of previously unseen intents in user-generated +text. Despite the progress made in this field, challenges persist in handling +new combinations of language components, which is essential for compositional +generalization. In this paper, we present a case study exploring the use of +ChatGPT as a data augmentation technique to enhance compositional +generalization in open intent detection tasks. We begin by discussing the +limitations of existing benchmarks in evaluating this problem, highlighting the +need for constructing datasets for addressing compositional generalization in +open intent detection tasks. By incorporating synthetic data generated by +ChatGPT into the training process, we demonstrate that our approach can +effectively improve model performance. Rigorous evaluation of multiple +benchmarks reveals that our method outperforms existing techniques and +significantly enhances open intent detection capabilities. Our findings +underscore the potential of large language models like ChatGPT for data +augmentation in natural language understanding tasks. + +
+
+
+
+
+ + ☆ Training and Meta-Evaluating Machine Translation Evaluation Metrics at + the Paragraph Level + + +
+ As research on machine translation moves to translating text beyond the +sentence level, it remains unclear how effective automatic evaluation metrics +are at scoring longer translations. In this work, we first propose a method for +creating paragraph-level data for training and meta-evaluating metrics from +existing sentence-level data. Then, we use these new datasets to benchmark +existing sentence-level metrics as well as train learned metrics at the +paragraph level. Interestingly, our experimental results demonstrate that using +sentence-level metrics to score entire paragraphs is equally as effective as +using a metric designed to work at the paragraph level. We speculate this +result can be attributed to properties of the task of reference-based +evaluation as well as limitations of our datasets with respect to capturing all +types of phenomena that occur in paragraph-level translations. + +
+
+
+
+
+ + ☆ Ngambay-French Neural Machine Translation (sba-Fr) + + +
+ In Africa, and the world at large, there is an increasing focus on developing +Neural Machine Translation (NMT) systems to overcome language barriers. NMT for +Low-resource language is particularly compelling as it involves learning with +limited labelled data. However, obtaining a well-aligned parallel corpus for +low-resource languages can be challenging. The disparity between the +technological advancement of a few global languages and the lack of research on +NMT for local languages in Chad is striking. End-to-end NMT trials on +low-resource Chad languages have not been attempted. Additionally, there is a +dearth of online and well-structured data gathering for research in Natural +Language Processing, unlike some African languages. However, a guided approach +for data gathering can produce bitext data for many Chadian language +translation pairs with well-known languages that have ample data. In this +project, we created the first sba-Fr Dataset, which is a corpus of +Ngambay-to-French translations, and fine-tuned three pre-trained models using +this dataset. Our experiments show that the M2M100 model outperforms other +models with high BLEU scores on both original and original+synthetic data. The +publicly available bitext dataset can be used for research purposes. + +
+
+ comment: Accepted at RANLP 2023 - International Workshop NLP tools and + resources for translation and interpreting applications +
+
+
+
+
+ + ☆ Prompting a Large Language Model to Generate Diverse Motivational + Messages: A Comparison with Human-Written Messages + + +
+ Large language models (LLMs) are increasingly capable and prevalent, and can +be used to produce creative content. The quality of content is influenced by +the prompt used, with more specific prompts that incorporate examples generally +producing better results. On from this, it could be seen that using +instructions written for crowdsourcing tasks (that are specific and include +examples to guide workers) could prove effective LLM prompts. To explore this, +we used a previous crowdsourcing pipeline that gave examples to people to help +them generate a collectively diverse corpus of motivational messages. We then +used this same pipeline to generate messages using GPT-4, and compared the +collective diversity of messages from: (1) crowd-writers, (2) GPT-4 using the +pipeline, and (3 & 4) two baseline GPT-4 prompts. We found that the LLM prompts +using the crowdsourcing pipeline caused GPT-4 to produce more diverse messages +than the two baseline prompts. We also discuss implications from messages +generated by both human writers and LLMs. + +
+
+ comment: 3 pages, 1 figure, 1 table, to be published in Proceedings of the + 11th International Conference on Human-Agent Interaction (ACM HAI'23) +
+
+
+
+
+ + ☆ Leveraging Knowledge and Reinforcement Learning for Enhanced Reliability + of Language Models CIKM'23 + + +
+ The Natural Language Processing(NLP) community has been using crowd sourcing +techniques to create benchmark datasets such as General Language Understanding +and Evaluation(GLUE) for training modern Language Models such as BERT. GLUE +tasks measure the reliability scores using inter annotator metrics i.e. Cohens +Kappa. However, the reliability aspect of LMs has often been overlooked. To +counter this problem, we explore a knowledge-guided LM ensembling approach that +leverages reinforcement learning to integrate knowledge from ConceptNet and +Wikipedia as knowledge graph embeddings. This approach mimics human annotators +resorting to external knowledge to compensate for information deficits in the +datasets. Across nine GLUE datasets, our research shows that ensembling +strengthens reliability and accuracy scores, outperforming state of the art. + +
+
+ comment: Accepted at CIKM'23 +
+
+
+
+
+ + ☆ ARTIST: ARTificial Intelligence for Simplified Text + + +
+ Complex text is a major barrier for many citizens when accessing public +information and knowledge. While often done manually, Text Simplification is a +key Natural Language Processing task that aims for reducing the linguistic +complexity of a text while preserving the original meaning. Recent advances in +Generative Artificial Intelligence (AI) have enabled automatic text +simplification both on the lexical and syntactical levels. However, as +applications often focus on English, little is understood about the +effectiveness of Generative AI techniques on low-resource languages such as +Dutch. For this reason, we carry out empirical studies to understand the +benefits and limitations of applying generative technologies for text +simplification and provide the following outcomes: 1) the design and +implementation for a configurable text simplification pipeline that +orchestrates state-of-the-art generative text simplification models, domain and +reader adaptation, and visualisation modules; 2) insights and lessons learned, +showing the strengths of automatic text simplification while exposing the +challenges in handling cultural and commonsense knowledge. These outcomes +represent a first step in the exploration of Dutch text simplification and shed +light on future endeavours both for research and practice. + +
+
+ comment: 6 pages, 1 figure. Presented at the 'Generative AI and HCI' workshop + (https://generativeaiandhci.github.io/) at CHI 2023 in Hamburg, Germany +
+
+
+
+
+ + ☆ The Poison of Alignment + + +
+ From the perspective of content safety issues, alignment has shown to limit +large language models' (LLMs) harmful content generation. This intentional +method of reinforcing models to not respond to certain user inputs seem to be +present in many modern open-source instruction tuning datasets such as +OpenAssistant or Guanaco. We introduce a novel insight to an instruction-tuned +model's performance affected by the presence of alignment in supervised +fine-tuning dataset. To be specific, we noticed that alignment acts as if it is +poisoning the instruction dataset. Experimentally, we demonstrate that aligned +answers significantly worsen the performance of the resulting fine-tuned +model's on various reasoning benchmarks such as Big Bench (BBH), Massive +Multitask Language Understanding (MMLU), Human Eval, and Discrete Reasoning +Over Paragraphs (DROP), performing worse than the counterpart tuned without +alignment by 4-33%. + +
+
+
+
+
+ + ☆ EntropyRank: Unsupervised Keyphrase Extraction via Side-Information + Optimization for Language Model-based Text Compression + + +
+ We propose an unsupervised method to extract keywords and keyphrases from +texts based on a pre-trained language model (LM) and Shannon's information +maximization. Specifically, our method extracts phrases having the highest +conditional entropy under the LM. The resulting set of keyphrases turns out to +solve a relevant information-theoretic problem: if provided as side +information, it leads to the expected minimal binary code length in compressing +the text using the LM and an entropy encoder. Alternately, the resulting set is +an approximation via a causal LM to the set of phrases that minimize the +entropy of the text when conditioned upon it. Empirically, the method provides +results comparable to the most commonly used methods in various keyphrase +extraction benchmark challenges. + +
+
+
+
+
+ + ☆ Do-Not-Answer: A Dataset for Evaluating Safeguards in LLMs + + +
+ With the rapid evolution of large language models (LLMs), new and +hard-to-predict harmful capabilities are emerging. This requires developers to +be able to identify risks through the evaluation of "dangerous capabilities" in +order to responsibly deploy LLMs. In this work, we collect the first +open-source dataset to evaluate safeguards in LLMs, and deploy safer +open-source LLMs at a low cost. Our dataset is curated and filtered to consist +only of instructions that responsible language models should not follow. We +annotate and assess the responses of six popular LLMs to these instructions. +Based on our annotation, we proceed to train several BERT-like classifiers, and +find that these small classifiers can achieve results that are comparable with +GPT-4 on automatic safety evaluation. Warning: this paper contains example data +that may be offensive, harmful, or biased. + +
+
+ comment: 18 pages, 9 figures, 11 tables +
+
+
+
+
+ + ☆ Assessing Keyness using Permutation Tests + + +
+ We propose a resampling-based approach for assessing keyness in corpus +linguistics based on suggestions by Gries (2006, 2022). Traditional approaches +based on hypothesis tests (e.g. Likelihood Ratio) model the copora as +independent identically distributed samples of tokens. This model does not +account for the often observed uneven distribution of occurences of a word +across a corpus. When occurences of a word are concentrated in few documents, +large values of LLR and similar scores are in fact much more likely than +accounted for by the token-by-token sampling model, leading to false positives. + We replace the token-by-token sampling model by a model where corpora are +samples of documents rather than tokens, which is much closer to the way +corpora are actually assembled. We then use a permutation approach to +approximate the distribution of a given keyness score under the null hypothesis +of equal frequencies and obtain p-values for assessing significance. We do not +need any assumption on how the tokens are organized within or across documents, +and the approach works with basically *any* keyness score. Hence, appart from +obtaining more accurate p-values for scores like LLR, we can also assess +significance for e.g. the logratio which has been proposed as a measure of +effect size. + An efficient implementation of the proposed approach is provided in the `R` +package `keyperm` available from github. + +
+
+ comment: Software available under https://github.com/thmild/keyperm +
+
+
+
+
+ + ☆ On the Impact of Language Selection for Training and Evaluating + Programming Language Models SC + + +
+ The recent advancements in Transformer-based Language Models have +demonstrated significant potential in enhancing the multilingual capabilities +of these models. The remarkable progress made in this domain not only applies +to natural language tasks but also extends to the domain of programming +languages. Despite the ability of these models to learn from multiple +languages, evaluations typically focus on particular combinations of the same +languages. In this study, we evaluate the similarity of programming languages +by analyzing their representations using a CodeBERT-based model. Our +experiments reveal that token representation in languages such as C++, Python, +and Java exhibit proximity to one another, whereas the same tokens in languages +such as Mathematica and R display significant dissimilarity. Our findings +suggest that this phenomenon can potentially result in performance challenges +when dealing with diverse languages. Thus, we recommend using our similarity +measure to select a diverse set of programming languages when training and +evaluating future models. + +
+
+ comment: Accepted to 2023 IEEE 23rd International Working Conference on Source + Code Analysis and Manipulation (SCAM), NIER track +
+
+
+
+
+ + ☆ Construction Grammar and Language Models + + +
+ Recent progress in deep learning and natural language processing has given +rise to powerful models that are primarily trained on a cloze-like task and +show some evidence of having access to substantial linguistic information, +including some constructional knowledge. This groundbreaking discovery presents +an exciting opportunity for a synergistic relationship between computational +methods and Construction Grammar research. In this chapter, we explore three +distinct approaches to the interplay between computational methods and +Construction Grammar: (i) computational methods for text analysis, (ii) +computational Construction Grammar, and (iii) deep learning models, with a +particular focus on language models. We touch upon the first two approaches as +a contextual foundation for the use of computational methods before providing +an accessible, yet comprehensive overview of deep learning models, which also +addresses reservations construction grammarians may have. Additionally, we +delve into experiments that explore the emergence of constructionally relevant +information within these models while also examining the aspects of +Construction Grammar that may pose challenges for these models. This chapter +aims to foster collaboration between researchers in the fields of natural +language processing and Construction Grammar. By doing so, we hope to pave the +way for new insights and advancements in both these fields. + +
+
+ comment: Accepted for publication in The Cambridge Handbook of Construction + Grammar, edited by Mirjam Fried and Kiki Nikiforidou. To appear in 2024 +
+
+
+
+
+ + ☆ Knowledge-Driven CoT: Exploring Faithful Reasoning in LLMs for + Knowledge-intensive Question Answering + + +
+ Equipped with Chain-of-Thought (CoT), Large language models (LLMs) have shown +impressive reasoning ability in various downstream tasks. Even so, suffering +from hallucinations and the inability to access external knowledge, LLMs often +come with incorrect or unfaithful intermediate reasoning steps, especially in +the context of answering knowledge-intensive tasks such as KBQA. To alleviate +this issue, we propose a framework called Knowledge-Driven Chain-of-Thought +(KD-CoT) to verify and modify reasoning traces in CoT via interaction with +external knowledge, and thus overcome the hallucinations and error propagation. +Concretely, we formulate the CoT rationale process of LLMs into a structured +multi-round QA format. In each round, LLMs interact with a QA system that +retrieves external knowledge and produce faithful reasoning traces based on +retrieved precise answers. The structured CoT reasoning of LLMs is facilitated +by our developed KBQA CoT collection, which serves as in-context learning +demonstrations and can also be utilized as feedback augmentation to train a +robust retriever. Extensive experiments on WebQSP and ComplexWebQuestion +datasets demonstrate the effectiveness of proposed KD-CoT in task-solving +reasoning generation, which outperforms the vanilla CoT ICL with an absolute +success rate of 8.0% and 5.1%. Furthermore, our proposed feedback-augmented +retriever outperforms the state-of-the-art baselines for retrieving knowledge, +achieving significant improvement in Hit performance. + +
+
+
+
+
+ + ☆ LLM2KB: Constructing Knowledge Bases using instruction tuned context + aware Large Language Models + + +
+ The advent of Large Language Models (LLM) has revolutionized the field of +natural language processing, enabling significant progress in various +applications. One key area of interest is the construction of Knowledge Bases +(KB) using these powerful models. Knowledge bases serve as repositories of +structured information, facilitating information retrieval and inference tasks. +Our paper proposes LLM2KB, a system for constructing knowledge bases using +large language models, with a focus on the Llama 2 architecture and the +Wikipedia dataset. We perform parameter efficient instruction tuning for +Llama-2-13b-chat and StableBeluga-13B by training small injection models that +have only 0.05 % of the parameters of the base models using the Low Rank +Adaptation (LoRA) technique. These injection models have been trained with +prompts that are engineered to utilize Wikipedia page contexts of subject +entities fetched using a Dense Passage Retrieval (DPR) algorithm, to answer +relevant object entities for a given subject entity and relation. Our best +performing model achieved an average F1 score of 0.6185 across 21 relations in +the LM-KBC challenge held at the ISWC 2023 conference. + +
+
+ comment: 16 pages, 1 figure, LM-KBC 2023 Challenge at International Semantic + Web Conference 2023 +
+
+
+
+
+ + ☆ Journey to the Center of the Knowledge Neurons: Discoveries of + Language-Independent Knowledge Neurons and Degenerate Knowledge Neurons + + +
+ Pre-trained language models (PLMs) contain vast amounts of factual knowledge, +but how the knowledge is stored in the parameters remains unclear. This paper +delves into the complex task of understanding how factual knowledge is stored +in multilingual PLMs, and introduces the Architecture-adapted Multilingual +Integrated Gradients method, which successfully localizes knowledge neurons +more precisely compared to current methods, and is more universal across +various architectures and languages. Moreover, we conduct an in-depth +exploration of knowledge neurons, leading to the following two important +discoveries: (1) The discovery of Language-Independent Knowledge Neurons, which +store factual knowledge in a form that transcends language. We design +cross-lingual knowledge editing experiments, demonstrating that the PLMs can +accomplish this task based on language-independent neurons; (2) The discovery +of Degenerate Knowledge Neurons, a novel type of neuron showing that different +knowledge neurons can store the same fact. Its property of functional overlap +endows the PLMs with a robust mastery of factual knowledge. We design +fact-checking experiments, proving that the degenerate knowledge neurons can +help the PLMs to detect wrong facts. Experiments corroborate these findings, +shedding light on the mechanisms of factual knowledge storage in multilingual +PLMs, and contribute valuable insights to the field. The source code will be +made publicly available for further research. + +
+
+
+
+
+ + ☆ Formalising Natural Language Quantifiers for Human-Robot Interactions + + +
+ We present a method for formalising quantifiers in natural language in the +context of human-robot interactions. The solution is based on first-order logic +extended with capabilities to represent the cardinality of variables, operating +similarly to generalised quantifiers. To demonstrate the method, we designed an +end-to-end system able to receive input as natural language, convert it into a +formal logical representation, evaluate it, and return a result or send a +command to a simulated robot. + +
+
+
+
+
+ + ☆ Chunk, Align, Select: A Simple Long-sequence Processing Method for + Transformers + + +
+ Although dominant in natural language processing, transformer-based models +remain challenged by the task of long-sequence processing, because the +computational cost of self-attention operations in transformers swells +quadratically with the input sequence length. To alleviate the complexity of +long-sequence processing, we propose a simple framework to enable the +offthe-shelf pre-trained transformers to process much longer sequences, while +the computation and memory costs remain growing linearly with the input +sequence lengths. More specifically, our method divides each long-sequence +input into a batch of chunks, then aligns the interchunk information during the +encoding steps, and finally selects the most representative hidden states from +the encoder for the decoding process. To extract inter-chunk semantic +information, we align the start and end token embeddings among chunks in each +encoding transformer block. To learn an effective hidden selection policy, we +design a dual updating scheme inspired by reinforcement learning, which regards +the decoders of transformers as environments, and the downstream performance +metrics as the rewards to evaluate the hidden selection actions. Our empirical +results on real-world long-text summarization and reading comprehension tasks +demonstrate effective improvements compared to prior longsequence processing +baselines. + +
+
+
+
+
+ + ☆ How to Evaluate the Generalization of Detection? A Benchmark for + Comprehensive Open-Vocabulary Detection + + +
+ Object detection (OD) in computer vision has made significant progress in +recent years, transitioning from closed-set labels to open-vocabulary detection +(OVD) based on large-scale vision-language pre-training (VLP). However, current +evaluation methods and datasets are limited to testing generalization over +object types and referral expressions, which do not provide a systematic, +fine-grained, and accurate benchmark of OVD models' abilities. In this paper, +we propose a new benchmark named OVDEval, which includes 9 sub-tasks and +introduces evaluations on commonsense knowledge, attribute understanding, +position understanding, object relation comprehension, and more. The dataset is +meticulously created to provide hard negatives that challenge models' true +understanding of visual and linguistic input. Additionally, we identify a +problem with the popular Average Precision (AP) metric when benchmarking models +on these fine-grained label datasets and propose a new metric called +Non-Maximum Suppression Average Precision (NMS-AP) to address this issue. +Extensive experimental results show that existing top OVD models all fail on +the new tasks except for simple object types, demonstrating the value of the +proposed dataset in pinpointing the weakness of current OVD models and guiding +future research. Furthermore, the proposed NMS-AP metric is verified by +experiments to provide a much more truthful evaluation of OVD models, whereas +traditional AP metrics yield deceptive results. Data is available at +\url{https://github.com/om-ai-lab/OVDEval} + +
+
+
+
+
+ + ☆ DISGO: Automatic End-to-End Evaluation for Scene Text OCR + + +
+ This paper discusses the challenges of optical character recognition (OCR) on +natural scenes, which is harder than OCR on documents due to the wild content +and various image backgrounds. We propose to uniformly use word error rates +(WER) as a new measurement for evaluating scene-text OCR, both end-to-end (e2e) +performance and individual system component performances. Particularly for the +e2e metric, we name it DISGO WER as it considers Deletion, Insertion, +Substitution, and Grouping/Ordering errors. Finally we propose to utilize the +concept of super blocks to automatically compute BLEU scores for e2e OCR +machine translation. The small SCUT public test set is used to demonstrate WER +performance by a modularized OCR system. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Measuring Spurious Correlation in Classification: 'Clever Hans' in + Translationese + + +
+ Recent work has shown evidence of 'Clever Hans' behavior in high-performance +neural translationese classifiers, where BERT-based classifiers capitalize on +spurious correlations, in particular topic information, between data and target +classification labels, rather than genuine translationese signals. +Translationese signals are subtle (especially for professional translation) and +compete with many other signals in the data such as genre, style, author, and, +in particular, topic. This raises the general question of how much of the +performance of a classifier is really due to spurious correlations in the data +versus the signals actually targeted for by the classifier, especially for +subtle target signals and in challenging (low resource) data settings. We focus +on topic-based spurious correlation and approach the question from two +directions: (i) where we have no knowledge about spurious topic information and +its distribution in the data, (ii) where we have some indication about the +nature of spurious topic correlations. For (i) we develop a measure from first +principles capturing alignment of unsupervised topics with target +classification labels as an indication of spurious topic information in the +data. We show that our measure is the same as purity in clustering and propose +a 'topic floor' (as in a 'noise floor') for classification. For (ii) we +investigate masking of known spurious topic carriers in classification. Both +(i) and (ii) contribute to quantifying and (ii) to mitigating spurious +correlations. + +
+
+
+
+
+ + ☆ SciEval: A Multi-Level Large Language Model Evaluation Benchmark for + Scientific Research + + +
+ Recently, there has been growing interest in using Large Language Models +(LLMs) for scientific research. Numerous benchmarks have been proposed to +evaluate the ability of LLMs for scientific research. However, current +benchmarks are mostly based on pre-collected objective questions. This design +suffers from data leakage problem and lacks the evaluation of subjective Q/A +ability. In this paper, we propose SciEval, a comprehensive and +multi-disciplinary evaluation benchmark to address these issues. Based on +Bloom's taxonomy, SciEval covers four dimensions to systematically evaluate +scientific research ability. In particular, we design a "dynamic" subset based +on scientific principles to prevent evaluation from potential data leakage. +Both objective and subjective questions are included in SciEval. These +characteristics make SciEval a more effective benchmark for scientific research +ability evaluation of LLMs. Comprehensive experiments on most advanced LLMs +show that, although GPT-4 achieves SOTA performance compared to other LLMs, +there is still substantial room for improvement, especially for dynamic +questions. The data and codes are now publicly available. + +
+
+ comment: 12 pages, 17 figures, 12 tables. Under Review +
+
+
+
+
+ + ☆ MatchXML: An Efficient Text-label Matching Framework for Extreme + Multi-label Text Classification + + +
+ The eXtreme Multi-label text Classification(XMC) refers to training a +classifier that assigns a text sample with relevant labels from an extremely +large-scale label set (e.g., millions of labels). We propose MatchXML, an +efficient text-label matching framework for XMC. We observe that the label +embeddings generated from the sparse Term Frequency-Inverse Document +Frequency(TF-IDF) features have several limitations. We thus propose label2vec +to effectively train the semantic dense label embeddings by the Skip-gram +model. The dense label embeddings are then used to build a Hierarchical Label +Tree by clustering. In fine-tuning the pre-trained encoder Transformer, we +formulate the multi-label text classification as a text-label matching problem +in a bipartite graph. We then extract the dense text representations from the +fine-tuned Transformer. Besides the fine-tuned dense text embeddings, we also +extract the static dense sentence embeddings from a pre-trained Sentence +Transformer. Finally, a linear ranker is trained by utilizing the sparse TF-IDF +features, the fine-tuned dense text representations and static dense sentence +features. Experimental results demonstrate that MatchXML achieves +state-of-the-art accuracy on five out of six datasets. As for the speed, +MatchXML outperforms the competing methods on all the six datasets. Our source +code is publicly available at https://github.com/huiyegit/MatchXML. + +
+
+
+
+
+ + ☆ OmniQuant: Omnidirectionally Calibrated Quantization for Large Language + Models + + +
+ Large language models (LLMs) have revolutionized natural language processing +tasks. However, their practical deployment is hindered by their immense memory +and computation requirements. Although recent post-training quantization (PTQ) +methods are effective in reducing memory footprint and improving the +computational efficiency of LLM, they hand-craft quantization parameters, which +leads to low performance and fails to deal with extremely low-bit quantization. +To tackle this issue, we introduce an Omnidirectionally calibrated Quantization +(OmniQuant) technique for LLMs, which achieves good performance in diverse +quantization settings while maintaining the computational efficiency of PTQ by +efficiently optimizing various quantization parameters. OmniQuant comprises two +innovative components including Learnable Weight Clipping (LWC) and Learnable +Equivalent Transformation (LET). LWC modulates the extreme values of weights by +optimizing the clipping threshold. Meanwhile, LET tackles activation outliers +by shifting the challenge of quantization from activations to weights through a +learnable equivalent transformation. Operating within a differentiable +framework using block-wise error minimization, OmniQuant can optimize the +quantization process efficiently for both weight-only and weight-activation +quantization. For instance, the LLaMA-2 model family with the size of 7-70B can +be processed with OmniQuant on a single A100-40G GPU within 1-16 hours using +128 samples. Extensive experiments validate OmniQuant's superior performance +across diverse quantization configurations such as W4A4, W6A6, W4A16, W3A16, +and W2A16. Additionally, OmniQuant demonstrates effectiveness in +instruction-tuned models and delivers notable improvements in inference speed +and memory reduction on real devices. Codes and models are available at +\url{https://github.com/OpenGVLab/OmniQuant}. + +
+
+ comment: A differentiable quantization method for LLM +
+
+
+
+
+ + ☆ WellXplain: Wellness Concept Extraction and Classification in Reddit + Posts for Mental Health Analysis + + +
+ During the current mental health crisis, the importance of identifying +potential indicators of mental issues from social media content has surged. +Overlooking the multifaceted nature of mental and social well-being can have +detrimental effects on one's mental state. In traditional therapy sessions, +professionals manually pinpoint the origins and outcomes of underlying mental +challenges, a process both detailed and time-intensive. We introduce an +approach to this intricate mental health analysis by framing the identification +of wellness dimensions in Reddit content as a wellness concept extraction and +categorization challenge. We've curated a unique dataset named WELLXPLAIN, +comprising 3,092 entries and totaling 72,813 words. Drawing from Halbert L. +Dunn's well-regarded wellness theory, our team formulated an annotation +framework along with guidelines. This dataset also includes human-marked +textual segments, offering clear reasoning for decisions made in the wellness +concept categorization process. Our aim in publishing this dataset and +analyzing initial benchmarks is to spearhead the creation of advanced language +models tailored for healthcare-focused concept extraction and categorization. + +
+
+
+
+
+ + ☆ On the Depth between Beam Search and Exhaustive Search for Text + Generation + + +
+ Beam search and exhaustive search are two extreme ends of text decoding +algorithms with respect to the search depth. Beam search is limited in both +search width and depth, whereas exhaustive search is a global search that has +no such limitations. Surprisingly, beam search is not only computationally +cheaper but also performs better than exhaustive search despite its higher +search error. Plenty of research has investigated a range of beam widths, from +small to large, and reported that a beam width that is neither too large nor +too small is desirable. However, in terms of search depth, only the two extreme +ends, beam search and exhaustive search are studied intensively. In this paper, +we examine a range of search depths between the two extremes to discover the +desirable search depth. To this end, we introduce Lookahead Beam Search (LBS), +a multi-step lookahead search that optimizes the objective considering a fixed +number of future steps. Beam search and exhaustive search are special cases of +LBS where the lookahead depth is set to $0$ and $\infty$, respectively. We +empirically evaluate the performance of LBS and find that it outperforms beam +search overall on machine translation tasks. The result suggests there is room +for improvement in beam search by searching deeper. Inspired by the analysis, +we propose Lookbehind Heuristic Beam Search, a computationally feasible search +algorithm that heuristically simulates LBS with 1-step lookahead. The empirical +results show that the proposed method outperforms vanilla beam search on +machine translation and text summarization tasks. + +
+
+
+
+
+ + ☆ 1.5 million materials narratives generated by chatbots + + +
+ The advent of artificial intelligence (AI) has enabled a comprehensive +exploration of materials for various applications. However, AI models often +prioritize frequently encountered materials in the scientific literature, +limiting the selection of suitable candidates based on inherent physical and +chemical properties. To address this imbalance, we have generated a dataset of +1,494,017 natural language-material paragraphs based on combined OQMD, +Materials Project, JARVIS, COD and AFLOW2 databases, which are dominated by ab +initio calculations and tend to be much more evenly distributed on the periodic +table. The generated text narratives were then polled and scored by both human +experts and ChatGPT-4, based on three rubrics: technical accuracy, language and +structure, and relevance and depth of content, showing similar scores but with +human-scored depth of content being the most lagging. The merger of +multi-modality data sources and large language model (LLM) holds immense +potential for AI frameworks to help the exploration and discovery of +solid-state materials for specific applications. + +
+
+
+
+
+ + ☆ Rethinking Language Models as Symbolic Knowledge Graphs + + +
+ Symbolic knowledge graphs (KGs) play a pivotal role in knowledge-centric +applications such as search, question answering and recommendation. As +contemporary language models (LMs) trained on extensive textual data have +gained prominence, researchers have extensively explored whether the parametric +knowledge within these models can match up to that present in knowledge graphs. +Various methodologies have indicated that enhancing the size of the model or +the volume of training data enhances its capacity to retrieve symbolic +knowledge, often with minimal or no human supervision. Despite these +advancements, there is a void in comprehensively evaluating whether LMs can +encompass the intricate topological and semantic attributes of KGs, attributes +crucial for reasoning processes. In this work, we provide an exhaustive +evaluation of language models of varying sizes and capabilities. We construct +nine qualitative benchmarks that encompass a spectrum of attributes including +symmetry, asymmetry, hierarchy, bidirectionality, compositionality, paths, +entity-centricity, bias and ambiguity. Additionally, we propose novel +evaluation metrics tailored for each of these attributes. Our extensive +evaluation of various LMs shows that while these models exhibit considerable +potential in recalling factual information, their ability to capture intricate +topological and semantic traits of KGs remains significantly constrained. We +note that our proposed evaluation metrics are more reliable in evaluating these +abilities than the existing metrics. Lastly, some of our benchmarks challenge +the common notion that larger LMs (e.g., GPT-4) universally outshine their +smaller counterparts (e.g., BERT). + +
+
+
+
+
+ + ☆ GRASP: A Rehearsal Policy for Efficient Online Continual Learning + + +
+ Continual learning (CL) in deep neural networks (DNNs) involves incrementally +accumulating knowledge in a DNN from a growing data stream. A major challenge +in CL is that non-stationary data streams cause catastrophic forgetting of +previously learned abilities. Rehearsal is a popular and effective way to +mitigate this problem, which is storing past observations in a buffer and +mixing them with new observations during learning. This leads to a question: +Which stored samples should be selected for rehearsal? Choosing samples that +are best for learning, rather than simply selecting them at random, could lead +to significantly faster learning. For class incremental learning, prior work +has shown that a simple class balanced random selection policy outperforms more +sophisticated methods. Here, we revisit this question by exploring a new sample +selection policy called GRASP. GRASP selects the most prototypical (class +representative) samples first and then gradually selects less prototypical +(harder) examples to update the DNN. GRASP has little additional compute or +memory overhead compared to uniform selection, enabling it to scale to large +datasets. We evaluate GRASP and other policies by conducting CL experiments on +the large-scale ImageNet-1K and Places-LT image classification datasets. GRASP +outperforms all other rehearsal policies. Beyond vision, we also demonstrate +that GRASP is effective for CL on five text classification datasets. + +
+
+
+
+
+ + ☆ LSTM-based QoE Evaluation for Web Microservices' Reputation Scoring + + +
+ Sentiment analysis is the task of mining the authors' opinions about specific +entities. It allows organizations to monitor different services in real time +and act accordingly. Reputation is what is generally said or believed about +people or things. Informally, reputation combines the measure of reliability +derived from feedback, reviews, and ratings gathered from users, which reflect +their quality of experience (QoE) and can either increase or harm the +reputation of the provided services. In this study, we propose to perform +sentiment analysis on web microservices reviews to exploit the provided +information to assess and score the microservices' reputation. Our proposed +approach uses the Long Short-Term Memory (LSTM) model to perform sentiment +analysis and the Net Brand Reputation (NBR) algorithm to assess reputation +scores for microservices. This approach is tested on a set of more than 10,000 +reviews related to 15 Amazon Web microservices, and the experimental results +have shown that our approach is more accurate than existing approaches, with an +accuracy and precision of 93% obtained after applying an oversampling strategy +and a resulting reputation score of the considered microservices community of +89%. + +
+
+
+
+
+ + ☆ Text Style Transfer Evaluation Using Large Language Models + + +
+ Text Style Transfer (TST) is challenging to evaluate because the quality of +the generated text manifests itself in multiple aspects, each of which is hard +to measure individually: style transfer accuracy, content preservation, and +overall fluency of the text. Human evaluation is the gold standard in TST +evaluation; however, it is expensive, and the results are difficult to +reproduce. Numerous automated metrics are employed to assess performance in +these aspects, serving as substitutes for human evaluation. However, the +correlation between many of these automated metrics and human evaluations +remains unclear, raising doubts about their effectiveness as reliable +benchmarks. Recent advancements in Large Language Models (LLMs) have +demonstrated their ability to not only match but also surpass the average human +performance across a wide range of unseen tasks. This suggests that LLMs have +the potential to serve as a viable alternative to human evaluation and other +automated metrics. We assess the performance of different LLMs on TST +evaluation by employing multiple input prompts and comparing their results. Our +findings indicate that (even zero-shot) prompting correlates strongly with +human evaluation and often surpasses the performance of (other) automated +metrics. Additionally, we propose the ensembling of prompts and show it +increases the robustness of TST evaluation.This work contributes to the ongoing +efforts in evaluating LLMs on diverse tasks, which includes a discussion of +failure cases and limitations. + +
+
+
+
+
+ + ☆ An Ensemble Approach to Personalized Real Time Predictive Writing for + Experts KDD + + +
+ Completing a sentence, phrase or word after typing few words / characters is +very helpful for Intuit financial experts, while taking notes or having a live +chat with users, since they need to write complex financial concepts more +efficiently and accurately many times in a day. In this paper, we tie together +different approaches like large language models, traditional Markov Models and +char level models to create an end-to-end system to provide personalised +sentence/word auto-complete suggestions to experts, under strict latency +constraints. Proposed system can auto-complete sentences, phrases or words +while writing with personalisation and can be trained with very less data and +resources with good efficiency. Our proposed system is not only efficient and +personalized but also robust as it leverages multiple machine learning +techniques along with transfer learning approach to fine tune large language +model with Intuit specific data. This ensures that even in cases of rare or +unusual phrases, the system can provide relevant auto-complete suggestions in +near real time. Survey has showed that this system saves expert note-taking +time and boosts expert confidence in their communication with teammates and +clients. Since enabling this predictive writing feature for QBLive experts, +more than a million keystrokes have been saved based on these suggestions. We +have done comparative study for our ensemble choice. Moreover this feature can +be integrated with any product which has writing facility within a very short +period of time. + +
+
+ comment: ACM SIGKDD Workshop on Machine Learning in Finance, 2023 +
+
+
+
+
+ + ☆ Decoupled Structure for Improved Adaptability of End-to-End Models + + +
+ Although end-to-end (E2E) trainable automatic speech recognition (ASR) has +shown great success by jointly learning acoustic and linguistic information, it +still suffers from the effect of domain shifts, thus limiting potential +applications. The E2E ASR model implicitly learns an internal language model +(LM) which characterises the training distribution of the source domain, and +the E2E trainable nature makes the internal LM difficult to adapt to the target +domain with text-only data To solve this problem, this paper proposes decoupled +structures for attention-based encoder-decoder (Decoupled-AED) and neural +transducer (Decoupled-Transducer) models, which can achieve flexible domain +adaptation in both offline and online scenarios while maintaining robust +intra-domain performance. To this end, the acoustic and linguistic parts of the +E2E model decoder (or prediction network) are decoupled, making the linguistic +component (i.e. internal LM) replaceable. When encountering a domain shift, the +internal LM can be directly replaced during inference by a target-domain LM, +without re-training or using domain-specific paired speech-text data. +Experiments for E2E ASR models trained on the LibriSpeech-100h corpus showed +that the proposed decoupled structure gave 15.1% and 17.2% relative word error +rate reductions on the TED-LIUM 2 and AESRC2020 corpora while still maintaining +performance on intra-domain data. + +
+
+
+
+
+ + ☆ Transforming the Output of Generative Pre-trained Transformer: The + Influence of the PGI Framework on Attention Dynamics + + +
+ This paper presents a novel approach named Persona-Grouping-Intelligence +(PGI), which has been crafted to tackle the challenges posed by GPT models when +applied to real-world business issues. PGI leverages the inherent capabilities +of the GPT model to comprehend intricate language structures and generate +responses that are contextually relevant. The experiment occurred in a business +scenario where human intelligence was being underutilized due to less optimized +business processes. The primary objective of this approach is to leverage GPT +models to reduce the workload on humans in tasks that are extensive, +monotonous, and repetitive. Instead, the focus is redirected toward +decision-making activities. Remarkably, the experiment yielded an accuracy rate +of 93.81% in validating 4,000 responses generated by the model, underscoring +the effectiveness of the PGI strategies. Effectively addressing the issue of +underutilized human intelligence, this paradigm shift aligns business +environments with dynamic machine intelligence, enabling them to navigate the +intricacies of real-world challenges. This approach facilitates the practical +utilization of these models to tackle actual problems. The methodology offers +an opportunity to reshape the fundamental structure of business processes by +seamlessly integrating human decision-making with adaptable machine +intelligence. Consequently, this optimization enhances operational efficiency +and elevates strategic decision-making across diverse business contexts. + +
+
+
+
+
+ + ☆ Discovering Mental Health Research Topics with Topic Modeling ICML + + +
+ Mental health significantly influences various aspects of our daily lives, +and its importance has been increasingly recognized by the research community +and the general public, particularly in the wake of the COVID-19 pandemic. This +heightened interest is evident in the growing number of publications dedicated +to mental health in the past decade. In this study, our goal is to identify +general trends in the field and pinpoint high-impact research topics by +analyzing a large dataset of mental health research papers. To accomplish this, +we collected abstracts from various databases and trained a customized +Sentence-BERT based embedding model leveraging the BERTopic framework. Our +dataset comprises 96,676 research papers pertaining to mental health, enabling +us to examine the relationships between different topics using their abstracts. +To evaluate the effectiveness of the model, we compared it against two other +state-of-the-art methods: Top2Vec model and LDA-BERT model. The model +demonstrated superior performance in metrics that measure topic diversity and +coherence. To enhance our analysis, we also generated word clouds to provide a +comprehensive overview of the machine learning models applied in mental health +research, shedding light on commonly utilized techniques and emerging trends. +Furthermore, we provide a GitHub link* to the dataset used in this paper, +ensuring its accessibility for further research endeavors. + +
+
+ comment: Workshop on Interpretable ML in Healthcare at International + Conference on Machine Learning (ICML) +
+
+
+
+
+ + ☆ MLLM-DataEngine: An Iterative Refinement Approach for MLLM + + +
+ Despite the great advance of Multimodal Large Language Models (MLLMs) in both +instruction dataset building and benchmarking, the independence of training and +evaluation makes current MLLMs hard to further improve their capability under +the guidance of evaluation results with a relatively low human cost. In this +paper, we propose MLLM-DataEngine, a novel closed-loop system that bridges data +generation, model training, and evaluation. Within each loop iteration, the +MLLM-DataEngine first analyze the weakness of the model based on the evaluation +results, then generate a proper incremental dataset for the next training +iteration and enhance the model capability iteratively. Compared with previous +data collection methods which are separate from the benchmarking, the data +generated by MLLM-DataEngine shows better targeting, quality, and correctness. +For targeting, we propose an Adaptive Bad-case Sampling module, which adjusts +the ratio of different types of data within each incremental dataset based on +the benchmarking results. For quality, we resort to GPT-4 to generate +high-quality data with each given data type. For correctness, prompt design is +critical for the data generation results. Rather than previous hand-crafted +prompt, we propose an Interactive Prompt Optimization strategy, which optimizes +the prompt with the multi-round interaction between human and GPT, and improve +the correctness of generated data greatly. Through extensive experiments, we +find our MLLM-DataEngine could boost the MLLM capability in a targeted and +automatic manner, with only a few human participation. The MLLM-DataEngine will +be released and we hope it could be a general solution for the following MLLMs +building. + +
+
+
+
+
+ + ☆ DARWIN Series: Domain Specific Large Language Models for Natural Science + + +
+ Emerging tools bring forth fresh approaches to work, and the field of natural +science is no different. In natural science, traditional manual, serial, and +labour-intensive work is being augmented by automated, parallel, and iterative +processes driven by artificial intelligence-based experimental automation and +more. To add new capabilities in natural science, enabling the acceleration and +enrichment of automation of the discovery process, we present DARWIN, a series +of tailored LLMs for natural science, mainly in physics, chemistry, and +material science. This series relies on open-source LLM, incorporating +structured and unstructured scientific knowledge from public datasets and +literature. We fine-tuned the models using over 60,000 instruction data points, +emphasizing factual correctness. During the fine-tuning, we introduce the +Scientific Instruction Generation (SIG) model, automating instruction +generation from scientific texts. This eliminates the need for manual +extraction or domain-specific knowledge graphs and efficiently injects +scientific knowledge into the model. We also explore multi-task training +strategies, revealing interconnections between scientific tasks. DARWIN series +not only achieves state-of-the-art results on various scientific tasks but also +diminishes reliance on closed-source AI models. Our research showcases the +ability of LLM in the scientific domain, with the overarching goal of fostering +prosperity within the broader AI for science community. + +
+
+
+
+
+ + ☆ Large Language Models in Analyzing Crash Narratives -- A Comparative + Study of ChatGPT, BARD and GPT-4 + + +
+ In traffic safety research, extracting information from crash narratives +using text analysis is a common practice. With recent advancements of large +language models (LLM), it would be useful to know how the popular LLM +interfaces perform in classifying or extracting information from crash +narratives. To explore this, our study has used the three most popular publicly +available LLM interfaces- ChatGPT, BARD and GPT4. This study investigated their +usefulness and boundaries in extracting information and answering queries +related to accidents from 100 crash narratives from Iowa and Kansas. During the +investigation, their capabilities and limitations were assessed and their +responses to the queries were compared. Five questions were asked related to +the narratives: 1) Who is at-fault? 2) What is the manner of collision? 3) Has +the crash occurred in a work-zone? 4) Did the crash involve pedestrians? and 5) +What are the sequence of harmful events in the crash? For questions 1 through +4, the overall similarity among the LLMs were 70%, 35%, 96% and 89%, +respectively. The similarities were higher while answering direct questions +requiring binary responses and significantly lower for complex questions. To +compare the responses to question 5, network diagram and centrality measures +were analyzed. The network diagram from the three LLMs were not always similar +although they sometimes have the same influencing events with high in-degree, +out-degree and betweenness centrality. This study suggests using multiple +models to extract viable information from narratives. Also, caution must be +practiced while using these interfaces to obtain crucial safety related +information. + +
+
+
+
+
+ + ♻ ☆ Diffusion Language Models Can Perform Many Tasks with Scaling and + Instruction-Finetuning + + +
+ The recent surge of generative AI has been fueled by the generative power of +diffusion probabilistic models and the scalable capabilities of large language +models. Despite their potential, it remains elusive whether diffusion language +models can solve general language tasks comparable to their autoregressive +counterparts. This paper demonstrates that scaling diffusion models w.r.t. +data, sizes, and tasks can effectively make them strong language learners. We +build competent diffusion language models at scale by first acquiring knowledge +from massive data via masked language modeling pretraining thanks to their +intrinsic connections. We then reprogram pretrained masked language models into +diffusion language models via diffusive adaptation, wherein task-specific +finetuning and instruction finetuning are explored to unlock their versatility +in solving general language tasks. Experiments show that scaling diffusion +language models consistently improves performance across downstream language +tasks. We further discover that instruction finetuning can elicit zero-shot and +few-shot in-context learning abilities that help tackle many unseen tasks by +following natural language instructions, and show promise in advanced and +challenging abilities such as reasoning. + +
+
+ comment: added references +
+
+
+
+
+ + ♻ ☆ SpeechGen: Unlocking the Generative Power of Speech Language Models with + Prompts + + +
+ Large language models (LLMs) have gained considerable attention for +Artificial Intelligence Generated Content (AIGC), particularly with the +emergence of ChatGPT. However, the direct adaptation of continuous speech to +LLMs that process discrete tokens remains an unsolved challenge, hindering the +application of LLMs for speech generation. The advanced speech LMs are in the +corner, as that speech signals encapsulate a wealth of information, including +speaker and emotion, beyond textual data alone. Prompt tuning has demonstrated +notable gains in parameter efficiency and competitive performance on some +speech classification tasks. However, the extent to which prompts can +effectively elicit generation tasks from speech LMs remains an open question. +In this paper, we present pioneering research that explores the application of +prompt tuning to stimulate speech LMs for various generation tasks, within a +unified framework called SpeechGen, with around 10M trainable parameters. The +proposed unified framework holds great promise for efficiency and +effectiveness, particularly with the imminent arrival of advanced speech LMs, +which will significantly enhance the capabilities of the framework. The code +and demos of SpeechGen will be available on the project website: +\url{https://ga642381.github.io/SpeechPrompt/speechgen} + +
+
+ comment: Work in progress. The first three authors contributed equally +
+
+
+
+
+ + ♻ ☆ ChatMOF: An Autonomous AI System for Predicting and Generating + Metal-Organic Frameworks + + +
+ ChatMOF is an autonomous Artificial Intelligence (AI) system that is built to +predict and generate metal-organic frameworks (MOFs). By leveraging a +large-scale language model (GPT-4 and GPT-3.5-turbo), ChatMOF extracts key +details from textual inputs and delivers appropriate responses, thus +eliminating the necessity for rigid structured queries. The system is comprised +of three core components (i.e. an agent, a toolkit, and an evaluator) and it +forms a robust pipeline that manages a variety of tasks, including data +retrieval, property prediction, and structure generations. The study further +explores the merits and constraints of using large language models (LLMs) AI +system in material sciences using and showcases its transformative potential +for future advancements. + +
+
+
+
+
+ + ♻ ☆ Grimm in Wonderland: Prompt Engineering with Midjourney to Illustrate + Fairytales + + +
+ The quality of text-to-image generation is continuously improving, yet the +boundaries of its applicability are still unclear. In particular, refinement of +the text input with the objective of achieving better results - commonly called +prompt engineering - so far seems to have not been geared towards work with +pre-existing texts. We investigate whether text-to-image generation and prompt +engineering could be used to generate basic illustrations of popular +fairytales. Using Midjourney v4, we engage in action research with a dual aim: +to attempt to generate 5 believable illustrations for each of 5 popular +fairytales, and to define a prompt engineering process that starts from a +pre-existing text and arrives at an illustration of it. We arrive at a +tentative 4-stage process: i) initial prompt, ii) composition adjustment, iii) +style refinement, and iv) variation selection. We also discuss three reasons +why the generation model struggles with certain illustrations: difficulties +with counts, bias from stereotypical configurations and inability to depict +overly fantastic situations. Our findings are not limited to the specific +generation model and are intended to be generalisable to future ones. + +
+
+ comment: 19th Conference on Information and Research science Connecting to + Digital and Library Science, February 23-24, 2023, Bari, Italy +
+
+
+
+
+ + ♻ ☆ PMC-LLaMA: Towards Building Open-source Language Models for Medicine + + +
+ Recently, Large Language Models (LLMs) have showcased remarkable capabilities +in natural language understanding. While demonstrating proficiency in everyday +conversations and question-answering situations, these models frequently +struggle in domains that require precision, such as medical applications, due +to their lack of domain-specific knowledge. In this paper, we describe the +procedure for building a powerful, open-source language model specifically +designed for medicine applications, termed as PMC-LLaMA. Our contributions are +threefold: (i) we systematically investigate the process of adapting a +general-purpose foundation language model towards medical domain, this involves +data-centric knowledge injection through the integration of 4.8M biomedical +academic papers and 30K medical textbooks, as well as comprehensive fine-tuning +for alignment with domain-specific instructions; (ii) we contribute a +large-scale, comprehensive dataset for instruction tuning. This dataset +encompasses medical question-answering (QA), rationale for reasoning, and +conversational dialogues, comprising a total of 202M tokens; (iii) we conduct +thorough ablation studies to demonstrate the effectiveness of each proposed +component. While evaluating on various public medical question-answering +benchmarks, our lightweight PMCLLaMA, which consists of only 13 billion +parameters, exhibits superior performance, even surpassing ChatGPT. All models, +codes, datasets can be found in https://github.com/chaoyi-wu/PMC-LLaMA. + +
+
+
+
+
+ + ♻ ☆ ACTI at EVALITA 2023: Overview of the Conspiracy Theory Identification + Task + + +
+ Conspiracy Theory Identication task is a new shared task proposed for the +first time at the Evalita 2023. The ACTI challenge, based exclusively on +comments published on conspiratorial channels of telegram, is divided into two +subtasks: (i) Conspiratorial Content Classification: identifying conspiratorial +content and (ii) Conspiratorial Category Classification about specific +conspiracy theory classification. A total of fifteen teams participated in the +task for a total of 81 submissions. We illustrate the best performing +approaches were based on the utilization of large language models. We finally +draw conclusions about the utilization of these models for counteracting the +spreading of misinformation in online platforms. + +
+
+ comment: Accepted at the Evalita Workshop 2023 +
+
+
+
+
+ + ♻ ☆ How to Estimate Model Transferability of Pre-Trained Speech Models? + + +
+ In this work, we introduce a "score-based assessment" framework for +estimating the transferability of pre-trained speech models (PSMs) for +fine-tuning target tasks. We leverage upon two representation theories, +Bayesian likelihood estimation and optimal transport, to generate rank scores +for the PSM candidates using the extracted representations. Our framework +efficiently computes transferability scores without actual fine-tuning of +candidate models or layers by making a temporal independent hypothesis. We +evaluate some popular supervised speech models (e.g., Conformer RNN-Transducer) +and self-supervised speech models (e.g., HuBERT) in cross-layer and cross-model +settings using public data. Experimental results show a high Spearman's rank +correlation and low $p$-value between our estimation framework and fine-tuning +ground truth. Our proposed transferability framework requires less +computational time and resources, making it a resource-saving and +time-efficient approach for tuning speech foundation models. + +
+
+ comment: Accepted to Interspeech. Code is available at: + https://github.com/virginiakm1988/LogME-CTC +
+
+
+
+
+ + ♻ ☆ Benchmarking Neural Network Generalization for Grammar Induction + + +
+ How well do neural networks generalize? Even for grammar induction tasks, +where the target generalization is fully known, previous works have left the +question open, testing very limited ranges beyond the training set and using +different success criteria. We provide a measure of neural network +generalization based on fully specified formal languages. Given a model and a +formal grammar, the method assigns a generalization score representing how well +a model generalizes to unseen samples in inverse relation to the amount of data +it was trained on. The benchmark includes languages such as $a^nb^n$, +$a^nb^nc^n$, $a^nb^mc^{n+m}$, and Dyck-1 and 2. We evaluate selected +architectures using the benchmark and find that networks trained with a Minimum +Description Length objective (MDL) generalize better and using less data than +networks trained using standard loss functions. The benchmark is available at +https://github.com/taucompling/bliss. + +
+
+ comment: 10 pages, 4 figures, 2 tables. Conference: Learning with Small Data + 2023 +
+
+
+
+
+ + ♻ ☆ Approximating Online Human Evaluation of Social Chatbots with Prompting SIGDIAL 2023 + + +
+ As conversational models become increasingly available to the general public, +users are engaging with this technology in social interactions. Such +unprecedented interaction experiences may pose considerable social and +psychological risks to the users unless the technology is properly controlled. +This highlights the need for scalable and robust evaluation metrics for +conversational chatbots. Existing evaluation metrics aim to automate offline +user evaluation and approximate human judgment of pre-curated dialogs. However, +they are limited in their ability to capture subjective perceptions of users +who actually interact with the bots and might not generalize to real-world +settings. To address this limitation, we propose an approach to approximate +online human evaluation leveraging large language models (LLMs) from the GPT +family. We introduce a new Dialog system Evaluation framework based on +Prompting (DEP), which enables a fully automatic evaluation pipeline that +replicates live user studies and achieves an impressive correlation with human +judgment (up to Pearson r=0.95 on a system level). The DEP approach involves +collecting synthetic chat logs of evaluated bots with an LLM in the other-play +setting, where the LLM is carefully conditioned to follow a specific scenario. +We further explore different prompting approaches to produce evaluation scores +with the same LLM. The best performing prompts, which contain few-shot +demonstrations and instructions, show outstanding performance on the tested +dataset and demonstrate the ability to generalize to other dialog corpora. + +
+
+ comment: accepted to SIGDIAL 2023 (long paper) +
+
+
+
+
+ + ♻ ☆ CryCeleb: A Speaker Verification Dataset Based on Infant Cry Sounds + + +
+ This paper describes the Ubenwa CryCeleb dataset - a labeled collection of +infant cries, and the accompanying CryCeleb 2023 task - a public speaker +verification challenge based on infant cry sounds. We release for academic +usage more than 6 hours of manually segmented cry sounds from 786 newborns to +encourage research in infant cry analysis. + +
+
+
+
+
+ + ♻ ☆ Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language + Pretraining? + + +
+ The multimedia community has shown a significant interest in perceiving and +representing the physical world with multimodal pretrained neural network +models, and among them, the visual-language pertaining (VLP) is, currently, the +most captivating topic. However, there have been few endeavors dedicated to the +exploration of 1) whether essential linguistic knowledge (e.g., semantics and +syntax) can be extracted during VLP, and 2) how such linguistic knowledge +impact or enhance the multimodal alignment. In response, here we aim to +elucidate the impact of comprehensive linguistic knowledge, including semantic +expression and syntactic structure, on multimodal alignment. Specifically, we +design and release the SNARE, the first large-scale multimodal alignment +probing benchmark, to detect the vital linguistic components, e.g., lexical, +semantic, and syntax knowledge, containing four tasks: Semantic structure, +Negation logic, Attribute ownership, and Relationship composition. Based on our +proposed probing benchmarks, our holistic analyses of five advanced VLP models +illustrate that the VLP model: i) shows insensitivity towards complex syntax +structures and relies on content words for sentence comprehension; ii) +demonstrates limited comprehension of combinations between sentences and +negations; iii) faces challenges in determining the presence of actions or +spatial relationships within visual information and struggles with verifying +the correctness of triple combinations. We make our benchmark and code +available at \url{https://github.com/WangFei-2019/SNARE/}. + +
+
+ comment: [TL;DR] we design and release the SNARE, the first large-scale + multimodal alignment probing benchmark for current vision-language pretrained + models +
+
+
+
+
+ + ♻ ☆ TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion + Synthesis ICCV 2023 + + +
+ In this paper, we present TMR, a simple yet effective approach for text to 3D +human motion retrieval. While previous work has only treated retrieval as a +proxy evaluation metric, we tackle it as a standalone task. Our method extends +the state-of-the-art text-to-motion synthesis model TEMOS, and incorporates a +contrastive loss to better structure the cross-modal latent space. We show that +maintaining the motion generation loss, along with the contrastive training, is +crucial to obtain good performance. We introduce a benchmark for evaluation and +provide an in-depth analysis by reporting results on several protocols. Our +extensive experiments on the KIT-ML and HumanML3D datasets show that TMR +outperforms the prior work by a significant margin, for example reducing the +median rank from 54 to 19. Finally, we showcase the potential of our approach +on moment retrieval. Our code and models are publicly available at +https://mathis.petrovich.fr/tmr. + +
+
+ comment: ICCV 2023 Camera Ready, project page: + https://mathis.petrovich.fr/tmr/ +
+
+
+
+
+ + ♻ ☆ Decoding ChatGPT: A Taxonomy of Existing Research, Current Challenges, + and Possible Future Directions + + +
+ Chat Generative Pre-trained Transformer (ChatGPT) has gained significant +interest and attention since its launch in November 2022. It has shown +impressive performance in various domains, including passing exams and creative +writing. However, challenges and concerns related to biases and trust persist. +In this work, we present a comprehensive review of over 100 Scopus-indexed +publications on ChatGPT, aiming to provide a taxonomy of ChatGPT research and +explore its applications. We critically analyze the existing literature, +identifying common approaches employed in the studies. Additionally, we +investigate diverse application areas where ChatGPT has found utility, such as +healthcare, marketing and financial services, software engineering, academic +and scientific writing, research and education, environmental science, and +natural language processing. Through examining these applications, we gain +valuable insights into the potential of ChatGPT in addressing real-world +challenges. We also discuss crucial issues related to ChatGPT, including biases +and trustworthiness, emphasizing the need for further research and development +in these areas. Furthermore, we identify potential future directions for +ChatGPT research, proposing solutions to current challenges and speculating on +expected advancements. By fully leveraging the capabilities of ChatGPT, we can +unlock its potential across various domains, leading to advancements in +conversational AI and transformative impacts in society. + +
+
+ comment: 31 pages. 8 figures and 3 tables +
+
+
+
+
+ + ♻ ☆ Code Llama: Open Foundation Models for Code + + +
+ We release Code Llama, a family of large language models for code based on +Llama 2 providing state-of-the-art performance among open models, infilling +capabilities, support for large input contexts, and zero-shot instruction +following ability for programming tasks. We provide multiple flavors to cover a +wide range of applications: foundation models (Code Llama), Python +specializations (Code Llama - Python), and instruction-following models (Code +Llama - Instruct) with 7B, 13B and 34B parameters each. All models are trained +on sequences of 16k tokens and show improvements on inputs with up to 100k +tokens. 7B and 13B Code Llama and Code Llama - Instruct variants support +infilling based on surrounding content. Code Llama reaches state-of-the-art +performance among open models on several code benchmarks, with scores of up to +53% and 55% on HumanEval and MBPP, respectively. Notably, Code Llama - Python +7B outperforms Llama 2 70B on HumanEval and MBPP, and all our models outperform +every other publicly available model on MultiPL-E. We release Code Llama under +a permissive license that allows for both research and commercial use. + +
+
+
+
+
+ + ♻ ☆ A Simplified Variant of Gödel's Ontological Argument + + +
+ A simplified variant of G\"odel's ontological argument is presented. The +simplified argument is valid already in basic modal logics K or KT, it does not +suffer from modal collapse, and it avoids the rather complex predicates of +essence (Ess.) and necessary existence (NE) as used by G\"odel. The variant +presented has been obtained as a side result of a series of theory +simplification experiments conducted in interaction with a modern proof +assistant system. The starting point for these experiments was the computer +encoding of G\"odel's argument, and then automated reasoning techniques were +systematically applied to arrive at the simplified variant presented. The +presented work thus exemplifies a fruitful human-computer interaction in +computational metaphysics. Whether the presented result increases or decreases +the attractiveness and persuasiveness of the ontological argument is a question +I would like to pass on to philosophy and theology. + +
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ cantnlp@LT-EDI-2023: Homophobia/Transphobia Detection in Social Media + Comments using Spatio-Temporally Retrained Language Models + + +
+ This paper describes our multiclass classification system developed as part +of the LTEDI@RANLP-2023 shared task. We used a BERT-based language model to +detect homophobic and transphobic content in social media comments across five +language conditions: English, Spanish, Hindi, Malayalam, and Tamil. We +retrained a transformer-based crosslanguage pretrained language model, +XLMRoBERTa, with spatially and temporally relevant social media language data. +We also retrained a subset of models with simulated script-mixed social media +language data with varied performance. We developed the best performing +seven-label classification system for Malayalam based on weighted macro +averaged F1 score (ranked first out of six) with variable performance for other +language and class-label conditions. We found the inclusion of this +spatio-temporal data improved the classification performance for all language +and task conditions when compared with the baseline. The results suggests that +transformer-based language classification systems are sensitive to +register-specific and language-specific retraining. + +
+
+
+
+
+ + ♻ ☆ Self-Deception: Reverse Penetrating the Semantic Firewall of Large + Language Models + + +
+ Large language models (LLMs), such as ChatGPT, have emerged with astonishing +capabilities approaching artificial general intelligence. While providing +convenience for various societal needs, LLMs have also lowered the cost of +generating harmful content. Consequently, LLM developers have deployed +semantic-level defenses to recognize and reject prompts that may lead to +inappropriate content. Unfortunately, these defenses are not foolproof, and +some attackers have crafted "jailbreak" prompts that temporarily hypnotize the +LLM into forgetting content defense rules and answering any improper questions. +To date, there is no clear explanation of the principles behind these +semantic-level attacks and defenses in both industry and academia. + This paper investigates the LLM jailbreak problem and proposes an automatic +jailbreak method for the first time. We propose the concept of a semantic +firewall and provide three technical implementation approaches. Inspired by the +attack that penetrates traditional firewalls through reverse tunnels, we +introduce a "self-deception" attack that can bypass the semantic firewall by +inducing LLM to generate prompts that facilitate jailbreak. We generated a +total of 2,520 attack payloads in six languages (English, Russian, French, +Spanish, Chinese, and Arabic) across seven virtual scenarios, targeting the +three most common types of violations: violence, hate, and pornography. The +experiment was conducted on two models, namely the GPT-3.5-Turbo and GPT-4. The +success rates on the two models were 86.2% and 67%, while the failure rates +were 4.7% and 2.2%, respectively. This highlighted the effectiveness of the +proposed attack method. All experimental code and raw data will be released as +open-source to inspire future research. We believe that manipulating AI +behavior through carefully crafted prompts will become an important research +direction in the future. + +
+
+ comment: Serious errors were found in the experiment, which may lead to the + overturning of the overall conclusions of the paper +
+
+
+
+
+ + ♻ ☆ Unsupervised Prototype Adapter for Vision-Language Models + + +
+ Recently, large-scale pre-trained vision-language models (e.g. CLIP and +ALIGN) have demonstrated remarkable effectiveness in acquiring transferable +visual representations. To leverage the valuable knowledge encoded within these +models for downstream tasks, several fine-tuning approaches, including prompt +tuning methods and adapter-based methods, have been developed to adapt +vision-language models effectively with supervision. However, these methods +rely on the availability of annotated samples, which can be labor-intensive and +time-consuming to acquire, thus limiting scalability. To address this issue, in +this work, we design an unsupervised fine-tuning approach for vision-language +models called Unsupervised Prototype Adapter (UP-Adapter). Specifically, for +the unannotated target datasets, we leverage the text-image aligning capability +of CLIP to automatically select the most confident samples for each class. +Utilizing these selected samples, we generate class prototypes, which serve as +the initialization for the learnable prototype model. After fine-tuning, the +prototype model prediction is combined with the original CLIP's prediction by a +residual connection to perform downstream recognition tasks. Our extensive +experimental results on image recognition and domain generalization show that +the proposed unsupervised method outperforms 8-shot CoOp, 8-shot Tip-Adapter, +and also the state-of-the-art UPL method by large margins. + +
+
+ comment: Accepted by PRCV 2023 +
+
+
+
+
+ + ♻ ☆ An Ensemble Approach to Question Classification: Integrating Electra + Transformer, GloVe, and LSTM + + +
+ Natural Language Processing (NLP) has emerged as a crucial technology for +understanding and generating human language, playing an essential role in tasks +such as machine translation, sentiment analysis, and more pertinently, question +classification. As a subfield within NLP, question classification focuses on +determining the type of information being sought, a fundamental step for +downstream applications like question answering systems. This study presents an +innovative ensemble approach for question classification, combining the +strengths of Electra, GloVe, and LSTM models. Rigorously tested on the +well-regarded TREC dataset, the model demonstrates how the integration of these +disparate technologies can lead to superior results. Electra brings in its +transformer-based capabilities for complex language understanding, GloVe offers +global vector representations for capturing word-level semantics, and LSTM +contributes its sequence learning abilities to model long-term dependencies. By +fusing these elements strategically, our ensemble model delivers a robust and +efficient solution for the complex task of question classification. Through +rigorous comparisons with well-known models like BERT, RoBERTa, and DistilBERT, +the ensemble approach verifies its effectiveness by attaining an 80% accuracy +score on the test dataset. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 109 + +
+
+
+ + ☆ Joint Modeling of Feature, Correspondence, and a Compressed Memory for + Video Object Segmentation + + +
+ Current prevailing Video Object Segmentation (VOS) methods usually perform +dense matching between the current and reference frames after extracting their +features. One on hand, the decoupled modeling restricts the targets information +propagation only at high-level feature space. On the other hand, the pixel-wise +matching leads to a lack of holistic understanding of the targets. To overcome +these issues, we propose a unified VOS framework, coined as JointFormer, for +joint modeling the three elements of feature, correspondence, and a compressed +memory. The core design is the Joint Block, utilizing the flexibility of +attention to simultaneously extract feature and propagate the targets +information to the current tokens and the compressed memory token. This scheme +allows to perform extensive information propagation and discriminative feature +learning. To incorporate the long-term temporal targets information, we also +devise a customized online updating mechanism for the compressed memory token, +which can prompt the information flow along the temporal dimension and thus +improve the global modeling capability. Under the design, our method achieves a +new state-of-art performance on DAVIS 2017 val/test-dev (89.7% and 87.6%) and +YouTube-VOS 2018/2019 val (87.0% and 87.0%) benchmarks, outperforming existing +works by a large margin. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ A2Q: Accumulator-Aware Quantization with Guaranteed Overflow Avoidance + + +
+ We present accumulator-aware quantization (A2Q), a novel weight quantization +method designed to train quantized neural networks (QNNs) to avoid overflow +when using low-precision accumulators during inference. A2Q introduces a unique +formulation inspired by weight normalization that constrains the L1-norm of +model weights according to accumulator bit width bounds that we derive. Thus, +in training QNNs for low-precision accumulation, A2Q also inherently promotes +unstructured weight sparsity to guarantee overflow avoidance. We apply our +method to deep learning-based computer vision tasks to show that A2Q can train +QNNs for low-precision accumulators while maintaining model accuracy +competitive with a floating-point baseline. In our evaluations, we consider the +impact of A2Q on both general-purpose platforms and programmable hardware. +However, we primarily target model deployment on FPGAs because they can be +programmed to fully exploit custom accumulator bit widths. Our experimentation +shows accumulator bit width significantly impacts the resource efficiency of +FPGA-based accelerators. On average across our benchmarks, A2Q offers up to a +2.3x reduction in resource utilization over 32-bit accumulator counterparts +with 99.2% of the floating-point model accuracy. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2301.13376 +
+
+
+
+
+ + ☆ Attending Generalizability in Course of Deep Fake Detection by Exploring + Multi-task Learning + + +
+ This work explores various ways of exploring multi-task learning (MTL) +techniques aimed at classifying videos as original or manipulated in +cross-manipulation scenario to attend generalizability in deep fake scenario. +The dataset used in our evaluation is FaceForensics++, which features 1000 +original videos manipulated by four different techniques, with a total of 5000 +videos. We conduct extensive experiments on multi-task learning and contrastive +techniques, which are well studied in literature for their generalization +benefits. It can be concluded that the proposed detection model is quite +generalized, i.e., accurately detects manipulation methods not encountered +during training as compared to the state-of-the-art. + +
+
+
+
+
+ + ☆ Open Gaze: An Open-Source Implementation Replicating Google's Eye + Tracking Paper + + +
+ Eye tracking has been a pivotal tool in diverse fields such as vision +research, language analysis, and usability assessment. The majority of prior +investigations, however, have concentrated on expansive desktop displays +employing specialized, costly eye tracking hardware that lacks scalability. +Remarkably little insight exists into ocular movement patterns on smartphones, +despite their widespread adoption and significant usage. In this manuscript, we +present an open-source implementation of a smartphone-based gaze tracker that +emulates the methodology proposed by a GooglePaper (whose source code remains +proprietary). Our focus is on attaining accuracy comparable to that attained +through the GooglePaper's methodology, without the necessity for supplementary +hardware. Through the integration of machine learning techniques, we unveil an +accurate eye tracking solution that is native to smartphones. Our approach +demonstrates precision akin to the state-of-the-art mobile eye trackers, which +are characterized by a cost that is two orders of magnitude higher. Leveraging +the vast MIT GazeCapture dataset, which is available through registration on +the dataset's website, we successfully replicate crucial findings from previous +studies concerning ocular motion behavior in oculomotor tasks and saliency +analyses during natural image observation. Furthermore, we emphasize the +applicability of smartphone-based gaze tracking in discerning reading +comprehension challenges. Our findings exhibit the inherent potential to +amplify eye movement research by significant proportions, accommodating +participation from thousands of subjects with explicit consent. This +scalability not only fosters advancements in vision research, but also extends +its benefits to domains such as accessibility enhancement and healthcare +applications. + +
+
+ comment: 17 pages , 15 figures +
+
+
+
+
+ + ☆ Eventful Transformers: Leveraging Temporal Redundancy in Vision + Transformers ICCV 2023 + + +
+ Vision Transformers achieve impressive accuracy across a range of visual +recognition tasks. Unfortunately, their accuracy frequently comes with high +computational costs. This is a particular issue in video recognition, where +models are often applied repeatedly across frames or temporal chunks. In this +work, we exploit temporal redundancy between subsequent inputs to reduce the +cost of Transformers for video processing. We describe a method for identifying +and re-processing only those tokens that have changed significantly over time. +Our proposed family of models, Eventful Transformers, can be converted from +existing Transformers (often without any re-training) and give adaptive control +over the compute cost at runtime. We evaluate our method on large-scale +datasets for video object detection (ImageNet VID) and action recognition +(EPIC-Kitchens 100). Our approach leads to significant computational savings +(on the order of 2-4x) with only minor reductions in accuracy. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Ultrafast-and-Ultralight ConvNet-Based Intelligent Monitoring System for + Diagnosing Early-Stage Mpox Anytime and Anywhere + + +
+ Due to the lack of more efficient diagnostic tools for monkeypox, its spread +remains unchecked, presenting a formidable challenge to global health. While +the high efficacy of deep learning models for monkeypox diagnosis has been +demonstrated in related studies, the overlook of inference speed, the parameter +size and diagnosis performance for early-stage monkeypox renders the models +inapplicable in real-world settings. To address these challenges, we proposed +an ultrafast and ultralight network named Fast-MpoxNet. Fast-MpoxNet possesses +only 0.27M parameters and can process input images at 68 frames per second +(FPS) on the CPU. To counteract the diagnostic performance limitation brought +about by the small model capacity, it integrates the attention-based feature +fusion module and the multiple auxiliary losses enhancement strategy for better +detecting subtle image changes and optimizing weights. Using transfer learning +and five-fold cross-validation, Fast-MpoxNet achieves 94.26% Accuracy on the +Mpox dataset. Notably, its recall for early-stage monkeypox achieves 93.65%. By +adopting data augmentation, our model's Accuracy rises to 98.40% and attains a +Practicality Score (A new metric for measuring model practicality in real-time +diagnosis application) of 0.80. We also developed an application system named +Mpox-AISM V2 for both personal computers and mobile phones. Mpox-AISM V2 +features ultrafast responses, offline functionality, and easy deployment, +enabling accurate and real-time diagnosis for both the public and individuals +in various real-world settings, especially in populous settings during the +outbreak. Our work could potentially mitigate future monkeypox outbreak and +illuminate a fresh paradigm for developing real-time diagnostic tools in the +healthcare field. + +
+
+ comment: This paper has been submitted to Neurocomputing +
+
+
+
+
+ + ☆ Temporal Uncertainty Localization to Enable Human-in-the-loop Analysis + of Dynamic Contrast-enhanced Cardiac MRI Datasets MICCAI 2023 + + +
+ Dynamic contrast-enhanced (DCE) cardiac magnetic resonance imaging (CMRI) is +a widely used modality for diagnosing myocardial blood flow (perfusion) +abnormalities. During a typical free-breathing DCE-CMRI scan, close to 300 +time-resolved images of myocardial perfusion are acquired at various contrast +"wash in/out" phases. Manual segmentation of myocardial contours in each +time-frame of a DCE image series can be tedious and time-consuming, +particularly when non-rigid motion correction has failed or is unavailable. +While deep neural networks (DNNs) have shown promise for analyzing DCE-CMRI +datasets, a "dynamic quality control" (dQC) technique for reliably detecting +failed segmentations is lacking. Here we propose a new space-time uncertainty +metric as a dQC tool for DNN-based segmentation of free-breathing DCE-CMRI +datasets by validating the proposed metric on an external dataset and +establishing a human-in-the-loop framework to improve the segmentation results. +In the proposed approach, we referred the top 10% most uncertain segmentations +as detected by our dQC tool to the human expert for refinement. This approach +resulted in a significant increase in the Dice score (p<0.001) and a notable +decrease in the number of images with failed segmentation (16.2% to 11.3%) +whereas the alternative approach of randomly selecting the same number of +segmentations for human referral did not achieve any significant improvement. +Our results suggest that the proposed dQC framework has the potential to +accurately identify poor-quality segmentations and may enable efficient +DNN-based analysis of DCE-CMRI in a human-in-the-loop pipeline for clinical +interpretation and reporting of dynamic CMRI datasets. + +
+
+ comment: Accepted for publication in MICCAI 2023 +
+
+
+
+
+ + ☆ Unlocking the Performance of Proximity Sensors by Utilizing Transient + Histograms + + +
+ We provide methods which recover planar scene geometry by utilizing the +transient histograms captured by a class of close-range time-of-flight (ToF) +distance sensor. A transient histogram is a one dimensional temporal waveform +which encodes the arrival time of photons incident on the ToF sensor. +Typically, a sensor processes the transient histogram using a proprietary +algorithm to produce distance estimates, which are commonly used in several +robotics applications. Our methods utilize the transient histogram directly to +enable recovery of planar geometry more accurately than is possible using only +proprietary distance estimates, and consistent recovery of the albedo of the +planar surface, which is not possible with proprietary distance estimates +alone. This is accomplished via a differentiable rendering pipeline, which +simulates the transient imaging process, allowing direct optimization of scene +geometry to match observations. To validate our methods, we capture 3,800 +measurements of eight planar surfaces from a wide range of viewpoints, and show +that our method outperforms the proprietary-distance-estimate baseline by an +order of magnitude in most scenarios. We demonstrate a simple robotics +application which uses our method to sense the distance to and slope of a +planar surface from a sensor mounted on the end effector of a robot arm. + +
+
+ comment: Accepted for publication at IEEE Robotics and Automation Letters + (RA-L) +
+
+
+
+
+ + ☆ A Fast Minimization Algorithm for the Euler Elastica Model Based on a + Bilinear Decomposition + + +
+ The Euler Elastica (EE) model with surface curvature can generate +artifact-free results compared with the traditional total variation +regularization model in image processing. However, strong nonlinearity and +singularity due to the curvature term in the EE model pose a great challenge +for one to design fast and stable algorithms for the EE model. In this paper, +we propose a new, fast, hybrid alternating minimization (HALM) algorithm for +the EE model based on a bilinear decomposition of the gradient of the +underlying image and prove the global convergence of the minimizing sequence +generated by the algorithm under mild conditions. The HALM algorithm comprises +three sub-minimization problems and each is either solved in the closed form or +approximated by fast solvers making the new algorithm highly accurate and +efficient. We also discuss the extension of the HALM strategy to deal with +general curvature-based variational models, especially with a Lipschitz smooth +functional of the curvature. A host of numerical experiments are conducted to +show that the new algorithm produces good results with much-improved efficiency +compared to other state-of-the-art algorithms for the EE model. As one of the +benchmarks, we show that the average running time of the HALM algorithm is at +most one-quarter of that of the fast operator-splitting-based +Deng-Glowinski-Tai algorithm. + +
+
+
+
+
+ + ☆ RestNet: Boosting Cross-Domain Few-Shot Segmentation with Residual + Transformation Network BMVC 2023 + + +
+ Cross-domain few-shot segmentation (CD-FSS) aims to achieve semantic +segmentation in previously unseen domains with a limited number of annotated +samples. Although existing CD-FSS models focus on cross-domain feature +transformation, relying exclusively on inter-domain knowledge transfer may lead +to the loss of critical intra-domain information. To this end, we propose a +novel residual transformation network (RestNet) that facilitates knowledge +transfer while retaining the intra-domain support-query feature information. +Specifically, we propose a Semantic Enhanced Anchor Transform (SEAT) module +that maps features to a stable domain-agnostic space using advanced semantics. +Additionally, an Intra-domain Residual Enhancement (IRE) module is designed to +maintain the intra-domain representation of the original discriminant space in +the new space. We also propose a mask prediction strategy based on prototype +fusion to help the model gradually learn how to segment. Our RestNet can +transfer cross-domain knowledge from both inter-domain and intra-domain without +requiring additional fine-tuning. Extensive experiments on ISIC, Chest X-ray, +and FSS-1000 show that our RestNet achieves state-of-the-art performance. Our +code will be available soon. + +
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ☆ Unlocking Fine-Grained Details with Wavelet-based High-Frequency + Enhancement in Transformers MICCAI 2023 + + +
+ Medical image segmentation is a critical task that plays a vital role in +diagnosis, treatment planning, and disease monitoring. Accurate segmentation of +anatomical structures and abnormalities from medical images can aid in the +early detection and treatment of various diseases. In this paper, we address +the local feature deficiency of the Transformer model by carefully re-designing +the self-attention map to produce accurate dense prediction in medical images. +To this end, we first apply the wavelet transformation to decompose the input +feature map into low-frequency (LF) and high-frequency (HF) subbands. The LF +segment is associated with coarse-grained features while the HF components +preserve fine-grained features such as texture and edge information. Next, we +reformulate the self-attention operation using the efficient Transformer to +perform both spatial and context attention on top of the frequency +representation. Furthermore, to intensify the importance of the boundary +information, we impose an additional attention map by creating a Gaussian +pyramid on top of the HF components. Moreover, we propose a multi-scale context +enhancement block within skip connections to adaptively model inter-scale +dependencies to overcome the semantic gap among stages of the encoder and +decoder modules. Throughout comprehensive experiments, we demonstrate the +effectiveness of our strategy on multi-organ and skin lesion segmentation +benchmarks. The implementation code will be available upon acceptance. +\href{https://github.com/mindflow-institue/WaveFormer}{GitHub}. + +
+
+ comment: Accepted in MICCAI 2023 workshop MLMI +
+
+
+
+
+ + ☆ Mesh-Wise Prediction of Demographic Composition from Satellite Images + Using Multi-Head Convolutional Neural Network + + +
+ Population aging is one of the most serious problems in certain countries. In +order to implement its countermeasures, understanding its rapid progress is of +urgency with a granular resolution. However, a detailed and rigorous survey +with high frequency is not feasible due to the constraints of financial and +human resources. Nowadays, Deep Learning is prevalent for pattern recognition +with significant accuracy, with its application to remote sensing. This paper +proposes a multi-head Convolutional Neural Network model with transfer learning +from pre-trained ResNet50 for estimating mesh-wise demographics of Japan as one +of the most aged countries in the world, with satellite images from +Landsat-8/OLI and Suomi NPP/VIIRS-DNS as inputs and census demographics as +labels. The trained model was performed on a testing dataset with a test score +of at least 0.8914 in $\text{R}^2$ for all the demographic composition groups, +and the estimated demographic composition was generated and visualised for 2022 +as a non-census year. + +
+
+
+
+
+ + ☆ Position-Enhanced Visual Instruction Tuning for Multimodal Large + Language Models + + +
+ Recently, Multimodal Large Language Models (MLLMs) that enable Large Language +Models (LLMs) to interpret images through visual instruction tuning have +achieved significant success. However, existing visual instruction tuning +methods only utilize image-language instruction data to align the language and +image modalities, lacking a more fine-grained cross-modal alignment. In this +paper, we propose Position-enhanced Visual Instruction Tuning (PVIT), which +extends the functionality of MLLMs by integrating an additional region-level +vision encoder. This integration promotes a more detailed comprehension of +images for the MLLM. In addition, to efficiently achieve a fine-grained +alignment between the vision modules and the LLM, we design multiple data +generation strategies to construct an image-region-language instruction +dataset. Finally, we present both quantitative experiments and qualitative +analysis that demonstrate the superiority of the proposed model. Code and data +will be released at https://github.com/THUNLP-MT/PVIT. + +
+
+
+
+
+ + ☆ Exploiting Diverse Feature for Multimodal Sentiment Analysis + + +
+ In this paper, we present our solution to the MuSe-Personalisation +sub-challenge in the MuSe 2023 Multimodal Sentiment Analysis Challenge. The +task of MuSe-Personalisation aims to predict the continuous arousal and valence +values of a participant based on their audio-visual, language, and +physiological signal modalities data. Considering different people have +personal characteristics, the main challenge of this task is how to build +robustness feature presentation for sentiment prediction. To address this +issue, we propose exploiting diverse features. Specifically, we proposed a +series of feature extraction methods to build a robust representation and model +ensemble. We empirically evaluate the performance of the utilized method on the +officially provided dataset. \textbf{As a result, we achieved 3rd place in the +MuSe-Personalisation sub-challenge.} Specifically, we achieve the results of +0.8492 and 0.8439 for MuSe-Personalisation in terms of arousal and valence CCC. + +
+
+
+
+
+ + ☆ Nougat: Neural Optical Understanding for Academic Documents + + +
+ Scientific knowledge is predominantly stored in books and scientific +journals, often in the form of PDFs. However, the PDF format leads to a loss of +semantic information, particularly for mathematical expressions. We propose +Nougat (Neural Optical Understanding for Academic Documents), a Visual +Transformer model that performs an Optical Character Recognition (OCR) task for +processing scientific documents into a markup language, and demonstrate the +effectiveness of our model on a new dataset of scientific documents. The +proposed approach offers a promising solution to enhance the accessibility of +scientific knowledge in the digital age, by bridging the gap between +human-readable documents and machine-readable text. We release the models and +code to accelerate future work on scientific text recognition. + +
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ An investigation into the impact of deep learning model choice on sex + and race bias in cardiac MR segmentation + + +
+ In medical imaging, artificial intelligence (AI) is increasingly being used +to automate routine tasks. However, these algorithms can exhibit and exacerbate +biases which lead to disparate performances between protected groups. We +investigate the impact of model choice on how imbalances in subject sex and +race in training datasets affect AI-based cine cardiac magnetic resonance image +segmentation. We evaluate three convolutional neural network-based models and +one vision transformer model. We find significant sex bias in three of the four +models and racial bias in all of the models. However, the severity and nature +of the bias varies between the models, highlighting the importance of model +choice when attempting to train fair AI-based segmentation models for medical +imaging tasks. + +
+
+
+
+
+ + ☆ Harvard Glaucoma Detection and Progression: A Multimodal Multitask + Dataset and Generalization-Reinforced Semi-Supervised Learning ICCV 2023 + + +
+ Glaucoma is the number one cause of irreversible blindness globally. A major +challenge for accurate glaucoma detection and progression forecasting is the +bottleneck of limited labeled patients with the state-of-the-art (SOTA) 3D +retinal imaging data of optical coherence tomography (OCT). To address the data +scarcity issue, this paper proposes two solutions. First, we develop a novel +generalization-reinforced semi-supervised learning (SSL) model called pseudo +supervisor to optimally utilize unlabeled data. Compared with SOTA models, the +proposed pseudo supervisor optimizes the policy of predicting pseudo labels +with unlabeled samples to improve empirical generalization. Our pseudo +supervisor model is evaluated with two clinical tasks consisting of glaucoma +detection and progression forecasting. The progression forecasting task is +evaluated both unimodally and multimodally. Our pseudo supervisor model +demonstrates superior performance than SOTA SSL comparison models. Moreover, +our model also achieves the best results on the publicly available LAG fundus +dataset. Second, we introduce the Harvard Glaucoma Detection and Progression +(Harvard-GDP) Dataset, a multimodal multitask dataset that includes data from +1,000 patients with OCT imaging data, as well as labels for glaucoma detection +and progression. This is the largest glaucoma detection dataset with 3D OCT +imaging data and the first glaucoma progression forecasting dataset that is +publicly available. Detailed sex and racial analysis are provided, which can be +used by interested researchers for fairness learning studies. Our released +dataset is benchmarked with several SOTA supervised CNN and transformer deep +learning models. The dataset and code are made publicly available via +\url{https://ophai.hms.harvard.edu/datasets/harvard-gdp1000}. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Using Visual and Vehicular Sensors for Driver Behavior Analysis: A + Survey + + +
+ Risky drivers account for 70% of fatal accidents in the United States. With +recent advances in sensors and intelligent vehicular systems, there has been +significant research on assessing driver behavior to improve driving +experiences and road safety. This paper examines the various techniques used to +analyze driver behavior using visual and vehicular data, providing an overview +of the latest research in this field. The paper also discusses the challenges +and open problems in the field and offers potential recommendations for future +research. The survey concludes that integrating vision and vehicular +information can significantly enhance the accuracy and effectiveness of driver +behavior analysis, leading to improved safety measures and reduced traffic +accidents. + +
+
+ comment: 10 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Relighting Neural Radiance Fields with Shadow and Highlight Hints SIGGRAPH 2023 + + +
+ This paper presents a novel neural implicit radiance representation for free +viewpoint relighting from a small set of unstructured photographs of an object +lit by a moving point light source different from the view position. We express +the shape as a signed distance function modeled by a multi layer perceptron. In +contrast to prior relightable implicit neural representations, we do not +disentangle the different reflectance components, but model both the local and +global reflectance at each point by a second multi layer perceptron that, in +addition, to density features, the current position, the normal (from the +signed distace function), view direction, and light position, also takes shadow +and highlight hints to aid the network in modeling the corresponding high +frequency light transport effects. These hints are provided as a suggestion, +and we leave it up to the network to decide how to incorporate these in the +final relit result. We demonstrate and validate our neural implicit +representation on synthetic and real scenes exhibiting a wide variety of +shapes, material properties, and global illumination light transport. + +
+
+ comment: Accepted to SIGGRAPH 2023. Author's version. Project page: + https://nrhints.github.io/ +
+
+
+
+
+ + ☆ Self-Supervised Representation Learning with Cross-Context Learning + between Global and Hypercolumn Features + + +
+ Whilst contrastive learning yields powerful representations by matching +different augmented views of the same instance, it lacks the ability to capture +the similarities between different instances. One popular way to address this +limitation is by learning global features (after the global pooling) to capture +inter-instance relationships based on knowledge distillation, where the global +features of the teacher are used to guide the learning of the global features +of the student. Inspired by cross-modality learning, we extend this existing +framework that only learns from global features by encouraging the global +features and intermediate layer features to learn from each other. This leads +to our novel self-supervised framework: cross-context learning between global +and hypercolumn features (CGH), that enforces the consistency of instance +relations between low- and high-level semantics. Specifically, we stack the +intermediate feature maps to construct a hypercolumn representation so that we +can measure instance relations using two contexts (hypercolumn and global +feature) separately, and then use the relations of one context to guide the +learning of the other. This cross-context learning allows the model to learn +from the differences between the two contexts. The experimental results on +linear classification and downstream tasks show that our method outperforms the +state-of-the-art methods. + +
+
+
+
+
+ + ☆ Direction-aware Video Demoireing with Temporal-guided Bilateral Learning + + +
+ Moire patterns occur when capturing images or videos on screens, severely +degrading the quality of the captured images or videos. Despite the recent +progresses, existing video demoireing methods neglect the physical +characteristics and formation process of moire patterns, significantly limiting +the effectiveness of video recovery. This paper presents a unified framework, +DTNet, a direction-aware and temporal-guided bilateral learning network for +video demoireing. DTNet effectively incorporates the process of moire pattern +removal, alignment, color correction, and detail refinement. Our proposed DTNet +comprises two primary stages: Frame-level Direction-aware Demoireing and +Alignment (FDDA) and Tone and Detail Refinement (TDR). In FDDA, we employ +multiple directional DCT modes to perform the moire pattern removal process in +the frequency domain, effectively detecting the prominent moire edges. Then, +the coarse and fine-grained alignment is applied on the demoired features for +facilitating the utilization of neighboring information. In TDR, we propose a +temporal-guided bilateral learning pipeline to mitigate the degradation of +color and details caused by the moire patterns while preserving the restored +frequency information in FDDA. Guided by the aligned temporal features from +FDDA, the affine transformations for the recovery of the ultimate clean frames +are learned in TDR. Extensive experiments demonstrate that our video demoireing +method outperforms state-of-the-art approaches by 2.3 dB in PSNR, and also +delivers a superior visual experience. + +
+
+
+
+
+ + ☆ Prompting Visual-Language Models for Dynamic Facial Expression + Recognition BMVC 2023 + + +
+ This paper presents a novel visual-language model called DFER-CLIP, which is +based on the CLIP model and designed for in-the-wild Dynamic Facial Expression +Recognition (DFER). Specifically, the proposed DFER-CLIP consists of a visual +part and a textual part. For the visual part, based on the CLIP image encoder, +a temporal model consisting of several Transformer encoders is introduced for +extracting temporal facial expression features, and the final feature embedding +is obtained as a learnable "class" token. For the textual part, we use as +inputs textual descriptions of the facial behaviour that is related to the +classes (facial expressions) that we are interested in recognising -- those +descriptions are generated using large language models, like ChatGPT. This, in +contrast to works that use only the class names and more accurately captures +the relationship between them. Alongside the textual description, we introduce +a learnable token which helps the model learn relevant context information for +each expression during training. Extensive experiments demonstrate the +effectiveness of the proposed method and show that our DFER-CLIP also achieves +state-of-the-art results compared with the current supervised DFER methods on +the DFEW, FERV39k, and MAFW benchmarks. Code is publicly available at +https://github.com/zengqunzhao/DFER-CLIP. + +
+
+ comment: Accepted at BMVC 2023 +
+
+
+
+
+ + ☆ Enhanced Mortality Prediction In Patients With Subarachnoid Haemorrhage + Using A Deep Learning Model Based On The Initial CT Scan + + +
+ PURPOSE: Subarachnoid hemorrhage (SAH) entails high morbidity and mortality +rates. Convolutional neural networks (CNN), a form of deep learning, are +capable of generating highly accurate predictions from imaging data. Our +objective was to predict mortality in SAH patients by processing the initial CT +scan on a CNN based algorithm. + METHODS: Retrospective multicentric study of a consecutive cohort of patients +with SAH between 2011-2022. Demographic, clinical and radiological variables +were analyzed. Pre-processed baseline CT scan images were used as the input for +training a CNN using AUCMEDI Framework. Our model's architecture leverages the +DenseNet-121 structure, employing transfer learning principles. The output +variable was mortality in the first three months. Performance of the model was +evaluated by statistical parameters conventionally used in studies involving +artificial intelligence methods. + RESULTS: Images from 219 patients were processed, 175 for training and +validation of the CNN and 44 for its evaluation. 52%(115/219) of patients were +female, and the median age was 58(SD=13.06) years. 18.5%(39/219) were +idiopathic SAH. Mortality rate was 28.5%(63/219). The model showed good +accuracy at predicting mortality in SAH patients exclusively using the images +of the initial CT scan (Accuracy=74%, F1=75% and AUC=82%). CONCLUSION: Modern +image processing techniques based on AI and CNN make possible to predict +mortality in SAH patients with high accuracy using CT scan images as the only +input. These models might be optimized by including more data and patients +resulting in better training, development and performance on tasks which are +beyond the skills of conventional clinical knowledge. + +
+
+
+
+
+ + ☆ Distribution-Aligned Diffusion for Human Mesh Recovery ICCV 2023 + + +
+ Recovering a 3D human mesh from a single RGB image is a challenging task due +to depth ambiguity and self-occlusion, resulting in a high degree of +uncertainty. Meanwhile, diffusion models have recently seen much success in +generating high-quality outputs by progressively denoising noisy inputs. +Inspired by their capability, we explore a diffusion-based approach for human +mesh recovery, and propose a Human Mesh Diffusion (HMDiff) framework which +frames mesh recovery as a reverse diffusion process. We also propose a +Distribution Alignment Technique (DAT) that injects input-specific distribution +information into the diffusion process, and provides useful prior knowledge to +simplify the mesh recovery task. Our method achieves state-of-the-art +performance on three widely used datasets. Project page: +https://gongjia0208.github.io/HMDiff/. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Burnt area extraction from high-resolution satellite images based on + anomaly detection ACL + + +
+ Wildfire detection using satellite images is a widely studied task in remote +sensing with many applications to fire delineation and mapping. Recently, deep +learning methods have become a scalable solution to automate this task, +especially in the field of unsupervised learning where no training data is +available. This is particularly important in the context of emergency risk +monitoring where fast and effective detection is needed, generally based on +high-resolution satellite data. Among various approaches, Anomaly Detection +(AD) appears to be highly potential thanks to its broad applications in +computer vision, medical imaging, as well as remote sensing. In this work, we +build upon the framework of Vector Quantized Variational Autoencoder (VQ-VAE), +a popular reconstruction-based AD method with discrete latent spaces, to +perform unsupervised burnt area extraction. We integrate VQ-VAE into an +end-to-end framework with an intensive post-processing step using dedicated +vegetation, water and brightness indexes. Our experiments conducted on +high-resolution SPOT-6/7 images provide promising results of the proposed +technique, showing its high potential in future research on unsupervised burnt +area extraction. + +
+
+ comment: 10 pages, accepted to the MACLEAN workshop of ECML/PKDD 2023 +
+
+
+
+
+ + ☆ CS-Mixer: A Cross-Scale Vision MLP Model with Spatial-Channel Mixing + + +
+ Despite their simpler information fusion designs compared with Vision +Transformers and Convolutional Neural Networks, Vision MLP architectures have +demonstrated strong performance and high data efficiency in recent research. +However, existing works such as CycleMLP and Vision Permutator typically model +spatial information in equal-size spatial regions and do not consider +cross-scale spatial interactions. Further, their token mixers only model 1- or +2-axis correlations, avoiding 3-axis spatial-channel mixing due to its +computational demands. We therefore propose CS-Mixer, a hierarchical Vision MLP +that learns dynamic low-rank transformations for spatial-channel mixing through +cross-scale local and global aggregation. The proposed methodology achieves +competitive results on popular image recognition benchmarks without incurring +substantially more compute. Our largest model, CS-Mixer-L, reaches 83.2% top-1 +accuracy on ImageNet-1k with 13.7 GFLOPs and 94 M parameters. + +
+
+ comment: 8 page, 5 figures, developed under Penn State University's + Multi-Campus Research Experience for Undergraduates Symposium, 2023 +
+
+
+
+
+ + ☆ CEIMVEN: An Approach of Cutting Edge Implementation of Modified Versions + of EfficientNet (V1-V2) Architecture for Breast Cancer Detection and + Classification from Ultrasound Images + + +
+ Undoubtedly breast cancer identifies itself as one of the most widespread and +terrifying cancers across the globe. Millions of women are getting affected +each year from it. Breast cancer remains the major one for being the reason of +largest number of demise of women. In the recent time of research, Medical +Image Computing and Processing has been playing a significant role for +detecting and classifying breast cancers from ultrasound images and mammograms, +along with the celestial touch of deep neural networks. In this research, we +focused mostly on our rigorous implementations and iterative result analysis of +different cutting-edge modified versions of EfficientNet architectures namely +EfficientNet-V1 (b0-b7) and EfficientNet-V2 (b0-b3) with ultrasound image, +named as CEIMVEN. We utilized transfer learning approach here for using the +pre-trained models of EfficientNet versions. We activated the hyper-parameter +tuning procedures, added fully connected layers, discarded the unprecedented +outliers and recorded the accuracy results from our custom modified +EfficientNet architectures. Our deep learning model training approach was +related to both identifying the cancer affected areas with region of interest +(ROI) techniques and multiple classifications (benign, malignant and normal). +The approximate testing accuracies we got from the modified versions of +EfficientNet-V1 (b0- 99.15%, b1- 98.58%, b2- 98.43%, b3- 98.01%, b4- 98.86%, +b5- 97.72%, b6- 97.72%, b7- 98.72%) and EfficientNet-V2 (b0- 99.29%, b1- +99.01%, b2- 98.72%, b3- 99.43%) are showing very bright future and strong +potentials of deep learning approach for the successful detection and +classification of breast cancers from the ultrasound images at a very early +stage. + +
+
+
+
+
+ + ☆ Squeeze aggregated excitation network + + +
+ Convolutional neural networks have spatial representations which read +patterns in the vision tasks. Squeeze and excitation links the channel wise +representations by explicitly modeling on channel level. Multi layer +perceptrons learn global representations and in most of the models it is used +often at the end after all convolutional layers to gather all the information +learned before classification. We propose a method of inducing the global +representations within channels to have better performance of the model. We +propose SaEnet, Squeeze aggregated excitation network, for learning global +channelwise representation in between layers. The proposed module takes +advantage of passing important information after squeeze by having aggregated +excitation before regaining its shape. We also introduce a new idea of having a +multibranch linear(dense) layer in the network. This learns global +representations from the condensed information which enhances the +representational power of the network. The proposed module have undergone +extensive experiments by using Imagenet and CIFAR100 datasets and compared with +closely related architectures. The analyzes results that proposed models +outputs are comparable and in some cases better than existing state of the art +architectures. + +
+
+ comment: 8 pages, 5 figures, 3 tables +
+
+
+
+
+ + ☆ TriGait: Aligning and Fusing Skeleton and Silhouette Gait Data via a + Tri-Branch Network + + +
+ Gait recognition is a promising biometric technology for identification due +to its non-invasiveness and long-distance. However, external variations such as +clothing changes and viewpoint differences pose significant challenges to gait +recognition. Silhouette-based methods preserve body shape but neglect internal +structure information, while skeleton-based methods preserve structure +information but omit appearance. To fully exploit the complementary nature of +the two modalities, a novel triple branch gait recognition framework, TriGait, +is proposed in this paper. It effectively integrates features from the skeleton +and silhouette data in a hybrid fusion manner, including a two-stream network +to extract static and motion features from appearance, a simple yet effective +module named JSA-TC to capture dependencies between all joints, and a third +branch for cross-modal learning by aligning and fusing low-level features of +two modalities. Experimental results demonstrate the superiority and +effectiveness of TriGait for gait recognition. The proposed method achieves a +mean rank-1 accuracy of 96.0% over all conditions on CASIA-B dataset and 94.3% +accuracy for CL, significantly outperforming all the state-of-the-art methods. +The source code will be available at https://github.com/feng-xueling/TriGait/. + +
+
+ comment: Accepted by IJCB 2023 +
+
+
+
+
+ + ☆ A Re-Parameterized Vision Transformer (ReVT) for Domain-Generalized + Semantic Segmentation + + +
+ The task of semantic segmentation requires a model to assign semantic labels +to each pixel of an image. However, the performance of such models degrades +when deployed in an unseen domain with different data distributions compared to +the training domain. We present a new augmentation-driven approach to domain +generalization for semantic segmentation using a re-parameterized vision +transformer (ReVT) with weight averaging of multiple models after training. We +evaluate our approach on several benchmark datasets and achieve +state-of-the-art mIoU performance of 47.3% (prior art: 46.3%) for small models +and of 50.1% (prior art: 47.8%) for midsized models on commonly used benchmark +datasets. At the same time, our method requires fewer parameters and reaches a +higher frame rate than the best prior art. It is also easy to implement and, +unlike network ensembles, does not add any computational complexity during +inference. + +
+
+
+
+
+ + ☆ 3D Face Alignment Through Fusion of Head Pose Information and Features + + +
+ The ability of humans to infer head poses from face shapes, and vice versa, +indicates a strong correlation between the two. Accordingly, recent studies on +face alignment have employed head pose information to predict facial landmarks +in computer vision tasks. In this study, we propose a novel method that employs +head pose information to improve face alignment performance by fusing said +information with the feature maps of a face alignment network, rather than +simply using it to initialize facial landmarks. Furthermore, the proposed +network structure performs robust face alignment through a dual-dimensional +network using multidimensional features represented by 2D feature maps and a 3D +heatmap. For effective dense face alignment, we also propose a prediction +method for facial geometric landmarks through training based on knowledge +distillation using predicted keypoints. We experimentally assessed the +correlation between the predicted facial landmarks and head pose information, +as well as variations in the accuracy of facial landmarks with respect to the +quality of head pose information. In addition, we demonstrated the +effectiveness of the proposed method through a competitive performance +comparison with state-of-the-art methods on the AFLW2000-3D, AFLW, and BIWI +datasets. + +
+
+
+
+
+ + ☆ ConSlide: Asynchronous Hierarchical Interaction Transformer with + Breakup-Reorganize Rehearsal for Continual Whole Slide Image Analysis ICCV 2023 + + +
+ Whole slide image (WSI) analysis has become increasingly important in the +medical imaging community, enabling automated and objective diagnosis, +prognosis, and therapeutic-response prediction. However, in clinical practice, +the ever-evolving environment hamper the utility of WSI analysis models. In +this paper, we propose the FIRST continual learning framework for WSI analysis, +named ConSlide, to tackle the challenges of enormous image size, utilization of +hierarchical structure, and catastrophic forgetting by progressive model +updating on multiple sequential datasets. Our framework contains three key +components. The Hierarchical Interaction Transformer (HIT) is proposed to model +and utilize the hierarchical structural knowledge of WSI. The +Breakup-Reorganize (BuRo) rehearsal method is developed for WSI data replay +with efficient region storing buffer and WSI reorganizing operation. The +asynchronous updating mechanism is devised to encourage the network to learn +generic and specific knowledge respectively during the replay stage, based on a +nested cross-scale similarity learning (CSSL) module. We evaluated the proposed +ConSlide on four public WSI datasets from TCGA projects. It performs best over +other state-of-the-art methods with a fair WSI-based continual learning setting +and achieves a better trade-off of the overall performance and forgetting on +previous task + +
+
+ comment: To be appeared in ICCV 2023 +
+
+
+
+
+ + ☆ SVQNet: Sparse Voxel-Adjacent Query Network for 4D Spatio-Temporal LiDAR + Semantic Segmentation ICCV2023 + + +
+ LiDAR-based semantic perception tasks are critical yet challenging for +autonomous driving. Due to the motion of objects and static/dynamic occlusion, +temporal information plays an essential role in reinforcing perception by +enhancing and completing single-frame knowledge. Previous approaches either +directly stack historical frames to the current frame or build a 4D +spatio-temporal neighborhood using KNN, which duplicates computation and +hinders realtime performance. Based on our observation that stacking all the +historical points would damage performance due to a large amount of redundant +and misleading information, we propose the Sparse Voxel-Adjacent Query Network +(SVQNet) for 4D LiDAR semantic segmentation. To take full advantage of the +historical frames high-efficiently, we shunt the historical points into two +groups with reference to the current points. One is the Voxel-Adjacent +Neighborhood carrying local enhancing knowledge. The other is the Historical +Context completing the global knowledge. Then we propose new modules to select +and extract the instructive features from the two groups. Our SVQNet achieves +state-of-the-art performance in LiDAR semantic segmentation of the +SemanticKITTI benchmark and the nuScenes dataset. + +
+
+ comment: Received by ICCV2023 +
+
+
+
+
+ + ☆ Fine-tuning can cripple your foundation model; preserving features may + be the solution + + +
+ Pre-trained foundation models, owing primarily to their enormous capacity and +exposure to vast amount of training data scraped from the internet, enjoy the +advantage of storing knowledge about plenty of real-world concepts. Such models +are typically fine-tuned on downstream datasets to produce remarkable +state-of-the-art performances. While various fine-tuning methods have been +devised and are shown to be highly effective, we observe that a fine-tuned +model's ability to recognize concepts on tasks $\textit{different}$ from the +downstream one is reduced significantly compared to its pre-trained +counterpart. This is clearly undesirable as a huge amount of time and money +went into learning those very concepts in the first place. We call this +undesirable phenomenon "concept forgetting" and via experiments show that most +end-to-end fine-tuning approaches suffer heavily from this side effect. To this +end, we also propose a rather simple fix to this problem by designing a method +called LDIFS (short for $\ell_2$ distance in feature space) that simply +preserves the features of the original foundation model during fine-tuning. We +show that LDIFS significantly reduces concept forgetting without having +noticeable impact on the downstream task performance. + +
+
+
+
+
+ + ☆ Dynamic Residual Classifier for Class Incremental Learning + + +
+ The rehearsal strategy is widely used to alleviate the catastrophic +forgetting problem in class incremental learning (CIL) by preserving limited +exemplars from previous tasks. With imbalanced sample numbers between old and +new classes, the classifier learning can be biased. Existing CIL methods +exploit the long-tailed (LT) recognition techniques, e.g., the adjusted losses +and the data re-sampling methods, to handle the data imbalance issue within +each increment task. In this work, the dynamic nature of data imbalance in CIL +is shown and a novel Dynamic Residual Classifier (DRC) is proposed to handle +this challenging scenario. Specifically, DRC is built upon a recent advance +residual classifier with the branch layer merging to handle the model-growing +problem. Moreover, DRC is compatible with different CIL pipelines and +substantially improves them. Combining DRC with the model adaptation and fusion +(MAF) pipeline, this method achieves state-of-the-art results on both the +conventional CIL and the LT-CIL benchmarks. Extensive experiments are also +conducted for a detailed analysis. The code is publicly available. + +
+
+
+
+
+ + ☆ Bang and the Artefacts are Gone! Rapid Artefact Removal and Tissue + Segmentation in Haematoxylin and Eosin Stained Biopsies + + +
+ We present H&E Otsu thresholding, a scheme for rapidly detecting tissue in +whole-slide images (WSIs) that eliminates a wide range of undesirable artefacts +such as pen marks and scanning artefacts. Our method involves obtaining a +bid-modal representation of a low-magnification RGB overview image which +enables simple Otsu thresholding to separate tissue from background and +artefacts. We demonstrate our method on WSIs prepared from a wide range of +institutions and WSI digital scanners, each containing substantial artefacts +that cause other methods to fail. The beauty of our approach lies in its +simplicity: manipulating RGB colour space and using Otsu thresholding allows +for the rapid removal of artefacts and segmentation of tissue. + +
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ Learning Compact Neural Networks with Deep Overparameterised Multitask + Learning IJCAI2023 + + +
+ Compact neural network offers many benefits for real-world applications. +However, it is usually challenging to train the compact neural networks with +small parameter sizes and low computational costs to achieve the same or better +model performance compared to more complex and powerful architecture. This is +particularly true for multitask learning, with different tasks competing for +resources. We present a simple, efficient and effective multitask learning +overparameterisation neural network design by overparameterising the model +architecture in training and sharing the overparameterised model parameters +more effectively across tasks, for better optimisation and generalisation. +Experiments on two challenging multitask datasets (NYUv2 and COCO) demonstrate +the effectiveness of the proposed method across various convolutional networks +and parameter sizes. + +
+
+ comment: Accepted for IJCAI2023 workshop, 1st International Workshop on + Generalizing from Limited Resources in the Open World +
+
+
+
+
+ + ☆ Unsupervised Domain Adaptation for Anatomical Landmark Detection MICCAI 2023 + + +
+ Recently, anatomical landmark detection has achieved great progresses on +single-domain data, which usually assumes training and test sets are from the +same domain. However, such an assumption is not always true in practice, which +can cause significant performance drop due to domain shift. To tackle this +problem, we propose a novel framework for anatomical landmark detection under +the setting of unsupervised domain adaptation (UDA), which aims to transfer the +knowledge from labeled source domain to unlabeled target domain. The framework +leverages self-training and domain adversarial learning to address the domain +gap during adaptation. Specifically, a self-training strategy is proposed to +select reliable landmark-level pseudo-labels of target domain data with dynamic +thresholds, which makes the adaptation more effective. Furthermore, a domain +adversarial learning module is designed to handle the unaligned data +distributions of two domains by learning domain-invariant features via +adversarial training. Our experiments on cephalometric and lung landmark +detection show the effectiveness of the method, which reduces the domain gap by +a large margin and outperforms other UDA methods consistently. The code is +available at https://github.com/jhb86253817/UDA_Med_Landmark. + +
+
+ comment: Accepted to MICCAI 2023 +
+
+
+
+
+ + ☆ Bridging the Gap: Fine-to-Coarse Sketch Interpolation Network for + High-Quality Animation Sketch Inbetweening + + +
+ The 2D animation workflow is typically initiated with the creation of +keyframes using sketch-based drawing. Subsequent inbetweens (i.e., intermediate +sketch frames) are crafted through manual interpolation for smooth animations, +which is a labor-intensive process. Thus, the prospect of automatic animation +sketch interpolation has become highly appealing. However, existing video +interpolation methods are generally hindered by two key issues for sketch +inbetweening: 1) limited texture and colour details in sketches, and 2) +exaggerated alterations between two sketch keyframes. To overcome these issues, +we propose a novel deep learning method, namely Fine-to-Coarse Sketch +Interpolation Network (FC-SIN). This approach incorporates multi-level guidance +that formulates region-level correspondence, sketch-level correspondence and +pixel-level dynamics. A multi-stream U-Transformer is then devised to +characterize sketch inbewteening patterns using these multi-level guides +through the integration of both self-attention and cross-attention mechanisms. +Additionally, to facilitate future research on animation sketch inbetweening, +we constructed a large-scale dataset - STD-12K, comprising 30 sketch animation +series in diverse artistic styles. Comprehensive experiments on this dataset +convincingly show that our proposed FC-SIN surpasses the state-of-the-art +interpolation methods. Our code and dataset will be publicly available. + +
+
+ comment: 7pages,6figures +
+
+
+
+
+ + ☆ A Game of Bundle Adjustment -- Learning Efficient Convergence + + +
+ Bundle adjustment is the common way to solve localization and mapping. It is +an iterative process in which a system of non-linear equations is solved using +two optimization methods, weighted by a damping factor. In the classic +approach, the latter is chosen heuristically by the Levenberg-Marquardt +algorithm on each iteration. This might take many iterations, making the +process computationally expensive, which might be harmful to real-time +applications. We propose to replace this heuristic by viewing the problem in a +holistic manner, as a game, and formulating it as a reinforcement-learning +task. We set an environment which solves the non-linear equations and train an +agent to choose the damping factor in a learned manner. We demonstrate that our +approach considerably reduces the number of iterations required to reach the +bundle adjustment's convergence, on both synthetic and real-life scenarios. We +show that this reduction benefits the classic approach and can be integrated +with other bundle adjustment acceleration methods. + +
+
+
+
+
+ + ☆ Integrating Boxes and Masks: A Multi-Object Framework for Unified Visual + Tracking and Segmentation ICCV2023 + + +
+ Tracking any given object(s) spatially and temporally is a common purpose in +Visual Object Tracking (VOT) and Video Object Segmentation (VOS). Joint +tracking and segmentation have been attempted in some studies but they often +lack full compatibility of both box and mask in initialization and prediction, +and mainly focus on single-object scenarios. To address these limitations, this +paper proposes a Multi-object Mask-box Integrated framework for unified +Tracking and Segmentation, dubbed MITS. Firstly, the unified identification +module is proposed to support both box and mask reference for initialization, +where detailed object information is inferred from boxes or directly retained +from masks. Additionally, a novel pinpoint box predictor is proposed for +accurate multi-object box prediction, facilitating target-oriented +representation learning. All target objects are processed simultaneously from +encoding to propagation and decoding, as a unified pipeline for VOT and VOS. +Experimental results show MITS achieves state-of-the-art performance on both +VOT and VOS benchmarks. Notably, MITS surpasses the best prior VOT competitor +by around 6% on the GOT-10k test set, and significantly improves the +performance of box initialization on VOS benchmarks. The code is available at +https://github.com/yoxu515/MITS. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ Kissing to Find a Match: Efficient Low-Rank Permutation Representation + + +
+ Permutation matrices play a key role in matching and assignment problems +across the fields, especially in computer vision and robotics. However, memory +for explicitly representing permutation matrices grows quadratically with the +size of the problem, prohibiting large problem instances. In this work, we +propose to tackle the curse of dimensionality of large permutation matrices by +approximating them using low-rank matrix factorization, followed by a +nonlinearity. To this end, we rely on the Kissing number theory to infer the +minimal rank required for representing a permutation matrix of a given size, +which is significantly smaller than the problem size. This leads to a drastic +reduction in computation and memory costs, e.g., up to $3$ orders of magnitude +less memory for a problem of size $n=20000$, represented using $8.4\times10^5$ +elements in two small matrices instead of using a single huge matrix with +$4\times 10^8$ elements. The proposed representation allows for accurate +representations of large permutation matrices, which in turn enables handling +large problems that would have been infeasible otherwise. We demonstrate the +applicability and merits of the proposed approach through a series of +experiments on a range of problems that involve predicting permutation +matrices, from linear and quadratic assignment to shape matching problems. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ Unpaired Multi-domain Attribute Translation of 3D Facial Shapes with a + Square and Symmetric Geometric Map + + +
+ While impressive progress has recently been made in image-oriented facial +attribute translation, shape-oriented 3D facial attribute translation remains +an unsolved issue. This is primarily limited by the lack of 3D generative +models and ineffective usage of 3D facial data. We propose a learning framework +for 3D facial attribute translation to relieve these limitations. Firstly, we +customize a novel geometric map for 3D shape representation and embed it in an +end-to-end generative adversarial network. The geometric map represents 3D +shapes symmetrically on a square image grid, while preserving the neighboring +relationship of 3D vertices in a local least-square sense. This enables +effective learning for the latent representation of data with different +attributes. Secondly, we employ a unified and unpaired learning framework for +multi-domain attribute translation. It not only makes effective usage of data +correlation from multiple domains, but also mitigates the constraint for hardly +accessible paired data. Finally, we propose a hierarchical architecture for the +discriminator to guarantee robust results against both global and local +artifacts. We conduct extensive experiments to demonstrate the advantage of the +proposed framework over the state-of-the-art in generating high-fidelity facial +shapes. Given an input 3D facial shape, the proposed framework is able to +synthesize novel shapes of different attributes, which covers some downstream +applications, such as expression transfer, gender translation, and aging. Code +at https://github.com/NaughtyZZ/3D_facial_shape_attribute_translation_ssgmap. + +
+
+
+
+
+ + ☆ Black-box Unsupervised Domain Adaptation with Bi-directional + Atkinson-Shiffrin Memory ICCV2023 + + +
+ Black-box unsupervised domain adaptation (UDA) learns with source predictions +of target data without accessing either source data or source models during +training, and it has clear superiority in data privacy and flexibility in +target network selection. However, the source predictions of target data are +often noisy and training with them is prone to learning collapses. We propose +BiMem, a bi-directional memorization mechanism that learns to remember useful +and representative information to correct noisy pseudo labels on the fly, +leading to robust black-box UDA that can generalize across different visual +recognition tasks. BiMem constructs three types of memory, including sensory +memory, short-term memory, and long-term memory, which interact in a +bi-directional manner for comprehensive and robust memorization of learnt +features. It includes a forward memorization flow that identifies and stores +useful features and a backward calibration flow that rectifies features' pseudo +labels progressively. Extensive experiments show that BiMem achieves superior +domain adaptation performance consistently across various visual recognition +tasks such as image classification, semantic segmentation and object detection. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ ReST: A Reconfigurable Spatial-Temporal Graph Model for Multi-Camera + Multi-Object Tracking ICCV2023 + + +
+ Multi-Camera Multi-Object Tracking (MC-MOT) utilizes information from +multiple views to better handle problems with occlusion and crowded scenes. +Recently, the use of graph-based approaches to solve tracking problems has +become very popular. However, many current graph-based methods do not +effectively utilize information regarding spatial and temporal consistency. +Instead, they rely on single-camera trackers as input, which are prone to +fragmentation and ID switch errors. In this paper, we propose a novel +reconfigurable graph model that first associates all detected objects across +cameras spatially before reconfiguring it into a temporal graph for Temporal +Association. This two-stage association approach enables us to extract robust +spatial and temporal-aware features and address the problem with fragmented +tracklets. Furthermore, our model is designed for online tracking, making it +suitable for real-world applications. Experimental results show that the +proposed graph model is able to extract more discriminating features for object +tracking, and our model achieves state-of-the-art performance on several public +datasets. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ DPF-Net: Combining Explicit Shape Priors in Deformable Primitive Field + for Unsupervised Structural Reconstruction of 3D Objects + + +
+ Unsupervised methods for reconstructing structures face significant +challenges in capturing the geometric details with consistent structures among +diverse shapes of the same category. To address this issue, we present a novel +unsupervised structural reconstruction method, named DPF-Net, based on a new +Deformable Primitive Field (DPF) representation, which allows for high-quality +shape reconstruction using parameterized geometric primitives. We design a +two-stage shape reconstruction pipeline which consists of a primitive +generation module and a primitive deformation module to approximate the target +shape of each part progressively. The primitive generation module estimates the +explicit orientation, position, and size parameters of parameterized geometric +primitives, while the primitive deformation module predicts a dense deformation +field based on a parameterized primitive field to recover shape details. The +strong shape prior encoded in parameterized geometric primitives enables our +DPF-Net to extract high-level structures and recover fine-grained shape details +consistently. The experimental results on three categories of objects in +diverse shapes demonstrate the effectiveness and generalization ability of our +DPF-Net on structural reconstruction and shape segmentation. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ EfficientDreamer: High-Fidelity and Robust 3D Creation via + Orthogonal-view Diffusion Prior + + +
+ While the image diffusion model has made significant strides in text-driven +3D content creation, it often falls short in accurately capturing the intended +meaning of the text prompt, particularly with respect to direction information. +This shortcoming gives rise to the Janus problem, where multi-faced 3D models +are produced with the guidance of such diffusion models. In this paper, we +present a robust pipeline for generating high-fidelity 3D content with +orthogonal-view image guidance. Specifically, we introduce a novel 2D diffusion +model that generates an image consisting of four orthogonal-view sub-images for +the given text prompt. The 3D content is then created with this diffusion +model, which enhances 3D consistency and provides strong structured semantic +priors. This addresses the infamous Janus problem and significantly promotes +generation efficiency. Additionally, we employ a progressive 3D synthesis +strategy that results in substantial improvement in the quality of the created +3D contents. Both quantitative and qualitative evaluations show that our method +demonstrates a significant improvement over previous text-to-3D techniques. + +
+
+
+
+
+ + ☆ MultiCapCLIP: Auto-Encoding Prompts for Zero-Shot Multilingual Visual + Captioning ACL'2023 + + +
+ Supervised visual captioning models typically require a large scale of images +or videos paired with descriptions in a specific language (i.e., the +vision-caption pairs) for training. However, collecting and labeling +large-scale datasets is time-consuming and expensive for many scenarios and +languages. Therefore, sufficient labeled pairs are usually not available. To +deal with the label shortage problem, we present a simple yet effective +zero-shot approach MultiCapCLIP that can generate visual captions for different +scenarios and languages without any labeled vision-caption pairs of downstream +datasets. In the training stage, MultiCapCLIP only requires text data for +input. Then it conducts two main steps: 1) retrieving concept prompts that +preserve the corresponding domain knowledge of new scenarios; 2) auto-encoding +the prompts to learn writing styles to output captions in a desired language. +In the testing stage, MultiCapCLIP instead takes visual data as input directly +to retrieve the concept prompts to generate the final visual descriptions. The +extensive experiments on image and video captioning across four benchmarks and +four languages (i.e., English, Chinese, German, and French) confirm the +effectiveness of our approach. Compared with state-of-the-art zero-shot and +weakly-supervised methods, our method achieves 4.8% and 21.5% absolute +improvements in terms of BLEU@4 and CIDEr metrics. Our code is available at +https://github.com/yangbang18/MultiCapCLIP. + +
+
+ comment: ACL'2023, 13 pages, 4 figures +
+
+
+
+
+ + ☆ GEMTrans: A General, Echocardiography-based, Multi-Level Transformer + Framework for Cardiovascular Diagnosis + + +
+ Echocardiography (echo) is an ultrasound imaging modality that is widely used +for various cardiovascular diagnosis tasks. Due to inter-observer variability +in echo-based diagnosis, which arises from the variability in echo image +acquisition and the interpretation of echo images based on clinical experience, +vision-based machine learning (ML) methods have gained popularity to act as +secondary layers of verification. For such safety-critical applications, it is +essential for any proposed ML method to present a level of explainability along +with good accuracy. In addition, such methods must be able to process several +echo videos obtained from various heart views and the interactions among them +to properly produce predictions for a variety of cardiovascular measurements or +interpretation tasks. Prior work lacks explainability or is limited in scope by +focusing on a single cardiovascular task. To remedy this, we propose a General, +Echo-based, Multi-Level Transformer (GEMTrans) framework that provides +explainability, while simultaneously enabling multi-video training where the +inter-play among echo image patches in the same frame, all frames in the same +video, and inter-video relationships are captured based on a downstream task. +We show the flexibility of our framework by considering two critical tasks +including ejection fraction (EF) and aortic stenosis (AS) severity detection. +Our model achieves mean absolute errors of 4.15 and 4.84 for single and +dual-video EF estimation and an accuracy of 96.5 % for AS detection, while +providing informative task-specific attention maps and prototypical +explainability. + +
+
+ comment: To be published in MLMI 2023 +
+
+
+
+
+ + ☆ Self-supervised learning for hotspot detection and isolation from + thermal images + + +
+ Hotspot detection using thermal imaging has recently become essential in +several industrial applications, such as security applications, health +applications, and equipment monitoring applications. Hotspot detection is of +utmost importance in industrial safety where equipment can develop anomalies. +Hotspots are early indicators of such anomalies. We address the problem of +hotspot detection in thermal images by proposing a self-supervised learning +approach. Self-supervised learning has shown potential as a competitive +alternative to their supervised learning counterparts but their application to +thermography has been limited. This has been due to lack of diverse data +availability, domain specific pre-trained models, standardized benchmarks, etc. +We propose a self-supervised representation learning approach followed by +fine-tuning that improves detection of hotspots by classification. The SimSiam +network based ensemble classifier decides whether an image contains hotspots or +not. Detection of hotspots is followed by precise hotspot isolation. By doing +so, we are able to provide a highly accurate and precise hotspot +identification, applicable to a wide range of applications. We created a novel +large thermal image dataset to address the issue of paucity of easily +accessible thermal images. Our experiments with the dataset created by us and a +publicly available segmentation dataset show the potential of our approach for +hotspot detection and its ability to isolate hotspots with high accuracy. We +achieve a Dice Coefficient of 0.736, the highest when compared with existing +hotspot identification techniques. Our experiments also show self-supervised +learning as a strong contender of supervised learning, providing competitive +metrics for hotspot detection, with the highest accuracy of our approach being +97%. + +
+
+
+
+
+ + ☆ Deep Active Audio Feature Learning in Resource-Constrained Environments + + +
+ The scarcity of labelled data makes training Deep Neural Network (DNN) models +in bioacoustic applications challenging. In typical bioacoustics applications, +manually labelling the required amount of data can be prohibitively expensive. +To effectively identify both new and current classes, DNN models must continue +to learn new features from a modest amount of fresh data. Active Learning (AL) +is an approach that can help with this learning while requiring little +labelling effort. Nevertheless, the use of fixed feature extraction approaches +limits feature quality, resulting in underutilization of the benefits of AL. We +describe an AL framework that addresses this issue by incorporating feature +extraction into the AL loop and refining the feature extractor after each round +of manual annotation. In addition, we use raw audio processing rather than +spectrograms, which is a novel approach. Experiments reveal that the proposed +AL framework requires 14.3%, 66.7%, and 47.4% less labelling effort on +benchmark audio datasets ESC-50, UrbanSound8k, and InsectWingBeat, +respectively, for a large DNN model and similar savings on a +microcontroller-based counterpart. Furthermore, we showcase the practical +relevance of our study by incorporating data from conservation biology +projects. + +
+
+
+
+
+ + ☆ STRIDE: Street View-based Environmental Feature Detection and Pedestrian + Collision Prediction + + +
+ This paper introduces a novel benchmark to study the impact and relationship +of built environment elements on pedestrian collision prediction, intending to +enhance environmental awareness in autonomous driving systems to prevent +pedestrian injuries actively. We introduce a built environment detection task +in large-scale panoramic images and a detection-based pedestrian collision +frequency prediction task. We propose a baseline method that incorporates a +collision prediction module into a state-of-the-art detection model to tackle +both tasks simultaneously. Our experiments demonstrate a significant +correlation between object detection of built environment elements and +pedestrian collision frequency prediction. Our results are a stepping stone +towards understanding the interdependencies between built environment +conditions and pedestrian safety. + +
+
+
+
+
+ + ☆ Structural Cycle GAN for Virtual Immunohistochemistry Staining of Gland + Markers in the Colon MICCAI + + +
+ With the advent of digital scanners and deep learning, diagnostic operations +may move from a microscope to a desktop. Hematoxylin and Eosin (H&E) staining +is one of the most frequently used stains for disease analysis, diagnosis, and +grading, but pathologists do need different immunohistochemical (IHC) stains to +analyze specific structures or cells. Obtaining all of these stains (H&E and +different IHCs) on a single specimen is a tedious and time-consuming task. +Consequently, virtual staining has emerged as an essential research direction. +Here, we propose a novel generative model, Structural Cycle-GAN (SC-GAN), for +synthesizing IHC stains from H&E images, and vice versa. Our method expressly +incorporates structural information in the form of edges (in addition to color +data) and employs attention modules exclusively in the decoder of the proposed +generator model. This integration enhances feature localization and preserves +contextual information during the generation process. In addition, a structural +loss is incorporated to ensure accurate structure alignment between the +generated and input markers. To demonstrate the efficacy of the proposed model, +experiments are conducted with two IHC markers emphasizing distinct structures +of glands in the colon: the nucleus of epithelial cells (CDX2) and the +cytoplasm (CK818). Quantitative metrics such as FID and SSIM are frequently +used for the analysis of generative models, but they do not correlate +explicitly with higher-quality virtual staining results. Therefore, we propose +two new quantitative metrics that correlate directly with the virtual staining +specificity of IHC markers. + +
+
+ comment: Accepted to MICCAI Workshop 2023 +
+
+
+
+
+ + ☆ Self-supervised Scene Text Segmentation with Object-centric Layered + Representations Augmented by Text Regions + + +
+ Text segmentation tasks have a very wide range of application values, such as +image editing, style transfer, watermark removal, etc.However, existing public +datasets are of poor quality of pixel-level labels that have been shown to be +notoriously costly to acquire, both in terms of money and time. At the same +time, when pretraining is performed on synthetic datasets, the data +distribution of the synthetic datasets is far from the data distribution in the +real scene. These all pose a huge challenge to the current pixel-level text +segmentation algorithms.To alleviate the above problems, we propose a +self-supervised scene text segmentation algorithm with layered decoupling of +representations derived from the object-centric manner to segment images into +texts and background. In our method, we propose two novel designs which include +Region Query Module and Representation Consistency Constraints adapting to the +unique properties of text as complements to Auto Encoder, which improves the +network's sensitivity to texts.For this unique design, we treat the +polygon-level masks predicted by the text localization model as extra input +information, and neither utilize any pixel-level mask annotations for training +stage nor pretrain on synthetic datasets.Extensive experiments show the +effectiveness of the method proposed. On several public scene text datasets, +our method outperforms the state-of-the-art unsupervised segmentation +algorithms. + +
+
+
+
+
+ + ☆ How to Evaluate the Generalization of Detection? A Benchmark for + Comprehensive Open-Vocabulary Detection + + +
+ Object detection (OD) in computer vision has made significant progress in +recent years, transitioning from closed-set labels to open-vocabulary detection +(OVD) based on large-scale vision-language pre-training (VLP). However, current +evaluation methods and datasets are limited to testing generalization over +object types and referral expressions, which do not provide a systematic, +fine-grained, and accurate benchmark of OVD models' abilities. In this paper, +we propose a new benchmark named OVDEval, which includes 9 sub-tasks and +introduces evaluations on commonsense knowledge, attribute understanding, +position understanding, object relation comprehension, and more. The dataset is +meticulously created to provide hard negatives that challenge models' true +understanding of visual and linguistic input. Additionally, we identify a +problem with the popular Average Precision (AP) metric when benchmarking models +on these fine-grained label datasets and propose a new metric called +Non-Maximum Suppression Average Precision (NMS-AP) to address this issue. +Extensive experimental results show that existing top OVD models all fail on +the new tasks except for simple object types, demonstrating the value of the +proposed dataset in pinpointing the weakness of current OVD models and guiding +future research. Furthermore, the proposed NMS-AP metric is verified by +experiments to provide a much more truthful evaluation of OVD models, whereas +traditional AP metrics yield deceptive results. Data is available at +\url{https://github.com/om-ai-lab/OVDEval} + +
+
+
+
+
+ + ☆ GridPull: Towards Scalability in Learning Implicit Representations from + 3D Point Clouds ICCV 2023 + + +
+ Learning implicit representations has been a widely used solution for surface +reconstruction from 3D point clouds. The latest methods infer a distance or +occupancy field by overfitting a neural network on a single point cloud. +However, these methods suffer from a slow inference due to the slow convergence +of neural networks and the extensive calculation of distances to surface +points, which limits them to small scale points. To resolve the scalability +issue in surface reconstruction, we propose GridPull to improve the efficiency +of learning implicit representations from large scale point clouds. Our novelty +lies in the fast inference of a discrete distance field defined on grids +without using any neural components. To remedy the lack of continuousness +brought by neural networks, we introduce a loss function to encourage +continuous distances and consistent gradients in the field during pulling +queries onto the surface in grids near to the surface. We use uniform grids for +a fast grid search to localize sampled queries, and organize surface points in +a tree structure to speed up the calculation of distances to the surface. We do +not rely on learning priors or normal supervision during optimization, and +achieve superiority over the latest methods in terms of complexity and +accuracy. We evaluate our method on shape and scene benchmarks, and report +numerical and visual comparisons with the latest methods to justify our +effectiveness and superiority. The code is available at +https://github.com/chenchao15/GridPull. + +
+
+ comment: 13pages,14figures. Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Interactive segmentation in aerial images: a new benchmark and an open + access web-based tool + + +
+ In recent years, deep learning has emerged as a powerful approach in remote +sensing applications, particularly in segmentation and classification +techniques that play a crucial role in extracting significant land features +from satellite and aerial imagery. However, only a limited number of papers +have discussed the use of deep learning for interactive segmentation in land +cover classification tasks. In this study, we aim to bridge the gap between +interactive segmentation and remote sensing image analysis by conducting a +benchmark study on various deep learning-based interactive segmentation models. +We assessed the performance of five state-of-the-art interactive segmentation +methods (SimpleClick, FocalClick, Iterative Click Loss (ICL), Reviving +Iterative Training with Mask Guidance for Interactive Segmentation (RITM), and +Segment Anything (SAM)) on two high-resolution aerial imagery datasets. To +enhance the segmentation results without requiring multiple models, we +introduced the Cascade-Forward Refinement (CFR) approach, an innovative +inference strategy for interactive segmentation. We evaluated these interactive +segmentation methods on various land cover types, object sizes, and band +combinations in remote sensing. Surprisingly, the popularly discussed method, +SAM, proved to be ineffective for remote sensing images. Conversely, the +point-based approach used in the SimpleClick models consistently outperformed +the other methods in all experiments. Building upon these findings, we +developed a dedicated online tool called RSISeg for interactive segmentation of +remote sensing data. RSISeg incorporates a well-performing interactive model, +fine-tuned with remote sensing data. Additionally, we integrated the SAM model +into this tool. Compared to existing interactive segmentation tools, RSISeg +offers strong interactivity, modifiability, and adaptability to remote sensing +data. + +
+
+
+
+
+ + ☆ DISGO: Automatic End-to-End Evaluation for Scene Text OCR + + +
+ This paper discusses the challenges of optical character recognition (OCR) on +natural scenes, which is harder than OCR on documents due to the wild content +and various image backgrounds. We propose to uniformly use word error rates +(WER) as a new measurement for evaluating scene-text OCR, both end-to-end (e2e) +performance and individual system component performances. Particularly for the +e2e metric, we name it DISGO WER as it considers Deletion, Insertion, +Substitution, and Grouping/Ordering errors. Finally we propose to utilize the +concept of super blocks to automatically compute BLEU scores for e2e OCR +machine translation. The small SCUT public test set is used to demonstrate WER +performance by a modularized OCR system. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ IOMatch: Simplifying Open-Set Semi-Supervised Learning with Joint + Inliers and Outliers Utilization ICCV 2023 + + +
+ Semi-supervised learning (SSL) aims to leverage massive unlabeled data when +labels are expensive to obtain. Unfortunately, in many real-world applications, +the collected unlabeled data will inevitably contain unseen-class outliers not +belonging to any of the labeled classes. To deal with the challenging open-set +SSL task, the mainstream methods tend to first detect outliers and then filter +them out. However, we observe a surprising fact that such approach could result +in more severe performance degradation when labels are extremely scarce, as the +unreliable outlier detector may wrongly exclude a considerable portion of +valuable inliers. To tackle with this issue, we introduce a novel open-set SSL +framework, IOMatch, which can jointly utilize inliers and outliers, even when +it is difficult to distinguish exactly between them. Specifically, we propose +to employ a multi-binary classifier in combination with the standard closed-set +classifier for producing unified open-set classification targets, which regard +all outliers as a single new class. By adopting these targets as open-set +pseudo-labels, we optimize an open-set classifier with all unlabeled samples +including both inliers and outliers. Extensive experiments have shown that +IOMatch significantly outperforms the baseline methods across different +benchmark datasets and different settings despite its remarkable simplicity. +Our code and models are available at https://github.com/nukezil/IOMatch. + +
+
+ comment: Accepted by ICCV 2023, selected for an Oral presentation +
+
+
+
+
+ + ☆ Dual Compensation Residual Networks for Class Imbalanced Learning + + +
+ Learning generalizable representation and classifier for class-imbalanced +data is challenging for data-driven deep models. Most studies attempt to +re-balance the data distribution, which is prone to overfitting on tail classes +and underfitting on head classes. In this work, we propose Dual Compensation +Residual Networks to better fit both tail and head classes. Firstly, we propose +dual Feature Compensation Module (FCM) and Logit Compensation Module (LCM) to +alleviate the overfitting issue. The design of these two modules is based on +the observation: an important factor causing overfitting is that there is +severe feature drift between training and test data on tail classes. In +details, the test features of a tail category tend to drift towards feature +cloud of multiple similar head categories. So FCM estimates a multi-mode +feature drift direction for each tail category and compensate for it. +Furthermore, LCM translates the deterministic feature drift vector estimated by +FCM along intra-class variations, so as to cover a larger effective +compensation space, thereby better fitting the test features. Secondly, we +propose a Residual Balanced Multi-Proxies Classifier (RBMC) to alleviate the +under-fitting issue. Motivated by the observation that re-balancing strategy +hinders the classifier from learning sufficient head knowledge and eventually +causes underfitting, RBMC utilizes uniform learning with a residual path to +facilitate classifier learning. Comprehensive experiments on Long-tailed and +Class-Incremental benchmarks validate the efficacy of our method. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Diff-Retinex: Rethinking Low-light Image Enhancement with A Generative + Diffusion Model ICCV 2023 + + +
+ In this paper, we rethink the low-light image enhancement task and propose a +physically explainable and generative diffusion model for low-light image +enhancement, termed as Diff-Retinex. We aim to integrate the advantages of the +physical model and the generative network. Furthermore, we hope to supplement +and even deduce the information missing in the low-light image through the +generative network. Therefore, Diff-Retinex formulates the low-light image +enhancement problem into Retinex decomposition and conditional image +generation. In the Retinex decomposition, we integrate the superiority of +attention in Transformer and meticulously design a Retinex Transformer +decomposition network (TDN) to decompose the image into illumination and +reflectance maps. Then, we design multi-path generative diffusion networks to +reconstruct the normal-light Retinex probability distribution and solve the +various degradations in these components respectively, including dark +illumination, noise, color deviation, loss of scene contents, etc. Owing to +generative diffusion model, Diff-Retinex puts the restoration of low-light +subtle detail into practice. Extensive experiments conducted on real-world +low-light datasets qualitatively and quantitatively demonstrate the +effectiveness, superiority, and generalization of the proposed method. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Enhancing Breast Cancer Classification Using Transfer ResNet with + Lightweight Attention Mechanism + + +
+ Deep learning models have revolutionized image classification by learning +complex feature hierarchies in raw pixel data. This paper introduces an image +classification method based on the ResNet model, and introduces a lightweight +attention mechanism framework to improve performance. The framework optimizes +feature representation, enhances classification capabilities, and improves +feature discriminativeness. We verified the effectiveness of the algorithm on +the Breakhis dataset, showing its superior performance in many aspects. Not +only in terms of conventional models, our method also shows advantages on +state-of-the-art methods such as contemporary visual transformers. Significant +improvements have been achieved in metrics such as precision, accuracy, recall, +F1-score, and G-means, while also performing well in terms of convergence time. +These results strengthen the performance of the algorithm and solidify its +application prospects in practical image classification tasks. Keywords: ResNet +model, Lightweight attention mechanism + +
+
+ comment: 6 pages, 4 figures,6 tables +
+
+
+
+
+ + ☆ A Survey of Diffusion Based Image Generation Models: Issues and Their + Solutions + + +
+ Recently, there has been significant progress in the development of large +models. Following the success of ChatGPT, numerous language models have been +introduced, demonstrating remarkable performance. Similar advancements have +also been observed in image generation models, such as Google's Imagen model, +OpenAI's DALL-E 2, and stable diffusion models, which have exhibited impressive +capabilities in generating images. However, similar to large language models, +these models still encounter unresolved challenges. Fortunately, the +availability of open-source stable diffusion models and their underlying +mathematical principles has enabled the academic community to extensively +analyze the performance of current image generation models and make +improvements based on this stable diffusion framework. This survey aims to +examine the existing issues and the current solutions pertaining to image +generation models. + +
+
+
+
+
+ + ☆ AccFlow: Backward Accumulation for Long-Range Optical Flow + + +
+ Recent deep learning-based optical flow estimators have exhibited impressive +performance in generating local flows between consecutive frames. However, the +estimation of long-range flows between distant frames, particularly under +complex object deformation and large motion occlusion, remains a challenging +task. One promising solution is to accumulate local flows explicitly or +implicitly to obtain the desired long-range flow. Nevertheless, the +accumulation errors and flow misalignment can hinder the effectiveness of this +approach. This paper proposes a novel recurrent framework called AccFlow, which +recursively backward accumulates local flows using a deformable module called +as AccPlus. In addition, an adaptive blending module is designed along with +AccPlus to alleviate the occlusion effect by backward accumulation and rectify +the accumulation error. Notably, we demonstrate the superiority of backward +accumulation over conventional forward accumulation, which to the best of our +knowledge has not been explicitly established before. To train and evaluate the +proposed AccFlow, we have constructed a large-scale high-quality dataset named +CVO, which provides ground-truth optical flow labels between adjacent and +distant frames. Extensive experiments validate the effectiveness of AccFlow in +handling long-range optical flow estimation. Codes are available at +https://github.com/mulns/AccFlow . + +
+
+
+
+
+ + ♻ ☆ Uncertainty Estimation using the Local Lipschitz for Deep Learning Image + Reconstruction Models + + +
+ The use of supervised deep neural network approaches has been investigated to +solve inverse problems in all domains, especially radiology where imaging +technologies are at the heart of diagnostics. However, in deployment, these +models are exposed to input distributions that are widely shifted from training +data, due in part to data biases or drifts. It becomes crucial to know whether +a given input lies outside the training data distribution before relying on the +reconstruction for diagnosis. The goal of this work is three-fold: (i) +demonstrate use of the local Lipshitz value as an uncertainty estimation +threshold for determining suitable performance, (ii) provide method for +identifying out-of-distribution (OOD) images where the model may not have +generalized, and (iii) use the local Lipschitz values to guide proper data +augmentation through identifying false positives and decrease epistemic +uncertainty. We provide results for both MRI reconstruction and CT sparse view +to full view reconstruction using AUTOMAP and UNET architectures due to it +being pertinent in the medical domain that reconstructed images remain +diagnostically accurate. + +
+
+
+
+
+ + ♻ ☆ Federated Object Detection for Quality Inspection in Shared Production + + +
+ Federated learning (FL) has emerged as a promising approach for training +machine learning models on decentralized data without compromising data +privacy. In this paper, we propose a FL algorithm for object detection in +quality inspection tasks using YOLOv5 as the object detection algorithm and +Federated Averaging (FedAvg) as the FL algorithm. We apply this approach to a +manufacturing use-case where multiple factories/clients contribute data for +training a global object detection model while preserving data privacy on a +non-IID dataset. Our experiments demonstrate that our FL approach achieves +better generalization performance on the overall clients' test dataset and +generates improved bounding boxes around the objects compared to models trained +using local clients' datasets. This work showcases the potential of FL for +quality inspection tasks in the manufacturing industry and provides valuable +insights into the performance and feasibility of utilizing YOLOv5 and FedAvg +for federated object detection. + +
+
+ comment: Will submit it to an IEEE conference +
+
+
+
+
+ + ♻ ☆ Federated Ensemble YOLOv5 -- A Better Generalized Object Detection + Algorithm + + +
+ Federated learning (FL) has gained significant traction as a +privacy-preserving algorithm, but the underlying resemblances of federated +learning algorithms like Federated averaging (FedAvg) or Federated SGD (Fed +SGD) to ensemble learning algorithms have not been fully explored. The purpose +of this paper is to examine the application of FL to object detection as a +method to enhance generalizability, and to compare its performance against a +centralized training approach for an object detection algorithm. Specifically, +we investigate the performance of a YOLOv5 model trained using FL across +multiple clients and employ a random sampling strategy without replacement, so +each client holds a portion of the same dataset used for centralized training. +Our experimental results showcase the superior efficiency of the FL object +detector's global model in generating accurate bounding boxes for unseen +objects, with the test set being a mixture of objects from two distinct clients +not represented in the training dataset. These findings suggest that FL can be +viewed from an ensemble algorithm perspective, akin to a synergistic blend of +Bagging and Boosting techniques. As a result, FL can be seen not only as a +method to enhance privacy, but also as a method to enhance the performance of a +machine learning model. + +
+
+ comment: 8 pages and submitted to FLTA2023 symposium under IEEE +
+
+
+
+
+ + ♻ ☆ Editing Implicit Assumptions in Text-to-Image Diffusion Models + + +
+ Text-to-image diffusion models often make implicit assumptions about the +world when generating images. While some assumptions are useful (e.g., the sky +is blue), they can also be outdated, incorrect, or reflective of social biases +present in the training data. Thus, there is a need to control these +assumptions without requiring explicit user input or costly re-training. In +this work, we aim to edit a given implicit assumption in a pre-trained +diffusion model. Our Text-to-Image Model Editing method, TIME for short, +receives a pair of inputs: a "source" under-specified prompt for which the +model makes an implicit assumption (e.g., "a pack of roses"), and a +"destination" prompt that describes the same setting, but with a specified +desired attribute (e.g., "a pack of blue roses"). TIME then updates the model's +cross-attention layers, as these layers assign visual meaning to textual +tokens. We edit the projection matrices in these layers such that the source +prompt is projected close to the destination prompt. Our method is highly +efficient, as it modifies a mere 2.2% of the model's parameters in under one +second. To evaluate model editing approaches, we introduce TIMED (TIME +Dataset), containing 147 source and destination prompt pairs from various +domains. Our experiments (using Stable Diffusion) show that TIME is successful +in model editing, generalizes well for related prompts unseen during editing, +and imposes minimal effect on unrelated generations. + +
+
+ comment: Project page: https://time-diffusion.github.io/ +
+
+
+
+
+ + ♻ ☆ Overcoming Adversarial Attacks for Human-in-the-Loop Applications ICML 2022 + + +
+ Including human analysis has the potential to positively affect the +robustness of Deep Neural Networks and is relatively unexplored in the +Adversarial Machine Learning literature. Neural network visual explanation maps +have been shown to be prone to adversarial attacks. Further research is needed +in order to select robust visualizations of explanations for the image analyst +to evaluate a given model. These factors greatly impact Human-In-The-Loop +(HITL) evaluation tools due to their reliance on adversarial images, including +explanation maps and measurements of robustness. We believe models of human +visual attention may improve interpretability and robustness of human-machine +imagery analysis systems. Our challenge remains, how can HITL evaluation be +robust in this adversarial landscape? + +
+
+ comment: New Frontiers in Adversarial Machine Learning, ICML 2022 +
+
+
+
+
+ + ♻ ☆ 360BEV: Panoramic Semantic Mapping for Indoor Bird's-Eye View WACV 2024 + + +
+ Seeing only a tiny part of the whole is not knowing the full circumstance. +Bird's-eye-view (BEV) perception, a process of obtaining allocentric maps from +egocentric views, is restricted when using a narrow Field of View (FoV) alone. +In this work, mapping from 360{\deg} panoramas to BEV semantics, the 360BEV +task, is established for the first time to achieve holistic representations of +indoor scenes in a top-down view. Instead of relying on narrow-FoV image +sequences, a panoramic image with depth information is sufficient to generate a +holistic BEV semantic map. To benchmark 360BEV, we present two indoor datasets, +360BEV-Matterport and 360BEV-Stanford, both of which include egocentric +panoramic images and semantic segmentation labels, as well as allocentric +semantic maps. Besides delving deep into different mapping paradigms, we +propose a dedicated solution for panoramic semantic mapping, namely 360Mapper. +Through extensive experiments, our methods achieve 44.32% and 45.78% in mIoU on +both datasets respectively, surpassing previous counterparts with gains of ++7.60% and +9.70% in mIoU. Code and datasets are available at the project page: +https://jamycheung.github.io/360BEV.html. + +
+
+ comment: Code and datasets are available at the project page: + https://jamycheung.github.io/360BEV.html. Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ Non-exemplar Class-incremental Learning by Random Auxiliary Classes + Augmentation and Mixed Features + + +
+ Non-exemplar class-incremental learning refers to classifying new and old +classes without storing samples of old classes. Since only new class samples +are available for optimization, it often occurs catastrophic forgetting of old +knowledge. To alleviate this problem, many new methods are proposed such as +model distillation, class augmentation. In this paper, we propose an effective +non-exemplar method called RAMF consisting of Random Auxiliary classes +augmentation and Mixed Feature. On the one hand, we design a novel random +auxiliary classes augmentation method, where one augmentation is randomly +selected from three augmentations and applied on the input to generate +augmented samples and extra class labels. By extending data and label space, it +allows the model to learn more diverse representations, which can prevent the +model from being biased towards learning task-specific features. When learning +new tasks, it will reduce the change of feature space and improve model +generalization. On the other hand, we employ mixed feature to replace the new +features since only using new feature to optimize the model will affect the +representation that was previously embedded in the feature space. Instead, by +mixing new and old features, old knowledge can be retained without increasing +the computational complexity. Extensive experiments on three benchmarks +demonstrate the superiority of our approach, which outperforms the +state-of-the-art non-exemplar methods and is comparable to high-performance +replay-based methods. + +
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ LATFormer: Locality-Aware Point-View Fusion Transformer for 3D Shape + Recognition + + +
+ Recently, 3D shape understanding has achieved significant progress due to the +advances of deep learning models on various data formats like images, voxels, +and point clouds. Among them, point clouds and multi-view images are two +complementary modalities of 3D objects and learning representations by fusing +both of them has been proven to be fairly effective. While prior works +typically focus on exploiting global features of the two modalities, herein we +argue that more discriminative features can be derived by modeling ``where to +fuse''. To investigate this, we propose a novel Locality-Aware Point-View +Fusion Transformer (LATFormer) for 3D shape retrieval and classification. The +core component of LATFormer is a module named Locality-Aware Fusion (LAF) which +integrates the local features of correlated regions across the two modalities +based on the co-occurrence scores. We further propose to filter out scores with +low values to obtain salient local co-occurring regions, which reduces +redundancy for the fusion process. In our LATFormer, we utilize the LAF module +to fuse the multi-scale features of the two modalities both bidirectionally and +hierarchically to obtain more informative features. Comprehensive experiments +on four popular 3D shape benchmarks covering 3D object retrieval and +classification validate its effectiveness. + +
+
+
+
+
+ + ♻ ☆ What's the Difference? The potential for Convolutional Neural Networks + for transient detection without template subtraction + + +
+ We present a study of the potential for Convolutional Neural Networks (CNNs) +to enable separation of astrophysical transients from image artifacts, a task +known as "real-bogus" classification without requiring a template subtracted +(or difference) image which requires a computationally expensive process to +generate, involving image matching on small spatial scales in large volumes of +data. Using data from the Dark Energy Survey, we explore the use of CNNs to (1) +automate the "real-bogus" classification, (2) reduce the computational costs of +transient discovery. We compare the efficiency of two CNNs with similar +architectures, one that uses "image triplets" (templates, search, and +difference image) and one that takes as input the template and search only. We +measure the decrease in efficiency associated with the loss of information in +input finding that the testing accuracy is reduced from 96% to 91.1%. We +further investigate how the latter model learns the required information from +the template and search by exploring the saliency maps. Our work (1) confirms +that CNNs are excellent models for "real-bogus" classification that rely +exclusively on the imaging data and require no feature engineering task; (2) +demonstrates that high-accuracy (> 90%) models can be built without the need to +construct difference images, but some accuracy is lost. Since once trained, +neural networks can generate predictions at minimal computational costs, we +argue that future implementations of this methodology could dramatically reduce +the computational costs in the detection of transients in synoptic surveys like +Rubin Observatory's Legacy Survey of Space and Time by bypassing the Difference +Image Analysis entirely. + +
+
+
+
+
+ + ♻ ☆ Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and + Reconstruction ICCV 2023 + + +
+ 3D-aware image synthesis encompasses a variety of tasks, such as scene +generation and novel view synthesis from images. Despite numerous task-specific +methods, developing a comprehensive model remains challenging. In this paper, +we present SSDNeRF, a unified approach that employs an expressive diffusion +model to learn a generalizable prior of neural radiance fields (NeRF) from +multi-view images of diverse objects. Previous studies have used two-stage +approaches that rely on pretrained NeRFs as real data to train diffusion +models. In contrast, we propose a new single-stage training paradigm with an +end-to-end objective that jointly optimizes a NeRF auto-decoder and a latent +diffusion model, enabling simultaneous 3D reconstruction and prior learning, +even from sparsely available views. At test time, we can directly sample the +diffusion prior for unconditional generation, or combine it with arbitrary +observations of unseen objects for NeRF reconstruction. SSDNeRF demonstrates +robust results comparable to or better than leading task-specific methods in +unconditional generation and single/sparse-view 3D reconstruction. + +
+
+ comment: ICCV 2023 final version. Project page: + https://lakonik.github.io/ssdnerf +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised Text-driven Contrastive Learning for Facial Behavior + Understanding + + +
+ Contrastive learning has shown promising potential for learning robust +representations by utilizing unlabeled data. However, constructing effective +positive-negative pairs for contrastive learning on facial behavior datasets +remains challenging. This is because such pairs inevitably encode the +subject-ID information, and the randomly constructed pairs may push similar +facial images away due to the limited number of subjects in facial behavior +datasets. To address this issue, we propose to utilize activity descriptions, +coarse-grained information provided in some datasets, which can provide +high-level semantic information about the image sequences but is often +neglected in previous studies. More specifically, we introduce a two-stage +Contrastive Learning with Text-Embeded framework for Facial behavior +understanding (CLEF). The first stage is a weakly-supervised contrastive +learning method that learns representations from positive-negative pairs +constructed using coarse-grained activity information. The second stage aims to +train the recognition of facial expressions or facial action units by +maximizing the similarity between image and the corresponding text label names. +The proposed CLEF achieves state-of-the-art performance on three in-the-lab +datasets for AU recognition and three in-the-wild datasets for facial +expression recognition. + +
+
+
+
+
+ + ♻ ☆ Experts Weights Averaging: A New General Training Scheme for Vision + Transformers + + +
+ Structural re-parameterization is a general training scheme for Convolutional +Neural Networks (CNNs), which achieves performance improvement without +increasing inference cost. As Vision Transformers (ViTs) are gradually +surpassing CNNs in various visual tasks, one may question: if a training scheme +specifically for ViTs exists that can also achieve performance improvement +without increasing inference cost? Recently, Mixture-of-Experts (MoE) has +attracted increasing attention, as it can efficiently scale up the capacity of +Transformers at a fixed cost through sparsely activated experts. Considering +that MoE can also be viewed as a multi-branch structure, can we utilize MoE to +implement a ViT training scheme similar to structural re-parameterization? In +this paper, we affirmatively answer these questions, with a new general +training strategy for ViTs. Specifically, we decouple the training and +inference phases of ViTs. During training, we replace some Feed-Forward +Networks (FFNs) of the ViT with specially designed, more efficient MoEs that +assign tokens to experts by random uniform partition, and perform Experts +Weights Averaging (EWA) on these MoEs at the end of each iteration. After +training, we convert each MoE into an FFN by averaging the experts, +transforming the model back into original ViT for inference. We further provide +a theoretical analysis to show why and how it works. Comprehensive experiments +across various 2D and 3D visual tasks, ViT architectures, and datasets validate +the effectiveness and generalizability of the proposed training scheme. +Besides, our training scheme can also be applied to improve performance when +fine-tuning ViTs. Lastly, but equally important, the proposed EWA technique can +significantly improve the effectiveness of naive MoE in various 2D visual small +datasets and 3D visual tasks. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Class-relation Knowledge Distillation for Novel Class Discovery ICCV2023 + + +
+ We tackle the problem of novel class discovery, which aims to learn novel +classes without supervision based on labeled data from known classes. A key +challenge lies in transferring the knowledge in the known-class data to the +learning of novel classes. Previous methods mainly focus on building a shared +representation space for knowledge transfer and often ignore modeling class +relations. To address this, we introduce a class relation representation for +the novel classes based on the predicted class distribution of a model trained +on known classes. Empirically, we find that such class relation becomes less +informative during typical discovery training. To prevent such information +loss, we propose a novel knowledge distillation framework, which utilizes our +class-relation representation to regularize the learning of novel classes. In +addition, to enable a flexible knowledge distillation scheme for each data +point in novel classes, we develop a learnable weighting function for the +regularization, which adaptively promotes knowledge transfer based on the +semantic similarity between the novel and known classes. To validate the +effectiveness and generalization of our method, we conduct extensive +experiments on multiple benchmarks, including CIFAR100, Stanford Cars, CUB, and +FGVC-Aircraft datasets. Our results demonstrate that the proposed method +outperforms the previous state-of-the-art methods by a significant margin on +almost all benchmarks. Code is available at +\href{https://github.com/kleinzcy/Cr-KD-NCD}{here}. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog + Navigation + + +
+ This report details the methods of the winning entry of the AVDN Challenge in +ICCV CLVL 2023. The competition addresses the Aerial Navigation from Dialog +History (ANDH) task, which requires a drone agent to associate dialog history +with aerial observations to reach the destination. For better cross-modal +grounding abilities of the drone agent, we propose a Target-Grounded +Graph-Aware Transformer (TG-GAT) framework. Concretely, TG-GAT first leverages +a graph-aware transformer to capture spatiotemporal dependency, which benefits +navigation state tracking and robust action planning. In addition,an auxiliary +visual grounding task is devised to boost the agent's awareness of referred +landmarks. Moreover, a hybrid augmentation strategy based on large language +models is utilized to mitigate data scarcity limitations. Our TG-GAT framework +won the AVDN Challenge, with 2.2% and 3.0% absolute improvements over the +baseline on SPL and SR metrics, respectively. The code is available at +https://github.com/yifeisu/TG-GAT. + +
+
+
+
+
+ + ♻ ☆ Novel Class Discovery for Long-tailed Recognition + + +
+ While the novel class discovery has recently made great progress, existing +methods typically focus on improving algorithms on class-balanced benchmarks. +However, in real-world recognition tasks, the class distributions of their +corresponding datasets are often imbalanced, which leads to serious performance +degeneration of those methods. In this paper, we consider a more realistic +setting for novel class discovery where the distributions of novel and known +classes are long-tailed. One main challenge of this new problem is to discover +imbalanced novel classes with the help of long-tailed known classes. To tackle +this problem, we propose an adaptive self-labeling strategy based on an +equiangular prototype representation of classes. Our method infers high-quality +pseudo-labels for the novel classes by solving a relaxed optimal transport +problem and effectively mitigates the class biases in learning the known and +novel classes. We perform extensive experiments on CIFAR100, ImageNet100, +Herbarium19 and large-scale iNaturalist18 datasets, and the results demonstrate +the superiority of our method. Our code is available at +https://github.com/kleinzcy/NCDLR. + +
+
+ comment: TMLR2023, Final version +
+
+
+
+
+ + ♻ ☆ Estimator Meets Equilibrium Perspective: A Rectified Straight Through + Estimator for Binary Neural Networks Training ICCV 2023 + + +
+ Binarization of neural networks is a dominant paradigm in neural networks +compression. The pioneering work BinaryConnect uses Straight Through Estimator +(STE) to mimic the gradients of the sign function, but it also causes the +crucial inconsistency problem. Most of the previous methods design different +estimators instead of STE to mitigate it. However, they ignore the fact that +when reducing the estimating error, the gradient stability will decrease +concomitantly. These highly divergent gradients will harm the model training +and increase the risk of gradient vanishing and gradient exploding. To fully +take the gradient stability into consideration, we present a new perspective to +the BNNs training, regarding it as the equilibrium between the estimating error +and the gradient stability. In this view, we firstly design two indicators to +quantitatively demonstrate the equilibrium phenomenon. In addition, in order to +balance the estimating error and the gradient stability well, we revise the +original straight through estimator and propose a power function based +estimator, Rectified Straight Through Estimator (ReSTE for short). Comparing to +other estimators, ReSTE is rational and capable of flexibly balancing the +estimating error with the gradient stability. Extensive experiments on CIFAR-10 +and ImageNet datasets show that ReSTE has excellent performance and surpasses +the state-of-the-art methods without any auxiliary modules or losses. + +
+
+ comment: 10 pages, 6 figures. Accepted in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Early Stopping for Deep Image Prior + + +
+ Deep image prior (DIP) and its variants have showed remarkable potential for +solving inverse problems in computer vision, without any extra training data. +Practical DIP models are often substantially overparameterized. During the +fitting process, these models learn mostly the desired visual content first, +and then pick up the potential modeling and observational noise, i.e., +overfitting. Thus, the practicality of DIP often depends critically on good +early stopping (ES) that captures the transition period. In this regard, the +majority of DIP works for vision tasks only demonstrates the potential of the +models -- reporting the peak performance against the ground truth, but provides +no clue about how to operationally obtain near-peak performance without access +to the groundtruth. In this paper, we set to break this practicality barrier of +DIP, and propose an efficient ES strategy, which consistently detects near-peak +performance across several vision tasks and DIP variants. Based on a simple +measure of dispersion of consecutive DIP reconstructions, our ES method not +only outpaces the existing ones -- which only work in very narrow domains, but +also remains effective when combined with a number of methods that try to +mitigate the overfitting. The code is available at +https://github.com/sun-umn/Early_Stopping_for_DIP. + +
+
+
+
+
+ + ♻ ☆ Prototypical few-shot segmentation for cross-institution male pelvic + structures with spatial registration + + +
+ The prowess that makes few-shot learning desirable in medical image analysis +is the efficient use of the support image data, which are labelled to classify +or segment new classes, a task that otherwise requires substantially more +training images and expert annotations. This work describes a fully 3D +prototypical few-shot segmentation algorithm, such that the trained networks +can be effectively adapted to clinically interesting structures that are absent +in training, using only a few labelled images from a different institute. +First, to compensate for the widely recognised spatial variability between +institutions in episodic adaptation of novel classes, a novel spatial +registration mechanism is integrated into prototypical learning, consisting of +a segmentation head and an spatial alignment module. Second, to assist the +training with observed imperfect alignment, support mask conditioning module is +proposed to further utilise the annotation available from the support images. +Extensive experiments are presented in an application of segmenting eight +anatomical structures important for interventional planning, using a data set +of 589 pelvic T2-weighted MR images, acquired at seven institutes. The results +demonstrate the efficacy in each of the 3D formulation, the spatial +registration, and the support mask conditioning, all of which made positive +contributions independently or collectively. Compared with the previously +proposed 2D alternatives, the few-shot segmentation performance was improved +with statistical significance, regardless whether the support data come from +the same or different institutes. + +
+
+ comment: accepted by Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Spatial and Modal Optimal Transport for Fast Cross-Modal MRI + Reconstruction + + +
+ Multi-modal magnetic resonance imaging (MRI) plays a crucial role in +comprehensive disease diagnosis in clinical medicine. However, acquiring +certain modalities, such as T2-weighted images (T2WIs), is time-consuming and +prone to be with motion artifacts. It negatively impacts subsequent multi-modal +image analysis. To address this issue, we propose an end-to-end deep learning +framework that utilizes T1-weighted images (T1WIs) as auxiliary modalities to +expedite T2WIs' acquisitions. While image pre-processing is capable of +mitigating misalignment, improper parameter selection leads to adverse +pre-processing effects, requiring iterative experimentation and adjustment. To +overcome this shortage, we employ Optimal Transport (OT) to synthesize T2WIs by +aligning T1WIs and performing cross-modal synthesis, effectively mitigating +spatial misalignment effects. Furthermore, we adopt an alternating iteration +framework between the reconstruction task and the cross-modal synthesis task to +optimize the final results. Then, we prove that the reconstructed T2WIs and the +synthetic T2WIs become closer on the T2 image manifold with iterations +increasing, and further illustrate that the improved reconstruction result +enhances the synthesis process, whereas the enhanced synthesis result improves +the reconstruction process. Finally, experimental results from FastMRI and +internal datasets confirm the effectiveness of our method, demonstrating +significant improvements in image reconstruction quality even at low sampling +rates. + +
+
+
+
+
+ + ♻ ☆ Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language + Pretraining? + + +
+ The multimedia community has shown a significant interest in perceiving and +representing the physical world with multimodal pretrained neural network +models, and among them, the visual-language pertaining (VLP) is, currently, the +most captivating topic. However, there have been few endeavors dedicated to the +exploration of 1) whether essential linguistic knowledge (e.g., semantics and +syntax) can be extracted during VLP, and 2) how such linguistic knowledge +impact or enhance the multimodal alignment. In response, here we aim to +elucidate the impact of comprehensive linguistic knowledge, including semantic +expression and syntactic structure, on multimodal alignment. Specifically, we +design and release the SNARE, the first large-scale multimodal alignment +probing benchmark, to detect the vital linguistic components, e.g., lexical, +semantic, and syntax knowledge, containing four tasks: Semantic structure, +Negation logic, Attribute ownership, and Relationship composition. Based on our +proposed probing benchmarks, our holistic analyses of five advanced VLP models +illustrate that the VLP model: i) shows insensitivity towards complex syntax +structures and relies on content words for sentence comprehension; ii) +demonstrates limited comprehension of combinations between sentences and +negations; iii) faces challenges in determining the presence of actions or +spatial relationships within visual information and struggles with verifying +the correctness of triple combinations. We make our benchmark and code +available at \url{https://github.com/WangFei-2019/SNARE/}. + +
+
+ comment: [TL;DR] we design and release the SNARE, the first large-scale + multimodal alignment probing benchmark for current vision-language pretrained + models +
+
+
+
+
+ + ♻ ☆ One-Class Risk Estimation for One-Class Hyperspectral Image + Classification + + +
+ Hyperspectral imagery (HSI) one-class classification is aimed at identifying +a single target class from the HSI by using only knowing positive data, which +can significantly reduce the requirements for annotation. However, when +one-class classification meets HSI, it is difficult for classifiers to find a +balance between the overfitting and underfitting of positive data due to the +problems of distribution overlap and distribution imbalance. Although deep +learning-based methods are currently the mainstream to overcome distribution +overlap in HSI multiclassification, few studies focus on deep learning-based +HSI one-class classification. In this article, a weakly supervised deep HSI +one-class classifier, namely, HOneCls, is proposed, where a risk estimator,the +one-class risk estimator, is particularly introduced to make the fully +convolutional neural network (FCN) with the ability of one class classification +in the case of distribution imbalance. Extensive experiments (20 tasks in +total) were conducted to demonstrate the superiority of the proposed +classifier. + +
+
+ comment: Accepted by TGRS +
+
+
+
+
+ + ♻ ☆ Distinctive Self-Similar Object Detection + + +
+ Deep learning-based object detection has demonstrated a significant presence +in the practical applications of artificial intelligence. However, objects such +as fire and smoke, pose challenges to object detection because of their +non-solid and various shapes, and consequently difficult to truly meet +requirements in practical fire prevention and control. In this paper, we +propose that the distinctive fractal feature of self-similar in fire and smoke +can relieve us from struggling with their various shapes. To our best +knowledge, we are the first to discuss this problem. In order to evaluate the +self-similarity of the fire and smoke and improve the precision of object +detection, we design a semi-supervised method that use Hausdorff distance to +describe the resemblance between instances. Besides, based on the concept of +self-similar, we have devised a novel methodology for evaluating this +particular task in a more equitable manner. We have meticulously designed our +network architecture based on well-established and representative baseline +networks such as YOLO and Faster R-CNN. Our experiments have been conducted on +publicly available fire and smoke detection datasets, which we have thoroughly +verified to ensure the validity of our approach. As a result, we have observed +significant improvements in the detection accuracy. + +
+
+
+
+
+ + ♻ ☆ Defensive Few-shot Learning + + +
+ This paper investigates a new challenging problem called defensive few-shot +learning in order to learn a robust few-shot model against adversarial attacks. +Simply applying the existing adversarial defense methods to few-shot learning +cannot effectively solve this problem. This is because the commonly assumed +sample-level distribution consistency between the training and test sets can no +longer be met in the few-shot setting. To address this situation, we develop a +general defensive few-shot learning (DFSL) framework to answer the following +two key questions: (1) how to transfer adversarial defense knowledge from one +sample distribution to another? (2) how to narrow the distribution gap between +clean and adversarial examples under the few-shot setting? To answer the first +question, we propose an episode-based adversarial training mechanism by +assuming a task-level distribution consistency to better transfer the +adversarial defense knowledge. As for the second question, within each few-shot +task, we design two kinds of distribution consistency criteria to narrow the +distribution gap between clean and adversarial examples from the feature-wise +and prediction-wise perspectives, respectively. Extensive experiments +demonstrate that the proposed framework can effectively make the existing +few-shot models robust against adversarial attacks. Code is available at +https://github.com/WenbinLee/DefensiveFSL.git. + +
+
+ comment: Accepted to IEEE Transactions on Pattern Analysis and Machine + Intelligence (TPAMI) 2022 +
+
+
+
+
+ + ♻ ☆ A Perceptually Optimized and Self-Calibrated Tone Mapping Operator + + +
+ With the increasing popularity and accessibility of high dynamic range (HDR) +photography, tone mapping operators (TMOs) for dynamic range compression are +practically demanding. In this paper, we develop a two-stage neural +network-based TMO that is self-calibrated and perceptually optimized. In Stage +one, motivated by the physiology of the early stages of the human visual +system, we first decompose an HDR image into a normalized Laplacian pyramid. We +then use two lightweight deep neural networks (DNNs), taking the normalized +representation as input and estimating the Laplacian pyramid of the +corresponding LDR image. We optimize the tone mapping network by minimizing the +normalized Laplacian pyramid distance (NLPD), a perceptual metric aligning with +human judgments of tone-mapped image quality. In Stage two, the input HDR image +is self-calibrated to compute the final LDR image. We feed the same HDR image +but rescaled with different maximum luminances to the learned tone mapping +network, and generate a pseudo-multi-exposure image stack with different detail +visibility and color saturation. We then train another lightweight DNN to fuse +the LDR image stack into a desired LDR image by maximizing a variant of the +structural similarity index for multi-exposure image fusion (MEF-SSIM), which +has been proven perceptually relevant to fused image quality. The proposed +self-calibration mechanism through MEF enables our TMO to accept uncalibrated +HDR images, while being physiology-driven. Extensive experiments show that our +method produces images with consistently better visual quality. Additionally, +since our method builds upon three lightweight DNNs, it is among the fastest +local TMOs. + +
+
+ comment: 15 pages,17 figures +
+
+
+
+
+ + ♻ ☆ Rethinking the Role of Pre-Trained Networks in Source-Free Domain + Adaptation ICCV 2023 + + +
+ Source-free domain adaptation (SFDA) aims to adapt a source model trained on +a fully-labeled source domain to an unlabeled target domain. Large-data +pre-trained networks are used to initialize source models during source +training, and subsequently discarded. However, source training can cause the +model to overfit to source data distribution and lose applicable target domain +knowledge. We propose to integrate the pre-trained network into the target +adaptation process as it has diversified features important for generalization +and provides an alternate view of features and classification decisions +different from the source model. We propose to distil useful target domain +information through a co-learning strategy to improve target pseudolabel +quality for finetuning the source model. Evaluation on 4 benchmark datasets +show that our proposed strategy improves adaptation performance and can be +successfully integrated with existing SFDA methods. Leveraging modern +pre-trained networks that have stronger representation learning ability in the +co-learning strategy further boosts performance. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion + Synthesis ICCV 2023 + + +
+ In this paper, we present TMR, a simple yet effective approach for text to 3D +human motion retrieval. While previous work has only treated retrieval as a +proxy evaluation metric, we tackle it as a standalone task. Our method extends +the state-of-the-art text-to-motion synthesis model TEMOS, and incorporates a +contrastive loss to better structure the cross-modal latent space. We show that +maintaining the motion generation loss, along with the contrastive training, is +crucial to obtain good performance. We introduce a benchmark for evaluation and +provide an in-depth analysis by reporting results on several protocols. Our +extensive experiments on the KIT-ML and HumanML3D datasets show that TMR +outperforms the prior work by a significant margin, for example reducing the +median rank from 54 to 19. Finally, we showcase the potential of our approach +on moment retrieval. Our code and models are publicly available at +https://mathis.petrovich.fr/tmr. + +
+
+ comment: ICCV 2023 Camera Ready, project page: + https://mathis.petrovich.fr/tmr/ +
+
+
+
+
+ + ♻ ☆ Masked Feature Modelling: Feature Masking for the Unsupervised + Pre-training of a Graph Attention Network Block for Bottom-up Video Event + Recognition + + +
+ In this paper, we introduce Masked Feature Modelling (MFM), a novel approach +for the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM +utilizes a pretrained Visual Tokenizer to reconstruct masked features of +objects within a video, leveraging the MiniKinetics dataset. We then +incorporate the pre-trained GAT block into a state-of-the-art bottom-up +supervised video-event recognition architecture, ViGAT, to improve the model's +starting point and overall accuracy. Experimental evaluations on the YLI-MED +dataset demonstrate the effectiveness of MFM in improving event recognition +performance. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Online Open-set Semi-supervised Object Detection by Valuable Instances + Mining + + +
+ Open-set semi-supervised object detection (OSSOD) leverages practical +open-set unlabeled datasets with out-of-distribution (OOD) instances for +semi-supervised object detection (SSOD). The main challenge in OSSOD is +distinguishing and filtering the OOD instances (i.e., outliers) from +in-distribution (ID) instances during pseudo-labeling. The only OSSOD work +employs an additional offline OOD detection network trained solely with labeled +data for solving this problem. However, the limited training data restricts the +potential for improvement. Meanwhile, the offline strategy results in low +efficiency. To alleviate these issues, this paper proposes an end-to-end online +OSSOD framework that improves performance and efficiency: 1) We propose a +semi-supervised outlier filtering method that more effectively filters the OOD +instances by using both labeled and unlabeled data. 2) We propose a +threshold-free Dual Competing OOD head that further improves the performance by +suppressing the mispredictions during semi-supervised outlier filtering. 3) Our +proposed method is an online end-to-end trainable OSSOD framework. Experimental +results show that our method achieves state-of-the-art performance on several +OSSOD benchmarks compared to existing methods. Moreover, additional experiments +show that our method can be easily applied to different SSOD frameworks. + +
+
+
+
+
+ + ♻ ☆ SDR-GAIN: A High Real-Time Occluded Pedestrian Pose Completion Method + for Autonomous Driving + + +
+ To mitigate the challenges arising from partial occlusion in human pose +keypoint based pedestrian detection methods , we present a novel pedestrian +pose keypoint completion method called the separation and dimensionality +reduction-based generative adversarial imputation networks (SDR-GAIN) . +Firstly, we utilize OpenPose to estimate pedestrian poses in images. Then, we +isolate the head and torso keypoints of pedestrians with incomplete keypoints +due to occlusion or other factors and perform dimensionality reduction to +enhance features and further unify feature distribution. Finally, we introduce +two generative models based on the generative adversarial networks (GAN) +framework, which incorporate Huber loss, residual structure, and L1 +regularization to generate missing parts of the incomplete head and torso pose +keypoints of partially occluded pedestrians, resulting in pose completion. Our +experiments on MS COCO and JAAD datasets demonstrate that SDR-GAIN outperforms +basic GAIN framework, interpolation methods PCHIP and MAkima, machine learning +methods k-NN and MissForest in terms of pose completion task. Furthermore, the +SDR-GAIN algorithm exhibits a remarkably short running time of approximately +0.4ms and boasts exceptional real-time performance. As such, it holds +significant practical value in the domain of autonomous driving, wherein high +system response speeds are of paramount importance. Specifically, it excels at +rapidly and precisely capturing human pose key points, thus enabling an +expanded range of applications for pedestrian detection tasks based on pose key +points, including but not limited to pedestrian behavior recognition and +prediction. + +
+
+
+
+
+ + ♻ ☆ Recovering 3D Human Mesh from Monocular Images: A Survey + + +
+ Estimating human pose and shape from monocular images is a long-standing +problem in computer vision. Since the release of statistical body models, 3D +human mesh recovery has been drawing broader attention. With the same goal of +obtaining well-aligned and physically plausible mesh results, two paradigms +have been developed to overcome challenges in the 2D-to-3D lifting process: i) +an optimization-based paradigm, where different data terms and regularization +terms are exploited as optimization objectives; and ii) a regression-based +paradigm, where deep learning techniques are embraced to solve the problem in +an end-to-end fashion. Meanwhile, continuous efforts are devoted to improving +the quality of 3D mesh labels for a wide range of datasets. Though remarkable +progress has been achieved in the past decade, the task is still challenging +due to flexible body motions, diverse appearances, complex environments, and +insufficient in-the-wild annotations. To the best of our knowledge, this is the +first survey that focuses on the task of monocular 3D human mesh recovery. We +start with the introduction of body models and then elaborate recovery +frameworks and training objectives by providing in-depth analyses of their +strengths and weaknesses. We also summarize datasets, evaluation metrics, and +benchmark results. Open issues and future directions are discussed in the end, +hoping to motivate researchers and facilitate their research in this area. A +regularly updated project page can be found at +https://github.com/tinatiansjz/hmr-survey. + +
+
+ comment: Published in IEEE TPAMI, Survey on monocular 3D human mesh recovery, + Project page: https://github.com/tinatiansjz/hmr-survey +
+
+
+
+
+ + ♻ ☆ VadCLIP: Adapting Vision-Language Models for Weakly Supervised Video + Anomaly Detection + + +
+ The recent contrastive language-image pre-training (CLIP) model has shown +great success in a wide range of image-level tasks, revealing remarkable +ability for learning powerful visual representations with rich semantics. An +open and worthwhile problem is efficiently adapting such a strong model to the +video domain and designing a robust video anomaly detector. In this work, we +propose VadCLIP, a new paradigm for weakly supervised video anomaly detection +(WSVAD) by leveraging the frozen CLIP model directly without any pre-training +and fine-tuning process. Unlike current works that directly feed extracted +features into the weakly supervised classifier for frame-level binary +classification, VadCLIP makes full use of fine-grained associations between +vision and language on the strength of CLIP and involves dual branch. One +branch simply utilizes visual features for coarse-grained binary +classification, while the other fully leverages the fine-grained language-image +alignment. With the benefit of dual branch, VadCLIP achieves both +coarse-grained and fine-grained video anomaly detection by transferring +pre-trained knowledge from CLIP to WSVAD task. We conduct extensive experiments +on two commonly-used benchmarks, demonstrating that VadCLIP achieves the best +performance on both coarse-grained and fine-grained WSVAD, surpassing the +state-of-the-art methods by a large margin. Specifically, VadCLIP achieves +84.51% AP and 88.02% AUC on XD-Violence and UCF-Crime, respectively. Code and +features will be released to facilitate future VAD research. + +
+
+ comment: Submitted +
+
+
+
+
+ + ♻ ☆ Exploring the Optimization Objective of One-Class Classification for + Anomaly Detection + + +
+ One-class classification (OCC) is a longstanding method for anomaly +detection. With the powerful representation capability of the pre-trained +backbone, OCC methods have witnessed significant performance improvements. +Typically, most of these OCC methods employ transfer learning to enhance the +discriminative nature of the pre-trained backbone's features, thus achieving +remarkable efficacy. While most current approaches emphasize feature transfer +strategies, we argue that the optimization objective space within OCC methods +could also be an underlying critical factor influencing performance. In this +work, we conducted a thorough investigation into the optimization objective of +OCC. Through rigorous theoretical analysis and derivation, we unveil a key +insights: any space with the suitable norm can serve as an equivalent +substitute for the hypersphere center, without relying on the distribution +assumption of training samples. Further, we provide guidelines for determining +the feasible domain of norms for the OCC optimization objective. This novel +insight sparks a simple and data-agnostic deep one-class classification method. +Our method is straightforward, with a single 1x1 convolutional layer as a +trainable projector and any space with suitable norm as the optimization +objective. Extensive experiments validate the reliability and efficacy of our +findings and the corresponding methodology, resulting in state-of-the-art +performance in both one-class classification and industrial vision anomaly +detection and segmentation tasks. + +
+
+ comment: 15 paegs, 10 figures +
+
+
+
+
+ + ♻ ☆ Feature Unlearning for Pre-trained GANs and VAEs + + +
+ We tackle the problem of feature unlearning from a pre-trained image +generative model: GANs and VAEs. Unlike a common unlearning task where an +unlearning target is a subset of the training set, we aim to unlearn a specific +feature, such as hairstyle from facial images, from the pre-trained generative +models. As the target feature is only presented in a local region of an image, +unlearning the entire image from the pre-trained model may result in losing +other details in the remaining region of the image. To specify which features +to unlearn, we collect randomly generated images that contain the target +features. We then identify a latent representation corresponding to the target +feature and then use the representation to fine-tune the pre-trained model. +Through experiments on MNIST and CelebA datasets, we show that target features +are successfully removed while keeping the fidelity of the original models. +Further experiments with an adversarial attack show that the unlearned model is +more robust under the presence of malicious parties. + +
+
+
+
+
+ + ♻ ☆ Why Does Little Robustness Help? Understanding and Improving Adversarial + Transferability from Surrogate Training + + +
+ Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs +that successfully fool white-box surrogate models can also deceive other +black-box models with different architectures. Although a bunch of empirical +studies have provided guidance on generating highly transferable AEs, many of +these findings lack explanations and even lead to inconsistent advice. In this +paper, we take a further step towards understanding adversarial +transferability, with a particular focus on surrogate aspects. Starting from +the intriguing little robustness phenomenon, where models adversarially trained +with mildly perturbed adversarial samples can serve as better surrogates, we +attribute it to a trade-off between two predominant factors: model smoothness +and gradient similarity. Our investigations focus on their joint effects, +rather than their separate correlations with transferability. Through a series +of theoretical and empirical analyses, we conjecture that the data distribution +shift in adversarial training explains the degradation of gradient similarity. +Building on these insights, we explore the impacts of data augmentation and +gradient regularization on transferability and identify that the trade-off +generally exists in the various training mechanisms, thus building a +comprehensive blueprint for the regulation mechanism behind transferability. +Finally, we provide a general route for constructing better surrogates to boost +transferability which optimizes both model smoothness and gradient similarity +simultaneously, e.g., the combination of input gradient regularization and +sharpness-aware minimization (SAM), validated by extensive experiments. In +summary, we call for attention to the united impacts of these two factors for +launching effective transfer attacks, rather than optimizing one while ignoring +the other, and emphasize the crucial role of manipulating surrogate models. + +
+
+ comment: Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21 + pages, 11 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ Comparative Study: Standalone IEEE 16-bit Floating-Point for Image + Classification + + +
+ Reducing the number of bits needed to encode the weights and activations of +neural networks is highly desirable as it speeds up their training and +inference time while reducing memory consumption. It is unsurprising that +considerable attention has been drawn to developing neural networks that employ +lower-precision computation. This includes IEEE 16-bit, Google bfloat16, 8-bit, +4-bit floating-point or fixed-point, 2-bit, and various mixed-precision +algorithms. Out of these low-precision formats, IEEE 16-bit stands out due to +its universal compatibility with contemporary GPUs. This accessibility +contrasts with bfloat16, which needs high-end GPUs, or other non-standard +fewer-bit designs, which typically require software simulation. This study +focuses on the widely accessible IEEE 16-bit format for comparative analysis. +This analysis involves an in-depth theoretical investigation of the factors +that lead to discrepancies between 16-bit and 32-bit models, including a +formalization of the concepts of floating-point error and tolerance to +understand the conditions under which a 16-bit model can approximate 32-bit +results. Contrary to literature that credits the success of noise-tolerated +neural networks to regularization effects, our study-supported by a series of +rigorous experiments-provides a quantitative explanation of why standalone IEEE +16-bit floating-point neural networks can perform on par with 32-bit and +mixed-precision networks in various image classification tasks. Because no +prior research has studied IEEE 16-bit as a standalone floating-point precision +in neural networks, we believe our findings will have significant impacts, +encouraging the adoption of standalone IEEE 16-bit networks in future neural +network applications. + +
+
+
+
+
+ + ♻ ☆ Inter-Rater Uncertainty Quantification in Medical Image Segmentation via + Rater-Specific Bayesian Neural Networks + + +
+ Automated medical image segmentation inherently involves a certain degree of +uncertainty. One key factor contributing to this uncertainty is the ambiguity +that can arise in determining the boundaries of a target region of interest, +primarily due to variations in image appearance. On top of this, even among +experts in the field, different opinions can emerge regarding the precise +definition of specific anatomical structures. This work specifically addresses +the modeling of segmentation uncertainty, known as inter-rater uncertainty. Its +primary objective is to explore and analyze the variability in segmentation +outcomes that can occur when multiple experts in medical imaging interpret and +annotate the same images. We introduce a novel Bayesian neural network-based +architecture to estimate inter-rater uncertainty in medical image segmentation. +Our approach has three key advancements. Firstly, we introduce a +one-encoder-multi-decoder architecture specifically tailored for uncertainty +estimation, enabling us to capture the rater-specific representation of each +expert involved. Secondly, we propose Bayesian modeling for the new +architecture, allowing efficient capture of the inter-rater distribution, +particularly in scenarios with limited annotations. Lastly, we enhance the +rater-specific representation by integrating an attention module into each +decoder. This module facilitates focused and refined segmentation results for +each rater. We conduct extensive evaluations using synthetic and real-world +datasets to validate our technical innovations rigorously. Our method surpasses +existing baseline methods in five out of seven diverse tasks on the publicly +available \emph{QUBIQ} dataset, considering two evaluation metrics encompassing +different uncertainty aspects. Our codes, models, and the new dataset are +available through our GitHub repository: +https://github.com/HaoWang420/bOEMD-net . + +
+
+ comment: submitted to a journal for review +
+
+
+
+
+ + ♻ ☆ Scale Federated Learning for Label Set Mismatch in Medical Image + Classification + + +
+ Federated learning (FL) has been introduced to the healthcare domain as a +decentralized learning paradigm that allows multiple parties to train a model +collaboratively without privacy leakage. However, most previous studies have +assumed that every client holds an identical label set. In reality, medical +specialists tend to annotate only diseases within their area of expertise or +interest. This implies that label sets in each client can be different and even +disjoint. In this paper, we propose the framework FedLSM to solve the problem +of Label Set Mismatch. FedLSM adopts different training strategies on data with +different uncertainty levels to efficiently utilize unlabeled or partially +labeled data as well as class-wise adaptive aggregation in the classification +layer to avoid inaccurate aggregation when clients have missing labels. We +evaluated FedLSM on two public real-world medical image datasets, including +chest X-ray (CXR) diagnosis with 112,120 CXR images and skin lesion diagnosis +with 10,015 dermoscopy images, and showed that it significantly outperformed +other state-of-the-art FL algorithms. The code can be found at +https://github.com/dzp2095/FedLSM. + +
+
+
+
+
+ + ♻ ☆ EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation ICCV 2023 + + +
+ Speech-driven 3D face animation aims to generate realistic facial expressions +that match the speech content and emotion. However, existing methods often +neglect emotional facial expressions or fail to disentangle them from speech +content. To address this issue, this paper proposes an end-to-end neural +network to disentangle different emotions in speech so as to generate rich 3D +facial expressions. Specifically, we introduce the emotion disentangling +encoder (EDE) to disentangle the emotion and content in the speech by +cross-reconstructed speech signals with different emotion labels. Then an +emotion-guided feature fusion decoder is employed to generate a 3D talking face +with enhanced emotion. The decoder is driven by the disentangled identity, +emotional, and content embeddings so as to generate controllable personal and +emotional styles. Finally, considering the scarcity of the 3D emotional talking +face data, we resort to the supervision of facial blendshapes, which enables +the reconstruction of plausible 3D faces from 2D emotional data, and contribute +a large-scale 3D emotional talking face dataset (3D-ETF) to train the network. +Our experiments and user studies demonstrate that our approach outperforms +state-of-the-art methods and exhibits more diverse facial movements. We +recommend watching the supplementary video: +https://ziqiaopeng.github.io/emotalk + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ CNOS: A Strong Baseline for CAD-based Novel Object Segmentation ICCV 2023 + + +
+ We propose a simple three-stage approach to segment unseen objects in RGB +images using their CAD models. Leveraging recent powerful foundation models, +DINOv2 and Segment Anything, we create descriptors and generate proposals, +including binary masks for a given input RGB image. By matching proposals with +reference descriptors created from CAD models, we achieve precise object ID +assignment along with modal masks. We experimentally demonstrate that our +method achieves state-of-the-art results in CAD-based novel object +segmentation, surpassing existing approaches on the seven core datasets of the +BOP challenge by 19.8% AP using the same BOP evaluation protocol. Our source +code is available at https://github.com/nv-nguyen/cnos. + +
+
+ comment: ICCV 2023, R6D Workshop +
+
+
+
+
+ + ♻ ☆ Face Encryption via Frequency-Restricted Identity-Agnostic Attacks + + +
+ Billions of people are sharing their daily live images on social media +everyday. However, malicious collectors use deep face recognition systems to +easily steal their biometric information (e.g., faces) from these images. Some +studies are being conducted to generate encrypted face photos using adversarial +attacks by introducing imperceptible perturbations to reduce face information +leakage. However, existing studies need stronger black-box scenario feasibility +and more natural visual appearances, which challenge the feasibility of privacy +protection. To address these problems, we propose a frequency-restricted +identity-agnostic (FRIA) framework to encrypt face images from unauthorized +face recognition without access to personal information. As for the weak +black-box scenario feasibility, we obverse that representations of the average +feature in multiple face recognition models are similar, thus we propose to +utilize the average feature via the crawled dataset from the Internet as the +target to guide the generation, which is also agnostic to identities of unknown +face recognition systems; in nature, the low-frequency perturbations are more +visually perceptible by the human vision system. Inspired by this, we restrict +the perturbation in the low-frequency facial regions by discrete cosine +transform to achieve the visual naturalness guarantee. Extensive experiments on +several face recognition models demonstrate that our FRIA outperforms other +state-of-the-art methods in generating more natural encrypted faces while +attaining high black-box attack success rates of 96%. In addition, we validate +the efficacy of FRIA using real-world black-box commercial API, which reveals +the potential of FRIA in practice. Our codes can be found in +https://github.com/XinDong10/FRIA. + +
+
+ comment: I noticed something missing in the article's description in + subsection 3.2, so I'd like to undo it and re-finalize and describe it +
+
+
+
+
+ + ♻ ☆ Undercover Deepfakes: Detecting Fake Segments in Videos ICCV 2023 + + +
+ The recent renaissance in generative models, driven primarily by the advent +of diffusion models and iterative improvement in GAN methods, has enabled many +creative applications. However, each advancement is also accompanied by a rise +in the potential for misuse. In the arena of the deepfake generation, this is a +key societal issue. In particular, the ability to modify segments of videos +using such generative techniques creates a new paradigm of deepfakes which are +mostly real videos altered slightly to distort the truth. This paradigm has +been under-explored by the current deepfake detection methods in the academic +literature. In this paper, we present a deepfake detection method that can +address this issue by performing deepfake prediction at the frame and video +levels. To facilitate testing our method, we prepared a new benchmark dataset +where videos have both real and fake frame sequences with very subtle +transitions. We provide a benchmark on the proposed dataset with our detection +method which utilizes the Vision Transformer based on Scaling and Shifting to +learn spatial features, and a Timeseries Transformer to learn temporal features +of the videos to help facilitate the interpretation of possible deepfakes. +Extensive experiments on a variety of deepfake generation methods show +excellent results by the proposed method on temporal segmentation and classical +video-level predictions as well. In particular, the paradigm we address will +form a powerful tool for the moderation of deepfakes, where human oversight can +be better targeted to the parts of videos suspected of being deepfakes. All +experiments can be reproduced at: +github.com/rgb91/temporal-deepfake-segmentation. + +
+
+ comment: ICCV 2023 Workshop and Challenge on DeepFake Analysis and Detection +
+
+
+
+
+ + ♻ ☆ Effective Whole-body Pose Estimation with Two-stages Distillation ICCV 2023 + + +
+ Whole-body pose estimation localizes the human body, hand, face, and foot +keypoints in an image. This task is challenging due to multi-scale body parts, +fine-grained localization for low-resolution regions, and data scarcity. +Meanwhile, applying a highly efficient and accurate pose estimator to widely +human-centric understanding and generation tasks is urgent. In this work, we +present a two-stage pose \textbf{D}istillation for \textbf{W}hole-body +\textbf{P}ose estimators, named \textbf{DWPose}, to improve their effectiveness +and efficiency. The first-stage distillation designs a weight-decay strategy +while utilizing a teacher's intermediate feature and final logits with both +visible and invisible keypoints to supervise the student from scratch. The +second stage distills the student model itself to further improve performance. +Different from the previous self-knowledge distillation, this stage finetunes +the student's head with only 20% training time as a plug-and-play training +strategy. For data limitations, we explore the UBody dataset that contains +diverse facial expressions and hand gestures for real-life applications. +Comprehensive experiments show the superiority of our proposed simple yet +effective methods. We achieve new state-of-the-art performance on +COCO-WholeBody, significantly boosting the whole-body AP of RTMPose-l from +64.8% to 66.5%, even surpassing RTMPose-x teacher with 65.3% AP. We release a +series of models with different sizes, from tiny to large, for satisfying +various downstream tasks. Our codes and models are available at +https://github.com/IDEA-Research/DWPose. + +
+
+ comment: Accepted by ICCV 2023, CV4Metaverse Workshop +
+
+
+
+
+ + ♻ ☆ Few-shot $\mathbf{1/a}$ Anomalies Feedback : Damage Vision Mining + Opportunity and Embedding Feature Imbalance + + +
+ In past decade, previous balanced datasets have been used to advance deep +learning algorithms in industrial damage vision tasks. Specifically, for +condition-based maintenance, automating visual inspection is crucial to ensure +high quality. Damage vision mining cannot avoid the imbalanced data issue +because of rare unseen events and high quality status by improved operations. +For visual damage inspection, deteriorated class acquired from the surface of +concrete and steel components are occasionally imbalanced. From numerous +related surveys, we summarize that imbalanced data problems can be categorized +into four types; 1) missing range of target and label valuables, 2) +majority-minority class imbalance, 3) foreground-background of spatial +imbalance, 4) long-tailed class of pixel-wise imbalance. Since 2015, there has +been many imbalanced studies using deep learning approaches that includes +regression, image classification, object detection, semantic segmentation. +However, anomaly detection for imbalanced data is not yet well known. In the +study, we highlight one-class anomaly detection application whether anomalous +class or not, and demonstrate clear examples on imbalanced vision datasets: +medical disease, hazardous behavior, material deterioration, plant disease, +river sludge, and disaster damage. We provide key results on damage vision +mining advantage, hypothesizing that the more effective range of positive +ratio, the higher accuracy gain of anomalies feedback. In our imbalanced +studies, compared with the balanced case of positive ratio 1/1, we find that +there is applicable positive ratio $1/a$, where the accuracy are consistently +high. However, extremely imbalanced range from one-shot to $1/2a$, whose +accuracy are inferior to those of applicable ratio. In contrast, ranged with +positive ratio over $2/a$, it is shifting in over-mining phase without +effective gain of accuracy. + +
+
+ comment: 34 pages, 53 figures, 28 tables +
+
+
+
+
+ + ♻ ☆ NVAutoNet: Fast and Accurate 360$^{\circ}$ 3D Visual Perception For Self + Driving + + +
+ Robust, real-time perception of 3D world is essential to the autonomous +vehicle. We introduce an end-to-end surround camera perception system, named +NVAutoNet, for self-driving. NVAutoNet is a multi-task, multi-camera network +which takes a variable set of time-synced camera images as input and produces a +rich collection of 3D signals such as sizes, orientations, locations of +obstacles, parking spaces and free-spaces, etc. NVAutoNet is modular and +end-to-end: 1) the outputs can be consumed directly by downstream modules +without any post-processing such as clustering and fusion -- improving speed of +model deployment and in-car testing 2) the whole network training is done in +one single stage -- improving speed of model improvement and iterations. The +network is carefully designed to have high accuracy while running at 53 fps on +NVIDIA Orin SoC (system-on-a-chip). The network is robust to sensor mounting +variations (within some tolerances) and can be quickly customized for different +vehicle types via efficient model fine-tuning. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Prototype Adapter for Vision-Language Models + + +
+ Recently, large-scale pre-trained vision-language models (e.g. CLIP and +ALIGN) have demonstrated remarkable effectiveness in acquiring transferable +visual representations. To leverage the valuable knowledge encoded within these +models for downstream tasks, several fine-tuning approaches, including prompt +tuning methods and adapter-based methods, have been developed to adapt +vision-language models effectively with supervision. However, these methods +rely on the availability of annotated samples, which can be labor-intensive and +time-consuming to acquire, thus limiting scalability. To address this issue, in +this work, we design an unsupervised fine-tuning approach for vision-language +models called Unsupervised Prototype Adapter (UP-Adapter). Specifically, for +the unannotated target datasets, we leverage the text-image aligning capability +of CLIP to automatically select the most confident samples for each class. +Utilizing these selected samples, we generate class prototypes, which serve as +the initialization for the learnable prototype model. After fine-tuning, the +prototype model prediction is combined with the original CLIP's prediction by a +residual connection to perform downstream recognition tasks. Our extensive +experimental results on image recognition and domain generalization show that +the proposed unsupervised method outperforms 8-shot CoOp, 8-shot Tip-Adapter, +and also the state-of-the-art UPL method by large margins. + +
+
+ comment: Accepted by PRCV 2023 +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ On the Practicality of Dynamic Updates in Fast Searchable Encryption + + +
+ Searchable encrypted (SE) indexing systems are a useful tool for utilizing +cloud services to store and manage sensitive information. However, much of the +work on SE systems to date has remained theoretical. In order to make them of +practical use, more work is needed to develop optimal protocols and working +models for them. This includes, in particular, the creation of a working update +model in order to maintain an encrypted index of a dynamic document set such as +an email inbox. I have created a working, real-world end-to-end SE +implementation that satisfies these needs, including the first empirical +performance evaluation of the dynamic SE update operation. In doing so, I show +a viable path to move from the theoretical concepts described by previous +researchers to a future production-worthy implementation and identify issues +for follow-on investigation. + +
+
+
+
+
+ + ☆ Leveraging Knowledge and Reinforcement Learning for Enhanced Reliability + of Language Models CIKM'23 + + +
+ The Natural Language Processing(NLP) community has been using crowd sourcing +techniques to create benchmark datasets such as General Language Understanding +and Evaluation(GLUE) for training modern Language Models such as BERT. GLUE +tasks measure the reliability scores using inter annotator metrics i.e. Cohens +Kappa. However, the reliability aspect of LMs has often been overlooked. To +counter this problem, we explore a knowledge-guided LM ensembling approach that +leverages reinforcement learning to integrate knowledge from ConceptNet and +Wikipedia as knowledge graph embeddings. This approach mimics human annotators +resorting to external knowledge to compensate for information deficits in the +datasets. Across nine GLUE datasets, our research shows that ensembling +strengthens reliability and accuracy scores, outperforming state of the art. + +
+
+ comment: Accepted at CIKM'23 +
+
+
+
+
+ + ☆ A Bayesian Active Learning Approach to Comparative Judgement + + +
+ Assessment is a crucial part of education. Traditional marking is a source of +inconsistencies and unconscious bias, placing a high cognitive load on the +assessors. An approach to address these issues is comparative judgement (CJ). +In CJ, the assessor is presented with a pair of items and is asked to select +the better one. Following a series of comparisons, a rank is derived using a +ranking model, for example, the BTM, based on the results. While CJ is +considered a reliable method for marking, there are concerns around +transparency, and the ideal number of pairwise comparisons to generate a +reliable estimation of the rank order is not known. Additionally, there have +been attempts to generate a method of selecting pairs that should be compared +next in an informative manner, but some existing methods are known to have +created their own bias within results inflating the reliability metric used. As +a result, a random selection approach is usually deployed. + We propose a novel Bayesian approach to CJ (BCJ) for determining the ranks of +compared items alongside a new way to select the pairs to present to the +marker(s) using active learning (AL), addressing the key shortcomings of +traditional CJ. Furthermore, we demonstrate how the entire approach may provide +transparency by providing the user insights into how it is making its decisions +and, at the same time, being more efficient. Results from our experiments +confirm that the proposed BCJ combined with entropy-driven AL pair-selection +method is superior to other alternatives. We also find that the more +comparisons done, the more accurate BCJ becomes, which solves the issue the +current method has of the model deteriorating if too many comparisons are +performed. As our approach can generate the complete predicted rank +distribution for an item, we also show how this can be utilised in devising a +predicted grade, guided by the assessor. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Learning and Optimization of Implicit Negative Feedback for Industrial + Short-video Recommender System CIKM'23 + + +
+ Short-video recommendation is one of the most important recommendation +applications in today's industrial information systems. Compared with other +recommendation tasks, the enormous amount of feedback is the most typical +characteristic. Specifically, in short-video recommendation, the +easiest-to-collect user feedback is from the skipping behaviors, which leads to +two critical challenges for the recommendation model. First, the skipping +behavior reflects implicit user preferences, and thus it is challenging for +interest extraction. Second, the kind of special feedback involves multiple +objectives, such as total watching time, which is also very challenging. In +this paper, we present our industrial solution in Kuaishou, which serves +billion-level users every day. Specifically, we deploy a feedback-aware +encoding module which well extracts user preference taking the impact of +context into consideration. We further design a multi-objective prediction +module which well distinguishes the relation and differences among different +model objectives in the short-video recommendation. We conduct extensive online +A/B testing, along with detailed and careful analysis, which verifies the +effectiveness of our solution. + +
+
+ comment: Accepted by CIKM'23 +
+
+
+
+
+ + ☆ Optimizing Group-Fair Plackett-Luce Ranking Models for Relevance and + Ex-Post Fairness + + +
+ In learning-to-rank (LTR), optimizing only the relevance (or the expected +ranking utility) can cause representational harm to certain categories of +items. Moreover, if there is implicit bias in the relevance scores, LTR models +may fail to optimize for true relevance. Previous works have proposed efficient +algorithms to train stochastic ranking models that achieve fairness of exposure +to the groups ex-ante (or, in expectation), which may not guarantee +representation fairness to the groups ex-post, that is, after realizing a +ranking from the stochastic ranking model. Typically, ex-post fairness is +achieved by post-processing, but previous work does not train stochastic +ranking models that are aware of this post-processing. + In this paper, we propose a novel objective that maximizes expected relevance +only over those rankings that satisfy given representation constraints to +ensure ex-post fairness. Building upon recent work on an efficient sampler for +ex-post group-fair rankings, we propose a group-fair Plackett-Luce model and +show that it can be efficiently optimized for our objective in the LTR +framework. + Experiments on three real-world datasets show that our group-fair algorithm +guarantees fairness alongside usually having better relevance compared to the +LTR baselines. In addition, our algorithm also achieves better relevance than +post-processing baselines, which also ensures ex-post fairness. Further, when +implicit bias is injected into the training data, our algorithm typically +outperforms existing LTR baselines in relevance. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ MMBAttn: Max-Mean and Bit-wise Attention for CTR Prediction + + +
+ With the increasing complexity and scale of click-through rate (CTR) +prediction tasks in online advertising and recommendation systems, accurately +estimating the importance of features has become a critical aspect of +developing effective models. In this paper, we propose an attention-based +approach that leverages max and mean pooling operations, along with a bit-wise +attention mechanism, to enhance feature importance estimation in CTR +prediction. Traditionally, pooling operations such as max and mean pooling have +been widely used to extract relevant information from features. However, these +operations can lead to information loss and hinder the accurate determination +of feature importance. To address this challenge, we propose a novel attention +architecture that utilizes a bit-based attention structure that emphasizes the +relationships between all bits in features, together with maximum and mean +pooling. By considering the fine-grained interactions at the bit level, our +method aims to capture intricate patterns and dependencies that might be +overlooked by traditional pooling operations. To examine the effectiveness of +the proposed method, experiments have been conducted on three public datasets. +The experiments demonstrated that the proposed method significantly improves +the performance of the base models to achieve state-of-the-art results. + +
+
+
+
+
+ + ☆ LSTM-based QoE Evaluation for Web Microservices' Reputation Scoring + + +
+ Sentiment analysis is the task of mining the authors' opinions about specific +entities. It allows organizations to monitor different services in real time +and act accordingly. Reputation is what is generally said or believed about +people or things. Informally, reputation combines the measure of reliability +derived from feedback, reviews, and ratings gathered from users, which reflect +their quality of experience (QoE) and can either increase or harm the +reputation of the provided services. In this study, we propose to perform +sentiment analysis on web microservices reviews to exploit the provided +information to assess and score the microservices' reputation. Our proposed +approach uses the Long Short-Term Memory (LSTM) model to perform sentiment +analysis and the Net Brand Reputation (NBR) algorithm to assess reputation +scores for microservices. This approach is tested on a set of more than 10,000 +reviews related to 15 Amazon Web microservices, and the experimental results +have shown that our approach is more accurate than existing approaches, with an +accuracy and precision of 93% obtained after applying an oversampling strategy +and a resulting reputation score of the considered microservices community of +89%. + +
+
+
+
+
+ + ☆ A Bayesian Active Learning Approach to Comparative Judgement + + +
+ Assessment is a crucial part of education. Traditional marking is a source of +inconsistencies and unconscious bias, placing a high cognitive load on the +assessors. An approach to address these issues is comparative judgement (CJ). +In CJ, the assessor is presented with a pair of items and is asked to select +the better one. Following a series of comparisons, a rank is derived using a +ranking model, for example, the BTM, based on the results. While CJ is +considered a reliable method for marking, there are concerns around +transparency, and the ideal number of pairwise comparisons to generate a +reliable estimation of the rank order is not known. Additionally, there have +been attempts to generate a method of selecting pairs that should be compared +next in an informative manner, but some existing methods are known to have +created their own bias within results inflating the reliability metric used. As +a result, a random selection approach is usually deployed. + We propose a novel Bayesian approach to CJ (BCJ) for determining the ranks of +compared items alongside a new way to select the pairs to present to the +marker(s) using active learning (AL), addressing the key shortcomings of +traditional CJ. Furthermore, we demonstrate how the entire approach may provide +transparency by providing the user insights into how it is making its decisions +and, at the same time, being more efficient. Results from our experiments +confirm that the proposed BCJ combined with entropy-driven AL pair-selection +method is superior to other alternatives. We also find that the more +comparisons done, the more accurate BCJ becomes, which solves the issue the +current method has of the model deteriorating if too many comparisons are +performed. As our approach can generate the complete predicted rank +distribution for an item, we also show how this can be utilised in devising a +predicted grade, guided by the assessor. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Model-free Reinforcement Learning with Stochastic Reward Stabilization + for Recommender Systems SIGIR '23 + + +
+ Model-free RL-based recommender systems have recently received increasing +research attention due to their capability to handle partial feedback and +long-term rewards. However, most existing research has ignored a critical +feature in recommender systems: one user's feedback on the same item at +different times is random. The stochastic rewards property essentially differs +from that in classic RL scenarios with deterministic rewards, which makes +RL-based recommender systems much more challenging. In this paper, we first +demonstrate in a simulator environment where using direct stochastic feedback +results in a significant drop in performance. Then to handle the stochastic +feedback more efficiently, we design two stochastic reward stabilization +frameworks that replace the direct stochastic feedback with that learned by a +supervised model. Both frameworks are model-agnostic, i.e., they can +effectively utilize various supervised models. We demonstrate the superiority +of the proposed frameworks over different RL-based recommendation baselines +with extensive experiments on a recommendation simulator as well as an +industrial-level recommender system. + +
+
+ comment: SIGIR '23 +
+
+
+
+
+ + ☆ Large Language Models in Analyzing Crash Narratives -- A Comparative + Study of ChatGPT, BARD and GPT-4 + + +
+ In traffic safety research, extracting information from crash narratives +using text analysis is a common practice. With recent advancements of large +language models (LLM), it would be useful to know how the popular LLM +interfaces perform in classifying or extracting information from crash +narratives. To explore this, our study has used the three most popular publicly +available LLM interfaces- ChatGPT, BARD and GPT4. This study investigated their +usefulness and boundaries in extracting information and answering queries +related to accidents from 100 crash narratives from Iowa and Kansas. During the +investigation, their capabilities and limitations were assessed and their +responses to the queries were compared. Five questions were asked related to +the narratives: 1) Who is at-fault? 2) What is the manner of collision? 3) Has +the crash occurred in a work-zone? 4) Did the crash involve pedestrians? and 5) +What are the sequence of harmful events in the crash? For questions 1 through +4, the overall similarity among the LLMs were 70%, 35%, 96% and 89%, +respectively. The similarities were higher while answering direct questions +requiring binary responses and significantly lower for complex questions. To +compare the responses to question 5, network diagram and centrality measures +were analyzed. The network diagram from the three LLMs were not always similar +although they sometimes have the same influencing events with high in-degree, +out-degree and betweenness centrality. This study suggests using multiple +models to extract viable information from narratives. Also, caution must be +practiced while using these interfaces to obtain crucial safety related +information. + +
+
+
+
+
+ + ♻ ☆ Replace Scoring with Arrangement: A Contextual Set-to-Arrangement + Framework for Learning-to-Rank CIKM 2023 + + +
+ Learning-to-rank is a core technique in the top-N recommendation task, where +an ideal ranker would be a mapping from an item set to an arrangement (a.k.a. +permutation). Most existing solutions fall in the paradigm of probabilistic +ranking principle (PRP), i.e., first score each item in the candidate set and +then perform a sort operation to generate the top ranking list. However, these +approaches neglect the contextual dependence among candidate items during +individual scoring, and the sort operation is non-differentiable. To bypass the +above issues, we propose Set-To-Arrangement Ranking (STARank), a new framework +directly generates the permutations of the candidate items without the need for +individually scoring and sort operations; and is end-to-end differentiable. As +a result, STARank can operate when only the ground-truth permutations are +accessible without requiring access to the ground-truth relevance scores for +items. For this purpose, STARank first reads the candidate items in the context +of the user browsing history, whose representations are fed into a +Plackett-Luce module to arrange the given items into a list. To effectively +utilize the given ground-truth permutations for supervising STARank, we +leverage the internal consistency property of Plackett-Luce models to derive a +computationally efficient list-wise loss. Experimental comparisons against 9 +the state-of-the-art methods on 2 learning-to-rank benchmark datasets and 3 +top-N real-world recommendation datasets demonstrate the superiority of STARank +in terms of conventional ranking metrics. Notice that these ranking metrics do +not consider the effects of the contextual dependence among the items in the +list, we design a new family of simulation-based ranking metrics, where +existing metrics can be regarded as special cases. STARank can consistently +achieve better performance in terms of PBM and UBM simulation-based metrics. + +
+
+ comment: CIKM 2023 +
+
+
+
+
+
+
+
+ + Machine Learning 106 + +
+
+
+ + ☆ Unveiling the Role of Message Passing in Dual-Privacy Preservation on + GNNs CIKM 2023 + + +
+ Graph Neural Networks (GNNs) are powerful tools for learning representations +on graphs, such as social networks. However, their vulnerability to privacy +inference attacks restricts their practicality, especially in high-stake +domains. To address this issue, privacy-preserving GNNs have been proposed, +focusing on preserving node and/or link privacy. This work takes a step back +and investigates how GNNs contribute to privacy leakage. Through theoretical +analysis and simulations, we identify message passing under structural bias as +the core component that allows GNNs to \textit{propagate} and \textit{amplify} +privacy leakage. Building upon these findings, we propose a principled +privacy-preserving GNN framework that effectively safeguards both node and link +privacy, referred to as dual-privacy preservation. The framework comprises +three major modules: a Sensitive Information Obfuscation Module that removes +sensitive information from node embeddings, a Dynamic Structure Debiasing +Module that dynamically corrects the structural bias, and an Adversarial +Learning Module that optimizes the privacy-utility trade-off. Experimental +results on four benchmark datasets validate the effectiveness of the proposed +model in protecting both node and link privacy while preserving high utility +for downstream tasks, such as node classification. + +
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ☆ Does Asking Clarifying Questions Increases Confidence in Generated Code? + On the Communication Skills of Large Language Models + + +
+ Large language models (LLMs) have significantly improved the ability to +perform tasks in the field of code generation. However, there is still a gap +between LLMs being capable coders and being top-tier software engineers. Based +on the observation that top-level software engineers often ask clarifying +questions to reduce ambiguity in both requirements and coding solutions, we +argue that the same should be applied to LLMs for code generation tasks. By +asking probing questions in various topics before generating the final code, +the challenges of programming with LLMs, such as unclear intent specification, +lack of computational thinking, and undesired code quality, may be alleviated. +This, in turn, increases confidence in the generated code. In this work, we +explore how to leverage better communication skills to achieve greater +confidence in generated code. We propose a communication-centered process that +uses an LLM-generated communicator to identify issues with high ambiguity or +low confidence in problem descriptions and generated code. We then ask +clarifying questions to obtain responses from users for refining the code. + +
+
+
+
+
+ + ☆ A2Q: Accumulator-Aware Quantization with Guaranteed Overflow Avoidance + + +
+ We present accumulator-aware quantization (A2Q), a novel weight quantization +method designed to train quantized neural networks (QNNs) to avoid overflow +when using low-precision accumulators during inference. A2Q introduces a unique +formulation inspired by weight normalization that constrains the L1-norm of +model weights according to accumulator bit width bounds that we derive. Thus, +in training QNNs for low-precision accumulation, A2Q also inherently promotes +unstructured weight sparsity to guarantee overflow avoidance. We apply our +method to deep learning-based computer vision tasks to show that A2Q can train +QNNs for low-precision accumulators while maintaining model accuracy +competitive with a floating-point baseline. In our evaluations, we consider the +impact of A2Q on both general-purpose platforms and programmable hardware. +However, we primarily target model deployment on FPGAs because they can be +programmed to fully exploit custom accumulator bit widths. Our experimentation +shows accumulator bit width significantly impacts the resource efficiency of +FPGA-based accelerators. On average across our benchmarks, A2Q offers up to a +2.3x reduction in resource utilization over 32-bit accumulator counterparts +with 99.2% of the floating-point model accuracy. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2301.13376 +
+
+
+
+
+ + ☆ Escaping the Sample Trap: Fast and Accurate Epistemic Uncertainty + Estimation with Pairwise-Distance Estimators + + +
+ This work introduces a novel approach for epistemic uncertainty estimation +for ensemble models using pairwise-distance estimators (PaiDEs). These +estimators utilize the pairwise-distance between model components to establish +bounds on entropy and uses said bounds as estimates for information-based +criterion. Unlike recent deep learning methods for epistemic uncertainty +estimation, which rely on sample-based Monte Carlo estimators, PaiDEs are able +to estimate epistemic uncertainty up to 100$\times$ faster, over a larger space +(up to 100$\times$) and perform more accurately in higher dimensions. To +validate our approach, we conducted a series of experiments commonly used to +evaluate epistemic uncertainty estimation: 1D sinusoidal data, Pendulum-v0, +Hopper-v2, Ant-v2 and Humanoid-v2. For each experimental setting, an Active +Learning framework was applied to demonstrate the advantages of PaiDEs for +epistemic uncertainty estimation. + +
+
+
+
+
+ + ☆ Ngambay-French Neural Machine Translation (sba-Fr) + + +
+ In Africa, and the world at large, there is an increasing focus on developing +Neural Machine Translation (NMT) systems to overcome language barriers. NMT for +Low-resource language is particularly compelling as it involves learning with +limited labelled data. However, obtaining a well-aligned parallel corpus for +low-resource languages can be challenging. The disparity between the +technological advancement of a few global languages and the lack of research on +NMT for local languages in Chad is striking. End-to-end NMT trials on +low-resource Chad languages have not been attempted. Additionally, there is a +dearth of online and well-structured data gathering for research in Natural +Language Processing, unlike some African languages. However, a guided approach +for data gathering can produce bitext data for many Chadian language +translation pairs with well-known languages that have ample data. In this +project, we created the first sba-Fr Dataset, which is a corpus of +Ngambay-to-French translations, and fine-tuned three pre-trained models using +this dataset. Our experiments show that the M2M100 model outperforms other +models with high BLEU scores on both original and original+synthetic data. The +publicly available bitext dataset can be used for research purposes. + +
+
+ comment: Accepted at RANLP 2023 - International Workshop NLP tools and + resources for translation and interpreting applications +
+
+
+
+
+ + ☆ TpuGraphs: A Performance Prediction Dataset on Large Tensor + Computational Graphs + + +
+ Precise hardware performance models play a crucial role in code +optimizations. They can assist compilers in making heuristic decisions or aid +autotuners in identifying the optimal configuration for a given program. For +example, the autotuner for XLA, a machine learning compiler, discovered 10-20% +speedup on state-of-the-art models serving substantial production traffic at +Google. Although there exist a few datasets for program performance prediction, +they target small sub-programs such as basic blocks or kernels. This paper +introduces TpuGraphs, a performance prediction dataset on full tensor programs, +represented as computational graphs, running on Tensor Processing Units (TPUs). +Each graph in the dataset represents the main computation of a machine learning +workload, e.g., a training epoch or an inference step. Each data sample +contains a computational graph, a compilation configuration, and the execution +time of the graph when compiled with the configuration. The graphs in the +dataset are collected from open-source machine learning programs, featuring +popular model architectures, e.g., ResNet, EfficientNet, Mask R-CNN, and +Transformer. TpuGraphs provides 25x more graphs than the largest graph property +prediction dataset (with comparable graph sizes), and 770x larger graphs on +average compared to existing performance prediction datasets on machine +learning programs. This graph-level prediction task on large graphs introduces +new challenges in learning, ranging from scalability, training efficiency, to +model quality. + +
+
+
+
+
+ + ☆ Staleness-Alleviated Distributed GNN Training via Online + Dynamic-Embedding Prediction + + +
+ Despite the recent success of Graph Neural Networks (GNNs), it remains +challenging to train GNNs on large-scale graphs due to neighbor explosions. As +a remedy, distributed computing becomes a promising solution by leveraging +abundant computing resources (e.g., GPU). However, the node dependency of graph +data increases the difficulty of achieving high concurrency in distributed GNN +training, which suffers from the massive communication overhead. To address it, +Historical value approximation is deemed a promising class of distributed +training techniques. It utilizes an offline memory to cache historical +information (e.g., node embedding) as an affordable approximation of the exact +value and achieves high concurrency. However, such benefits come at the cost of +involving dated training information, leading to staleness, imprecision, and +convergence issues. To overcome these challenges, this paper proposes SAT +(Staleness-Alleviated Training), a novel and scalable distributed GNN training +framework that reduces the embedding staleness adaptively. The key idea of SAT +is to model the GNN's embedding evolution as a temporal graph and build a model +upon it to predict future embedding, which effectively alleviates the staleness +of the cached historical embedding. We propose an online algorithm to train the +embedding predictor and the distributed GNN alternatively and further provide a +convergence analysis. Empirically, we demonstrate that SAT can effectively +reduce embedding staleness and thus achieve better performance and convergence +speed on multiple large-scale graph datasets. + +
+
+ comment: Preprint. Do not distribute. arXiv admin note: text overlap with + arXiv:2206.00057 +
+
+
+
+
+ + ☆ Learning to Intervene on Concept Bottlenecks + + +
+ While traditional deep learning models often lack interpretability, concept +bottleneck models (CBMs) provide inherent explanations via their concept +representations. Specifically, they allow users to perform interventional +interactions on these concepts by updating the concept values and thus +correcting the predictive output of the model. Traditionally, however, these +interventions are applied to the model only once and discarded afterward. To +rectify this, we present concept bottleneck memory models (CB2M), an extension +to CBMs. Specifically, a CB2M learns to generalize interventions to appropriate +novel situations via a two-fold memory with which it can learn to detect +mistakes and to reapply previous interventions. In this way, a CB2M learns to +automatically improve model performance from a few initially obtained +interventions. If no prior human interventions are available, a CB2M can detect +potential mistakes of the CBM bottleneck and request targeted interventions. In +our experimental evaluations on challenging scenarios like handling +distribution shifts and confounded training data, we illustrate that CB2M are +able to successfully generalize interventions to unseen data and can indeed +identify wrongly inferred concepts. Overall, our results show that CB2M is a +great tool for users to provide interactive feedback on CBMs, e.g., by guiding +a user's interaction and requiring fewer interventions. + +
+
+
+
+
+ + ☆ Gotta match 'em all: Solution diversification in graph matching matched + filters + + +
+ We present a novel approach for finding multiple noisily embedded template +graphs in a very large background graph. Our method builds upon the +graph-matching-matched-filter technique proposed in Sussman et al., with the +discovery of multiple diverse matchings being achieved by iteratively +penalizing a suitable node-pair similarity matrix in the matched filter +algorithm. In addition, we propose algorithmic speed-ups that greatly enhance +the scalability of our matched-filter approach. We present theoretical +justification of our methodology in the setting of correlated Erdos-Renyi +graphs, showing its ability to sequentially discover multiple templates under +mild model conditions. We additionally demonstrate our method's utility via +extensive experiments both using simulated models and real-world dataset, +include human brain connectomes and a large transactional knowledge base. + +
+
+ comment: 36 pages, 12 figures, 1 table +
+
+
+
+
+ + ☆ Six Lectures on Linearized Neural Networks + + +
+ In these six lectures, we examine what can be learnt about the behavior of +multi-layer neural networks from the analysis of linear models. We first recall +the correspondence between neural networks and linear models via the so-called +lazy regime. We then review four models for linearized neural networks: linear +regression with concentrated features, kernel ridge regression, random feature +model and neural tangent model. Finally, we highlight the limitations of the +linear theory and discuss how other approaches can overcome them. + +
+
+ comment: 77 pages, 8 figures +
+
+
+
+
+ + ☆ Nougat: Neural Optical Understanding for Academic Documents + + +
+ Scientific knowledge is predominantly stored in books and scientific +journals, often in the form of PDFs. However, the PDF format leads to a loss of +semantic information, particularly for mathematical expressions. We propose +Nougat (Neural Optical Understanding for Academic Documents), a Visual +Transformer model that performs an Optical Character Recognition (OCR) task for +processing scientific documents into a markup language, and demonstrate the +effectiveness of our model on a new dataset of scientific documents. The +proposed approach offers a promising solution to enhance the accessibility of +scientific knowledge in the digital age, by bridging the gap between +human-readable documents and machine-readable text. We release the models and +code to accelerate future work on scientific text recognition. + +
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ An investigation into the impact of deep learning model choice on sex + and race bias in cardiac MR segmentation + + +
+ In medical imaging, artificial intelligence (AI) is increasingly being used +to automate routine tasks. However, these algorithms can exhibit and exacerbate +biases which lead to disparate performances between protected groups. We +investigate the impact of model choice on how imbalances in subject sex and +race in training datasets affect AI-based cine cardiac magnetic resonance image +segmentation. We evaluate three convolutional neural network-based models and +one vision transformer model. We find significant sex bias in three of the four +models and racial bias in all of the models. However, the severity and nature +of the bias varies between the models, highlighting the importance of model +choice when attempting to train fair AI-based segmentation models for medical +imaging tasks. + +
+
+
+
+
+ + ☆ Using Visual and Vehicular Sensors for Driver Behavior Analysis: A + Survey + + +
+ Risky drivers account for 70% of fatal accidents in the United States. With +recent advances in sensors and intelligent vehicular systems, there has been +significant research on assessing driver behavior to improve driving +experiences and road safety. This paper examines the various techniques used to +analyze driver behavior using visual and vehicular data, providing an overview +of the latest research in this field. The paper also discusses the challenges +and open problems in the field and offers potential recommendations for future +research. The survey concludes that integrating vision and vehicular +information can significantly enhance the accuracy and effectiveness of driver +behavior analysis, leading to improved safety measures and reduced traffic +accidents. + +
+
+ comment: 10 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ EntropyRank: Unsupervised Keyphrase Extraction via Side-Information + Optimization for Language Model-based Text Compression + + +
+ We propose an unsupervised method to extract keywords and keyphrases from +texts based on a pre-trained language model (LM) and Shannon's information +maximization. Specifically, our method extracts phrases having the highest +conditional entropy under the LM. The resulting set of keyphrases turns out to +solve a relevant information-theoretic problem: if provided as side +information, it leads to the expected minimal binary code length in compressing +the text using the LM and an entropy encoder. Alternately, the resulting set is +an approximation via a causal LM to the set of phrases that minimize the +entropy of the text when conditioned upon it. Empirically, the method provides +results comparable to the most commonly used methods in various keyphrase +extraction benchmark challenges. + +
+
+
+
+
+ + ☆ TFDNet: Time-Frequency Enhanced Decomposed Network for Long-term Time + Series Forecasting + + +
+ Long-term time series forecasting is a vital task and has a wide range of +real applications. Recent methods focus on capturing the underlying patterns +from one single domain (e.g. the time domain or the frequency domain), and have +not taken a holistic view to process long-term time series from the +time-frequency domains. In this paper, we propose a Time-Frequency Enhanced +Decomposed Network (TFDNet) to capture both the long-term underlying patterns +and temporal periodicity from the time-frequency domain. In TFDNet, we devise a +multi-scale time-frequency enhanced encoder backbone and develop two separate +trend and seasonal time-frequency blocks to capture the distinct patterns +within the decomposed trend and seasonal components in multi-resolutions. +Diverse kernel learning strategies of the kernel operations in time-frequency +blocks have been explored, by investigating and incorporating the potential +different channel-wise correlation patterns of multivariate time series. +Experimental evaluation of eight datasets from five benchmark domains +demonstrated that TFDNet is superior to state-of-the-art approaches in both +effectiveness and efficiency. + +
+
+
+
+
+ + ☆ In-context learning for model-free system identification + + +
+ In traditional system identification, we estimate a model of an unknown +dynamical system based on given input/output sequences and available physical +knowledge. Yet, is it also possible to understand the intricacies of dynamical +systems not solely from their input/output patterns, but by observing the +behavior of other systems within the same class? This central question drives +the study presented in this paper. + In response to this query, we introduce a novel paradigm for system +identification, addressing two primary tasks: one-step-ahead prediction and +multi-step simulation. Unlike conventional methods, we do not directly estimate +a model for the specific system. Instead, we pretrain a meta model that +represents a class of dynamical systems. This meta model is trained from a +potentially infinite stream of synthetic data, generated by systems randomly +extracted from a certain distribution. At its core, the meta model serves as an +implicit representation of the main characteristics of a class of dynamical +systems. When provided with a brief context from a new system - specifically, a +short input/output sequence - the meta model implicitly discerns its dynamics, +enabling predictions of its behavior. + The proposed approach harnesses the power of Transformer architectures, +renowned for their in-context learning capabilities in Natural Language +Processing tasks. For one-step prediction, a GPT-like decoder-only architecture +is utilized, whereas the simulation problem employs an encoder-decoder +structure. + Initial experimental results affirmatively answer our foundational question, +opening doors to fresh research avenues in system identification. + +
+
+
+
+
+ + ☆ EOG Artifact Removal from Single and Multi-channel EEG Recordings + through the combination of Long Short-Term Memory Networks and Independent + Component Analysis + + +
+ Introduction: Electroencephalogram (EEG) signals have gained significant +popularity in various applications due to their rich information content. +However, these signals are prone to contamination from various sources of +artifacts, notably the electrooculogram (EOG) artifacts caused by eye +movements. The most effective approach to mitigate EOG artifacts involves +recording EOG signals simultaneously with EEG and employing blind source +separation techniques, such as independent component analysis (ICA). +Nevertheless, the availability of EOG recordings is not always feasible, +particularly in pre-recorded datasets. Objective: In this paper, we present a +novel methodology that combines a long short-term memory (LSTM)-based neural +network with ICA to address the challenge of EOG artifact removal from +contaminated EEG signals. Approach: Our approach aims to accomplish two primary +objectives: 1) estimate the horizontal and vertical EOG signals from the +contaminated EEG data, and 2) employ ICA to eliminate the estimated EOG signals +from the EEG, thereby producing an artifact-free EEG signal. Main results: To +evaluate the performance of our proposed method, we conducted experiments on a +publicly available dataset comprising recordings from 27 participants. We +employed well-established metrics such as mean squared error, mean absolute +error, and mean error to assess the quality of our artifact removal technique. +Significance: Furthermore, we compared the performance of our approach with two +state-of-the-art deep learning-based methods reported in the literature, +demonstrating the superior performance of our proposed methodology. + +
+
+
+
+
+ + ☆ A topological model for partial equivariance in deep learning and data + analysis + + +
+ In this article, we propose a topological model to encode partial +equivariance in neural networks. To this end, we introduce a class of +operators, called P-GENEOs, that change data expressed by measurements, +respecting the action of certain sets of transformations, in a non-expansive +way. If the set of transformations acting is a group, then we obtain the +so-called GENEOs. We then study the spaces of measurements, whose domains are +subject to the action of certain self-maps, and the space of P-GENEOs between +these spaces. We define pseudo-metrics on them and show some properties of the +resulting spaces. In particular, we show how such spaces have convenient +approximation and convexity properties. + +
+
+
+
+
+ + ☆ On the Impact of Language Selection for Training and Evaluating + Programming Language Models SC + + +
+ The recent advancements in Transformer-based Language Models have +demonstrated significant potential in enhancing the multilingual capabilities +of these models. The remarkable progress made in this domain not only applies +to natural language tasks but also extends to the domain of programming +languages. Despite the ability of these models to learn from multiple +languages, evaluations typically focus on particular combinations of the same +languages. In this study, we evaluate the similarity of programming languages +by analyzing their representations using a CodeBERT-based model. Our +experiments reveal that token representation in languages such as C++, Python, +and Java exhibit proximity to one another, whereas the same tokens in languages +such as Mathematica and R display significant dissimilarity. Our findings +suggest that this phenomenon can potentially result in performance challenges +when dealing with diverse languages. Thus, we recommend using our similarity +measure to select a diverse set of programming languages when training and +evaluating future models. + +
+
+ comment: Accepted to 2023 IEEE 23rd International Working Conference on Source + Code Analysis and Manipulation (SCAM), NIER track +
+
+
+
+
+ + ☆ A Generic Machine Learning Framework for Fully-Unsupervised Anomaly + Detection with Contaminated Data + + +
+ Anomaly detection (AD) tasks have been solved using machine learning +algorithms in various domains and applications. The great majority of these +algorithms use normal data to train a residual-based model, and assign anomaly +scores to unseen samples based on their dissimilarity with the learned normal +regime. The underlying assumption of these approaches is that anomaly-free data +is available for training. This is, however, often not the case in real-world +operational settings, where the training data may be contaminated with a +certain fraction of abnormal samples. Training with contaminated data, in turn, +inevitably leads to a deteriorated AD performance of the residual-based +algorithms. + In this paper we introduce a framework for a fully unsupervised refinement of +contaminated training data for AD tasks. The framework is generic and can be +applied to any residual-based machine learning model. We demonstrate the +application of the framework to two public datasets of multivariate time series +machine data from different application fields. We show its clear superiority +over the naive approach of training with contaminated data without refinement. +Moreover, we compare it to the ideal, unrealistic reference in which +anomaly-free data would be available for training. Since the approach exploits +information from the anomalies, and not only from the normal regime, it is +comparable and often outperforms the ideal baseline as well. + +
+
+
+
+
+ + ☆ Compressor-Based Classification for Atrial Fibrillation Detection + + +
+ Atrial fibrillation (AF) is one of the most common arrhythmias with +challenging public health implications. Automatic detection of AF episodes is +therefore one of the most important tasks in biomedical engineering. In this +paper, we apply the recently introduced method of compressor-based text +classification to the task of AF detection (binary classification between heart +rhythms). We investigate the normalised compression distance applied to +$\Delta$RR and RR-interval sequences, the configuration of the k-Nearest +Neighbour classifier, and an optimal window length. We achieve good +classification results (avg. sensitivity = 97.1%, avg. specificity = 91.7%, +best sensitivity of 99.8%, best specificity of 97.6% with 5-fold +cross-validation). Obtained performance is close to the best specialised AF +detection algorithms. Our results suggest that gzip classification, originally +proposed for texts, is suitable for biomedical data and continuous stochastic +sequences in general. + +
+
+ comment: This paper is sent for review at the IEEE conference, 2023 +
+
+
+
+
+ + ☆ Fine-tuning can cripple your foundation model; preserving features may + be the solution + + +
+ Pre-trained foundation models, owing primarily to their enormous capacity and +exposure to vast amount of training data scraped from the internet, enjoy the +advantage of storing knowledge about plenty of real-world concepts. Such models +are typically fine-tuned on downstream datasets to produce remarkable +state-of-the-art performances. While various fine-tuning methods have been +devised and are shown to be highly effective, we observe that a fine-tuned +model's ability to recognize concepts on tasks $\textit{different}$ from the +downstream one is reduced significantly compared to its pre-trained +counterpart. This is clearly undesirable as a huge amount of time and money +went into learning those very concepts in the first place. We call this +undesirable phenomenon "concept forgetting" and via experiments show that most +end-to-end fine-tuning approaches suffer heavily from this side effect. To this +end, we also propose a rather simple fix to this problem by designing a method +called LDIFS (short for $\ell_2$ distance in feature space) that simply +preserves the features of the original foundation model during fine-tuning. We +show that LDIFS significantly reduces concept forgetting without having +noticeable impact on the downstream task performance. + +
+
+
+
+
+ + ☆ Transforming the Output of Generative Pre-trained Transformer: The + Influence of the PGI Framework on Attention Dynamics + + +
+ This paper presents a novel approach named Persona-Grouping-Intelligence +(PGI), which has been crafted to tackle the challenges posed by GPT models when +applied to real-world business issues. PGI leverages the inherent capabilities +of the GPT model to comprehend intricate language structures and generate +responses that are contextually relevant. The experiment occurred in a business +scenario where human intelligence was being underutilized due to less optimized +business processes. The primary objective of this approach is to leverage GPT +models to reduce the workload on humans in tasks that are extensive, +monotonous, and repetitive. Instead, the focus is redirected toward +decision-making activities. Remarkably, the experiment yielded an accuracy rate +of 93.81% in validating 4,000 responses generated by the model, underscoring +the effectiveness of the PGI strategies. Effectively addressing the issue of +underutilized human intelligence, this paradigm shift aligns business +environments with dynamic machine intelligence, enabling them to navigate the +intricacies of real-world challenges. This approach facilitates the practical +utilization of these models to tackle actual problems. The methodology offers +an opportunity to reshape the fundamental structure of business processes by +seamlessly integrating human decision-making with adaptable machine +intelligence. Consequently, this optimization enhances operational efficiency +and elevates strategic decision-making across diverse business contexts. + +
+
+
+
+
+ + ☆ Bang and the Artefacts are Gone! Rapid Artefact Removal and Tissue + Segmentation in Haematoxylin and Eosin Stained Biopsies + + +
+ We present H&E Otsu thresholding, a scheme for rapidly detecting tissue in +whole-slide images (WSIs) that eliminates a wide range of undesirable artefacts +such as pen marks and scanning artefacts. Our method involves obtaining a +bid-modal representation of a low-magnification RGB overview image which +enables simple Otsu thresholding to separate tissue from background and +artefacts. We demonstrate our method on WSIs prepared from a wide range of +institutions and WSI digital scanners, each containing substantial artefacts +that cause other methods to fail. The beauty of our approach lies in its +simplicity: manipulating RGB colour space and using Otsu thresholding allows +for the rapid removal of artefacts and segmentation of tissue. + +
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ Learning Compact Neural Networks with Deep Overparameterised Multitask + Learning IJCAI2023 + + +
+ Compact neural network offers many benefits for real-world applications. +However, it is usually challenging to train the compact neural networks with +small parameter sizes and low computational costs to achieve the same or better +model performance compared to more complex and powerful architecture. This is +particularly true for multitask learning, with different tasks competing for +resources. We present a simple, efficient and effective multitask learning +overparameterisation neural network design by overparameterising the model +architecture in training and sharing the overparameterised model parameters +more effectively across tasks, for better optimisation and generalisation. +Experiments on two challenging multitask datasets (NYUv2 and COCO) demonstrate +the effectiveness of the proposed method across various convolutional networks +and parameter sizes. + +
+
+ comment: Accepted for IJCAI2023 workshop, 1st International Workshop on + Generalizing from Limited Resources in the Open World +
+
+
+
+
+ + ☆ Federated Linear Bandit Learning via Over-the-Air Computation + + +
+ In this paper, we investigate federated contextual linear bandit learning +within a wireless system that comprises a server and multiple devices. Each +device interacts with the environment, selects an action based on the received +reward, and sends model updates to the server. The primary objective is to +minimize cumulative regret across all devices within a finite time horizon. To +reduce the communication overhead, devices communicate with the server via +over-the-air computation (AirComp) over noisy fading channels, where the +channel noise may distort the signals. In this context, we propose a customized +federated linear bandits scheme, where each device transmits an analog signal, +and the server receives a superposition of these signals distorted by channel +noise. A rigorous mathematical analysis is conducted to determine the regret +bound of the proposed scheme. Both theoretical analysis and numerical +experiments demonstrate the competitive performance of our proposed scheme in +terms of regret bounds in various settings. + +
+
+
+
+
+ + ☆ Training normalizing flows with computationally intensive target + probability distributions + + +
+ Machine learning techniques, in particular the so-called normalizing flows, +are becoming increasingly popular in the context of Monte Carlo simulations as +they can effectively approximate target probability distributions. In the case +of lattice field theories (LFT) the target distribution is given by the +exponential of the action. The common loss function's gradient estimator based +on the "reparametrization trick" requires the calculation of the derivative of +the action with respect to the fields. This can present a significant +computational cost for complicated, non-local actions like e.g. fermionic +action in QCD. In this contribution, we propose an estimator for normalizing +flows based on the REINFORCE algorithm that avoids this issue. We apply it to +two dimensional Schwinger model with Wilson fermions at criticality and show +that it is up to ten times faster in terms of the wall-clock time as well as +requiring up to $30\%$ less memory than the reparameterization trick estimator. +It is also more numerically stable allowing for single precision calculations +and the use of half-float tensor cores. We present an in-depth analysis of the +origins of those improvements. We believe that these benefits will appear also +outside the realm of the LFT, in each case where the target probability +distribution is computationally intensive. + +
+
+ comment: 15 pages, 5 figures, 4 tables, 3 listings +
+
+
+
+
+ + ☆ A Bayesian Active Learning Approach to Comparative Judgement + + +
+ Assessment is a crucial part of education. Traditional marking is a source of +inconsistencies and unconscious bias, placing a high cognitive load on the +assessors. An approach to address these issues is comparative judgement (CJ). +In CJ, the assessor is presented with a pair of items and is asked to select +the better one. Following a series of comparisons, a rank is derived using a +ranking model, for example, the BTM, based on the results. While CJ is +considered a reliable method for marking, there are concerns around +transparency, and the ideal number of pairwise comparisons to generate a +reliable estimation of the rank order is not known. Additionally, there have +been attempts to generate a method of selecting pairs that should be compared +next in an informative manner, but some existing methods are known to have +created their own bias within results inflating the reliability metric used. As +a result, a random selection approach is usually deployed. + We propose a novel Bayesian approach to CJ (BCJ) for determining the ranks of +compared items alongside a new way to select the pairs to present to the +marker(s) using active learning (AL), addressing the key shortcomings of +traditional CJ. Furthermore, we demonstrate how the entire approach may provide +transparency by providing the user insights into how it is making its decisions +and, at the same time, being more efficient. Results from our experiments +confirm that the proposed BCJ combined with entropy-driven AL pair-selection +method is superior to other alternatives. We also find that the more +comparisons done, the more accurate BCJ becomes, which solves the issue the +current method has of the model deteriorating if too many comparisons are +performed. As our approach can generate the complete predicted rank +distribution for an item, we also show how this can be utilised in devising a +predicted grade, guided by the assessor. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ JAX-LOB: A GPU-Accelerated limit order book simulator to unlock large + scale reinforcement learning for trading + + +
+ Financial exchanges across the world use limit order books (LOBs) to process +orders and match trades. For research purposes it is important to have large +scale efficient simulators of LOB dynamics. LOB simulators have previously been +implemented in the context of agent-based models (ABMs), reinforcement learning +(RL) environments, and generative models, processing order flows from +historical data sets and hand-crafted agents alike. For many applications, +there is a requirement for processing multiple books, either for the +calibration of ABMs or for the training of RL agents. We showcase the first +GPU-enabled LOB simulator designed to process thousands of books in parallel, +with a notably reduced per-message processing time. The implementation of our +simulator - JAX-LOB - is based on design choices that aim to best exploit the +powers of JAX without compromising on the realism of LOB-related mechanisms. We +integrate JAX-LOB with other JAX packages, to provide an example of how one may +address an optimal execution problem with reinforcement learning, and to share +some preliminary results from end-to-end RL training on GPUs. + +
+
+
+
+
+ + ☆ AtmoRep: A stochastic model of atmosphere dynamics using large scale + representation learning + + +
+ The atmosphere affects humans in a multitude of ways, from loss of life due +to adverse weather effects to long-term social and economic impacts on +societies. Computer simulations of atmospheric dynamics are, therefore, of +great importance for the well-being of our and future generations. Here, we +propose AtmoRep, a novel, task-independent stochastic computer model of +atmospheric dynamics that can provide skillful results for a wide range of +applications. AtmoRep uses large-scale representation learning from artificial +intelligence to determine a general description of the highly complex, +stochastic dynamics of the atmosphere from the best available estimate of the +system's historical trajectory as constrained by observations. This is enabled +by a novel self-supervised learning objective and a unique ensemble that +samples from the stochastic model with a variability informed by the one in the +historical record. The task-independent nature of AtmoRep enables skillful +results for a diverse set of applications without specifically training for +them and we demonstrate this for nowcasting, temporal interpolation, model +correction, and counterfactuals. We also show that AtmoRep can be improved with +additional data, for example radar observations, and that it can be extended to +tasks such as downscaling. Our work establishes that large-scale neural +networks can provide skillful, task-independent models of atmospheric dynamics. +With this, they provide a novel means to make the large record of atmospheric +observations accessible for applications and for scientific inquiry, +complementing existing simulations based on first principles. + +
+
+
+
+
+ + ☆ Hyperbolic Random Forests + + +
+ Hyperbolic space is becoming a popular choice for representing data due to +the hierarchical structure - whether implicit or explicit - of many real-world +datasets. Along with it comes a need for algorithms capable of solving +fundamental tasks, such as classification, in hyperbolic space. Recently, +multiple papers have investigated hyperbolic alternatives to hyperplane-based +classifiers, such as logistic regression and SVMs. While effective, these +approaches struggle with more complex hierarchical data. We, therefore, propose +to generalize the well-known random forests to hyperbolic space. We do this by +redefining the notion of a split using horospheres. Since finding the globally +optimal split is computationally intractable, we find candidate horospheres +through a large-margin classifier. To make hyperbolic random forests work on +multi-class data and imbalanced experiments, we furthermore outline a new +method for combining classes based on their lowest common ancestor and a +class-balanced version of the large-margin loss. Experiments on standard and +new benchmarks show that our approach outperforms both conventional random +forest algorithms and recent hyperbolic classifiers. + +
+
+ comment: Code available at https://github.com/LarsDoorenbos/HoroRF +
+
+
+
+
+ + ☆ Integrating LLMs and Decision Transformers for Language Grounded + Generative Quality-Diversity + + +
+ Quality-Diversity is a branch of stochastic optimization that is often +applied to problems from the Reinforcement Learning and control domains in +order to construct repertoires of well-performing policies/skills that exhibit +diversity with respect to a behavior space. Such archives are usually composed +of a finite number of reactive agents which are each associated to a unique +behavior descriptor, and instantiating behavior descriptors outside of that +coarsely discretized space is not straight-forward. While a few recent works +suggest solutions to that issue, the trajectory that is generated is not easily +customizable beyond the specification of a target behavior descriptor. We +propose to jointly solve those problems in environments where semantic +information about static scene elements is available by leveraging a Large +Language Model to augment the repertoire with natural language descriptions of +trajectories, and training a policy conditioned on those descriptions. Thus, +our method allows a user to not only specify an arbitrary target behavior +descriptor, but also provide the model with a high-level textual prompt to +shape the generated trajectory. We also propose an LLM-based approach to +evaluating the performance of such generative agents. Furthermore, we develop a +benchmark based on simulated robot navigation in a 2d maze that we use for +experimental validation. + +
+
+ comment: 16 pages, 9 figures, 2 tables +
+
+
+
+
+ + ☆ Heterogeneous Decentralized Machine Unlearning with Seed Model + Distillation + + +
+ As some recent information security legislation endowed users with +unconditional rights to be forgotten by any trained machine learning model, +personalized IoT service providers have to put unlearning functionality into +their consideration. The most straightforward method to unlearn users' +contribution is to retrain the model from the initial state, which is not +realistic in high throughput applications with frequent unlearning requests. +Though some machine unlearning frameworks have been proposed to speed up the +retraining process, they fail to match decentralized learning scenarios. In +this paper, we design a decentralized unlearning framework called HDUS, which +uses distilled seed models to construct erasable ensembles for all clients. +Moreover, the framework is compatible with heterogeneous on-device models, +representing stronger scalability in real-world applications. Extensive +experiments on three real-world datasets show that our HDUS achieves +state-of-the-art performance. + +
+
+
+
+
+ + ☆ Heterogeneous Federated Learning via Personalized Generative Networks + + +
+ Federated Learning (FL) allows several clients to construct a common global +machine-learning model without having to share their data. FL, however, faces +the challenge of statistical heterogeneity between the client's data, which +degrades performance and slows down the convergence toward the global model. In +this paper, we provide theoretical proof that minimizing heterogeneity between +clients facilitates the convergence of a global model for every single client. +This becomes particularly important under empirical concept shifts among +clients, rather than merely considering imbalanced classes, which have been +studied until now. Therefore, we propose a method for knowledge transfer +between clients where the server trains client-specific generators. Each +generator generates samples for the corresponding client to remove the conflict +with other clients' models. Experiments conducted on synthetic and real data, +along with a theoretical study, support the effectiveness of our method in +constructing a well-generalizable global model by reducing the conflict between +local models. + +
+
+
+
+
+ + ☆ Kissing to Find a Match: Efficient Low-Rank Permutation Representation + + +
+ Permutation matrices play a key role in matching and assignment problems +across the fields, especially in computer vision and robotics. However, memory +for explicitly representing permutation matrices grows quadratically with the +size of the problem, prohibiting large problem instances. In this work, we +propose to tackle the curse of dimensionality of large permutation matrices by +approximating them using low-rank matrix factorization, followed by a +nonlinearity. To this end, we rely on the Kissing number theory to infer the +minimal rank required for representing a permutation matrix of a given size, +which is significantly smaller than the problem size. This leads to a drastic +reduction in computation and memory costs, e.g., up to $3$ orders of magnitude +less memory for a problem of size $n=20000$, represented using $8.4\times10^5$ +elements in two small matrices instead of using a single huge matrix with +$4\times 10^8$ elements. The proposed representation allows for accurate +representations of large permutation matrices, which in turn enables handling +large problems that would have been infeasible otherwise. We demonstrate the +applicability and merits of the proposed approach through a series of +experiments on a range of problems that involve predicting permutation +matrices, from linear and quadratic assignment to shape matching problems. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ Model-free Reinforcement Learning with Stochastic Reward Stabilization + for Recommender Systems SIGIR '23 + + +
+ Model-free RL-based recommender systems have recently received increasing +research attention due to their capability to handle partial feedback and +long-term rewards. However, most existing research has ignored a critical +feature in recommender systems: one user's feedback on the same item at +different times is random. The stochastic rewards property essentially differs +from that in classic RL scenarios with deterministic rewards, which makes +RL-based recommender systems much more challenging. In this paper, we first +demonstrate in a simulator environment where using direct stochastic feedback +results in a significant drop in performance. Then to handle the stochastic +feedback more efficiently, we design two stochastic reward stabilization +frameworks that replace the direct stochastic feedback with that learned by a +supervised model. Both frameworks are model-agnostic, i.e., they can +effectively utilize various supervised models. We demonstrate the superiority +of the proposed frameworks over different RL-based recommendation baselines +with extensive experiments on a recommendation simulator as well as an +industrial-level recommender system. + +
+
+ comment: SIGIR '23 +
+
+
+
+
+ + ☆ Optimizing Group-Fair Plackett-Luce Ranking Models for Relevance and + Ex-Post Fairness + + +
+ In learning-to-rank (LTR), optimizing only the relevance (or the expected +ranking utility) can cause representational harm to certain categories of +items. Moreover, if there is implicit bias in the relevance scores, LTR models +may fail to optimize for true relevance. Previous works have proposed efficient +algorithms to train stochastic ranking models that achieve fairness of exposure +to the groups ex-ante (or, in expectation), which may not guarantee +representation fairness to the groups ex-post, that is, after realizing a +ranking from the stochastic ranking model. Typically, ex-post fairness is +achieved by post-processing, but previous work does not train stochastic +ranking models that are aware of this post-processing. + In this paper, we propose a novel objective that maximizes expected relevance +only over those rankings that satisfy given representation constraints to +ensure ex-post fairness. Building upon recent work on an efficient sampler for +ex-post group-fair rankings, we propose a group-fair Plackett-Luce model and +show that it can be efficiently optimized for our objective in the LTR +framework. + Experiments on three real-world datasets show that our group-fair algorithm +guarantees fairness alongside usually having better relevance compared to the +LTR baselines. In addition, our algorithm also achieves better relevance than +post-processing baselines, which also ensures ex-post fairness. Further, when +implicit bias is injected into the training data, our algorithm typically +outperforms existing LTR baselines in relevance. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Bayesian Reasoning for Physics Informed Neural Networks + + +
+ Physics informed neural network (PINN) approach in Bayesian formulation is +presented. We adopt the Bayesian neural network framework formulated by MacKay +(Neural Computation 4 (3) (1992) 448). The posterior densities are obtained +from Laplace approximation. For each model (fit), the so-called evidence is +computed. It is a measure that classifies the hypothesis. The most optimal +solution has the maximal value of the evidence. The Bayesian framework allows +us to control the impact of the boundary contribution to the total loss. +Indeed, the relative weights of loss components are fine-tuned by the Bayesian +algorithm. We solve heat, wave, and Burger's equations. The obtained results +are in good agreement with the exact solutions. All solutions are provided with +the uncertainties computed within the Bayesian framework. + +
+
+ comment: 19 pages, 11 figures +
+
+
+
+
+ + ☆ GEMTrans: A General, Echocardiography-based, Multi-Level Transformer + Framework for Cardiovascular Diagnosis + + +
+ Echocardiography (echo) is an ultrasound imaging modality that is widely used +for various cardiovascular diagnosis tasks. Due to inter-observer variability +in echo-based diagnosis, which arises from the variability in echo image +acquisition and the interpretation of echo images based on clinical experience, +vision-based machine learning (ML) methods have gained popularity to act as +secondary layers of verification. For such safety-critical applications, it is +essential for any proposed ML method to present a level of explainability along +with good accuracy. In addition, such methods must be able to process several +echo videos obtained from various heart views and the interactions among them +to properly produce predictions for a variety of cardiovascular measurements or +interpretation tasks. Prior work lacks explainability or is limited in scope by +focusing on a single cardiovascular task. To remedy this, we propose a General, +Echo-based, Multi-Level Transformer (GEMTrans) framework that provides +explainability, while simultaneously enabling multi-video training where the +inter-play among echo image patches in the same frame, all frames in the same +video, and inter-video relationships are captured based on a downstream task. +We show the flexibility of our framework by considering two critical tasks +including ejection fraction (EF) and aortic stenosis (AS) severity detection. +Our model achieves mean absolute errors of 4.15 and 4.84 for single and +dual-video EF estimation and an accuracy of 96.5 % for AS detection, while +providing informative task-specific attention maps and prototypical +explainability. + +
+
+ comment: To be published in MLMI 2023 +
+
+
+
+
+ + ☆ Physics-Inspired Neural Graph ODE for Long-term Dynamical Simulation + + +
+ Simulating and modeling the long-term dynamics of multi-object physical +systems is an essential and challenging task. Current studies model the +physical systems utilizing Graph Neural Networks (GNNs) with equivariant +properties. Specifically, they model the dynamics as a sequence of discrete +states with a fixed time interval and learn a direct mapping for all the two +adjacent states. However, this direct mapping overlooks the continuous nature +between the two states. Namely, we have verified that there are countless +possible trajectories between two discrete dynamic states in current GNN-based +direct mapping models. This issue greatly hinders the model generalization +ability, leading to poor performance of the long-term simulation. In this +paper, to better model the latent trajectory through discrete supervision +signals, we propose a Physics-Inspired Neural Graph ODE (PINGO) algorithm. In +PINGO, to ensure the uniqueness of the trajectory, we construct a +Physics-Inspired Neural ODE framework to update the latent trajectory. +Meanwhile, to effectively capture intricate interactions among objects, we use +a GNN-based model to parameterize Neural ODE in a plug-and-play manner. +Furthermore, we prove that the discrepancy between the learned trajectory of +PIGNO and the true trajectory can be theoretically bounded. Extensive +experiments verify our theoretical findings and demonstrate that our model +yields an order-of-magnitude improvement over the state-of-the-art baselines, +especially on long-term predictions and roll-out errors. + +
+
+
+
+
+ + ☆ Physics-inspired Equivariant Descriptors of Non-bonded Interactions + + +
+ Most of the existing machine-learning schemes applied to atomic-scale +simulations rely on a local description of the geometry of a structure, and +struggle to model effects that are driven by long-range physical interactions. +Efforts to overcome these limitations have focused on the direct incorporation +of electrostatics, which is the most prominent effect, often relying on +architectures that mirror the functional form of explicit physical models. +Including other forms of non-bonded interactions, or predicting properties +other than the interatomic potential, requires ad hoc modifications. We propose +an alternative approach that extends the long-distance equivariant (LODE) +framework to generate local descriptors of an atomic environment that resemble +non-bonded potentials with arbitrary asymptotic behaviors, ranging from +point-charge electrostatics to dispersion forces. We show that the LODE +formalism is amenable to a direct physical interpretation in terms of a +generalized multipole expansion, that simplifies its implementation and reduces +the number of descriptors needed to capture a given asymptotic behavior. These +generalized LODE features provide improved extrapolation capabilities when +trained on structures dominated by a given asymptotic behavior, but do not help +in capturing the wildly different energy scales that are relevant for a more +heterogeneous data set. This approach provides a practical scheme to +incorporate different types of non-bonded interactions, and a framework to +investigate the interplay of physical and data-related considerations that +underlie this challenging modeling problem. + +
+
+
+
+
+ + ☆ Structural Cycle GAN for Virtual Immunohistochemistry Staining of Gland + Markers in the Colon MICCAI + + +
+ With the advent of digital scanners and deep learning, diagnostic operations +may move from a microscope to a desktop. Hematoxylin and Eosin (H&E) staining +is one of the most frequently used stains for disease analysis, diagnosis, and +grading, but pathologists do need different immunohistochemical (IHC) stains to +analyze specific structures or cells. Obtaining all of these stains (H&E and +different IHCs) on a single specimen is a tedious and time-consuming task. +Consequently, virtual staining has emerged as an essential research direction. +Here, we propose a novel generative model, Structural Cycle-GAN (SC-GAN), for +synthesizing IHC stains from H&E images, and vice versa. Our method expressly +incorporates structural information in the form of edges (in addition to color +data) and employs attention modules exclusively in the decoder of the proposed +generator model. This integration enhances feature localization and preserves +contextual information during the generation process. In addition, a structural +loss is incorporated to ensure accurate structure alignment between the +generated and input markers. To demonstrate the efficacy of the proposed model, +experiments are conducted with two IHC markers emphasizing distinct structures +of glands in the colon: the nucleus of epithelial cells (CDX2) and the +cytoplasm (CK818). Quantitative metrics such as FID and SSIM are frequently +used for the analysis of generative models, but they do not correlate +explicitly with higher-quality virtual staining results. Therefore, we propose +two new quantitative metrics that correlate directly with the virtual staining +specificity of IHC markers. + +
+
+ comment: Accepted to MICCAI Workshop 2023 +
+
+
+
+
+ + ☆ Using Adamic-Adar Index Algorithm to Predict Volunteer Collaboration: + Less is More + + +
+ Social networks exhibit a complex graph-like structure due to the uncertainty +surrounding potential collaborations among participants. Machine learning +algorithms possess generic outstanding performance in multiple real-world +prediction tasks. However, whether machine learning algorithms outperform +specific algorithms designed for graph link prediction remains unknown to us. +To address this issue, the Adamic-Adar Index (AAI), Jaccard Coefficient (JC) +and common neighbour centrality (CNC) as representatives of graph-specific +algorithms were applied to predict potential collaborations, utilizing data +from volunteer activities during the Covid-19 pandemic in Shenzhen city, along +with the classical machine learning algorithms such as random forest, support +vector machine, and gradient boosting as single predictors and components of +ensemble learning. This paper introduces that the AAI algorithm outperformed +the traditional JC and CNC, and other machine learning algorithms in analyzing +graph node attributes for this task. + +
+
+
+
+
+ + ☆ IOMatch: Simplifying Open-Set Semi-Supervised Learning with Joint + Inliers and Outliers Utilization ICCV 2023 + + +
+ Semi-supervised learning (SSL) aims to leverage massive unlabeled data when +labels are expensive to obtain. Unfortunately, in many real-world applications, +the collected unlabeled data will inevitably contain unseen-class outliers not +belonging to any of the labeled classes. To deal with the challenging open-set +SSL task, the mainstream methods tend to first detect outliers and then filter +them out. However, we observe a surprising fact that such approach could result +in more severe performance degradation when labels are extremely scarce, as the +unreliable outlier detector may wrongly exclude a considerable portion of +valuable inliers. To tackle with this issue, we introduce a novel open-set SSL +framework, IOMatch, which can jointly utilize inliers and outliers, even when +it is difficult to distinguish exactly between them. Specifically, we propose +to employ a multi-binary classifier in combination with the standard closed-set +classifier for producing unified open-set classification targets, which regard +all outliers as a single new class. By adopting these targets as open-set +pseudo-labels, we optimize an open-set classifier with all unlabeled samples +including both inliers and outliers. Extensive experiments have shown that +IOMatch significantly outperforms the baseline methods across different +benchmark datasets and different settings despite its remarkable simplicity. +Our code and models are available at https://github.com/nukezil/IOMatch. + +
+
+ comment: Accepted by ICCV 2023, selected for an Oral presentation +
+
+
+
+
+ + ☆ DAG-ACFL: Asynchronous Clustered Federated Learning based on DAG-DLT + + +
+ Federated learning (FL) aims to collaboratively train a global model while +ensuring client data privacy. However, FL faces challenges from the non-IID +data distribution among clients. Clustered FL (CFL) has emerged as a promising +solution, but most existing CFL frameworks adopt synchronous frameworks lacking +asynchrony. An asynchronous CFL framework called SDAGFL based on directed +acyclic graph distributed ledger techniques (DAG-DLT) was proposed, but its +complete decentralization leads to high communication and storage costs. We +propose DAG-ACFL, an asynchronous clustered FL framework based on directed +acyclic graph distributed ledger techniques (DAG-DLT). We first detail the +components of DAG-ACFL. A tip selection algorithm based on the cosine +similarity of model parameters is then designed to aggregate models from +clients with similar distributions. An adaptive tip selection algorithm +leveraging change-point detection dynamically determines the number of selected +tips. We evaluate the clustering and training performance of DAG-ACFL on +multiple datasets and analyze its communication and storage costs. Experiments +show the superiority of DAG-ACFL in asynchronous clustered FL. By combining +DAG-DLT with clustered FL, DAG-ACFL realizes robust, decentralized and private +model training with efficient performance. + +
+
+
+
+
+ + ☆ Federated Learning in IoT: a Survey from a Resource-Constrained + Perspective + + +
+ The IoT ecosystem is able to leverage vast amounts of data for intelligent +decision-making. Federated Learning (FL), a decentralized machine learning +technique, is widely used to collect and train machine learning models from a +variety of distributed data sources. Both IoT and FL systems can be +complementary and used together. However, the resource-constrained nature of +IoT devices prevents the widescale deployment FL in the real world. This +research paper presents a comprehensive survey of the challenges and solutions +associated with implementing Federated Learning (FL) in resource-constrained +Internet of Things (IoT) environments, viewed from 2 levels, client and server. +We focus on solutions regarding limited client resources, presence of +heterogeneous client data, server capacity, and high communication costs, and +assess their effectiveness in various scenarios. Furthermore, we categorize the +solutions based on the location of their application, i.e., the IoT client, and +the FL server. In addition to a comprehensive review of existing research and +potential future directions, this paper also presents new evaluation metrics +that would allow researchers to evaluate their solutions on +resource-constrained IoT devices. + +
+
+ comment: Presented and accepted at The IEEE 2023 International Conference on + Artificial Intelligence, Robotics, Signal and Image Processing (AIRoSIP) +
+
+
+
+
+ + ☆ Enhancing Breast Cancer Classification Using Transfer ResNet with + Lightweight Attention Mechanism + + +
+ Deep learning models have revolutionized image classification by learning +complex feature hierarchies in raw pixel data. This paper introduces an image +classification method based on the ResNet model, and introduces a lightweight +attention mechanism framework to improve performance. The framework optimizes +feature representation, enhances classification capabilities, and improves +feature discriminativeness. We verified the effectiveness of the algorithm on +the Breakhis dataset, showing its superior performance in many aspects. Not +only in terms of conventional models, our method also shows advantages on +state-of-the-art methods such as contemporary visual transformers. Significant +improvements have been achieved in metrics such as precision, accuracy, recall, +F1-score, and G-means, while also performing well in terms of convergence time. +These results strengthen the performance of the algorithm and solidify its +application prospects in practical image classification tasks. Keywords: ResNet +model, Lightweight attention mechanism + +
+
+ comment: 6 pages, 4 figures,6 tables +
+
+
+
+
+ + ☆ MatchXML: An Efficient Text-label Matching Framework for Extreme + Multi-label Text Classification + + +
+ The eXtreme Multi-label text Classification(XMC) refers to training a +classifier that assigns a text sample with relevant labels from an extremely +large-scale label set (e.g., millions of labels). We propose MatchXML, an +efficient text-label matching framework for XMC. We observe that the label +embeddings generated from the sparse Term Frequency-Inverse Document +Frequency(TF-IDF) features have several limitations. We thus propose label2vec +to effectively train the semantic dense label embeddings by the Skip-gram +model. The dense label embeddings are then used to build a Hierarchical Label +Tree by clustering. In fine-tuning the pre-trained encoder Transformer, we +formulate the multi-label text classification as a text-label matching problem +in a bipartite graph. We then extract the dense text representations from the +fine-tuned Transformer. Besides the fine-tuned dense text embeddings, we also +extract the static dense sentence embeddings from a pre-trained Sentence +Transformer. Finally, a linear ranker is trained by utilizing the sparse TF-IDF +features, the fine-tuned dense text representations and static dense sentence +features. Experimental results demonstrate that MatchXML achieves +state-of-the-art accuracy on five out of six datasets. As for the speed, +MatchXML outperforms the competing methods on all the six datasets. Our source +code is publicly available at https://github.com/huiyegit/MatchXML. + +
+
+
+
+
+ + ☆ OmniQuant: Omnidirectionally Calibrated Quantization for Large Language + Models + + +
+ Large language models (LLMs) have revolutionized natural language processing +tasks. However, their practical deployment is hindered by their immense memory +and computation requirements. Although recent post-training quantization (PTQ) +methods are effective in reducing memory footprint and improving the +computational efficiency of LLM, they hand-craft quantization parameters, which +leads to low performance and fails to deal with extremely low-bit quantization. +To tackle this issue, we introduce an Omnidirectionally calibrated Quantization +(OmniQuant) technique for LLMs, which achieves good performance in diverse +quantization settings while maintaining the computational efficiency of PTQ by +efficiently optimizing various quantization parameters. OmniQuant comprises two +innovative components including Learnable Weight Clipping (LWC) and Learnable +Equivalent Transformation (LET). LWC modulates the extreme values of weights by +optimizing the clipping threshold. Meanwhile, LET tackles activation outliers +by shifting the challenge of quantization from activations to weights through a +learnable equivalent transformation. Operating within a differentiable +framework using block-wise error minimization, OmniQuant can optimize the +quantization process efficiently for both weight-only and weight-activation +quantization. For instance, the LLaMA-2 model family with the size of 7-70B can +be processed with OmniQuant on a single A100-40G GPU within 1-16 hours using +128 samples. Extensive experiments validate OmniQuant's superior performance +across diverse quantization configurations such as W4A4, W6A6, W4A16, W3A16, +and W2A16. Additionally, OmniQuant demonstrates effectiveness in +instruction-tuned models and delivers notable improvements in inference speed +and memory reduction on real devices. Codes and models are available at +\url{https://github.com/OpenGVLab/OmniQuant}. + +
+
+ comment: A differentiable quantization method for LLM +
+
+
+
+
+ + ☆ Nonparametric Additive Value Functions: Interpretable Reinforcement + Learning with an Application to Surgical Recovery + + +
+ We propose a nonparametric additive model for estimating interpretable value +functions in reinforcement learning. Learning effective adaptive clinical +interventions that rely on digital phenotyping features is a major for concern +medical practitioners. With respect to spine surgery, different post-operative +recovery recommendations concerning patient mobilization can lead to +significant variation in patient recovery. While reinforcement learning has +achieved widespread success in domains such as games, recent methods heavily +rely on black-box methods, such neural networks. Unfortunately, these methods +hinder the ability of examining the contribution each feature makes in +producing the final suggested decision. While such interpretations are easily +provided in classical algorithms such as Least Squares Policy Iteration, basic +linearity assumptions prevent learning higher-order flexible interactions +between features. In this paper, we present a novel method that offers a +flexible technique for estimating action-value functions without making +explicit parametric assumptions regarding their additive functional form. This +nonparametric estimation strategy relies on incorporating local kernel +regression and basis expansion to obtain a sparse, additive representation of +the action-value function. Under this approach, we are able to locally +approximate the action-value function and retrieve the nonlinear, independent +contribution of select features as well as joint feature pairs. We validate the +proposed approach with a simulation study, and, in an application to spine +disease, uncover recovery recommendations that are inline with related clinical +knowledge. + +
+
+ comment: 28 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ A System-Level View on Out-of-Distribution Data in Robotics + + +
+ When testing conditions differ from those represented in training data, +so-called out-of-distribution (OOD) inputs can mar the reliability of learned +components in the modern robot autonomy stack. Therefore, coping with OOD data +is an important challenge on the path towards trustworthy learning-enabled +open-world autonomy. In this paper, we aim to demystify the topic of OOD data +and its associated challenges in the context of data-driven robotic systems, +drawing connections to emerging paradigms in the ML community that study the +effect of OOD data on learned models in isolation. We argue that as +roboticists, we should reason about the overall \textit{system-level} +competence of a robot as it operates in OOD conditions. We highlight key +research questions around this system-level view of OOD problems to guide +future research toward safe and reliable learning-enabled autonomy. + +
+
+
+
+
+ + ♻ ☆ Uncertainty Estimation using the Local Lipschitz for Deep Learning Image + Reconstruction Models + + +
+ The use of supervised deep neural network approaches has been investigated to +solve inverse problems in all domains, especially radiology where imaging +technologies are at the heart of diagnostics. However, in deployment, these +models are exposed to input distributions that are widely shifted from training +data, due in part to data biases or drifts. It becomes crucial to know whether +a given input lies outside the training data distribution before relying on the +reconstruction for diagnosis. The goal of this work is three-fold: (i) +demonstrate use of the local Lipshitz value as an uncertainty estimation +threshold for determining suitable performance, (ii) provide method for +identifying out-of-distribution (OOD) images where the model may not have +generalized, and (iii) use the local Lipschitz values to guide proper data +augmentation through identifying false positives and decrease epistemic +uncertainty. We provide results for both MRI reconstruction and CT sparse view +to full view reconstruction using AUTOMAP and UNET architectures due to it +being pertinent in the medical domain that reconstructed images remain +diagnostically accurate. + +
+
+
+
+
+ + ♻ ☆ On Model Identification and Out-of-Sample Prediction of Principal + Component Regression: Applications to Synthetic Controls + + +
+ We analyze principal component regression (PCR) in a high-dimensional +error-in-variables setting with fixed design. Under suitable conditions, we +show that PCR consistently identifies the unique model with minimum +$\ell_2$-norm. These results enable us to establish non-asymptotic +out-of-sample prediction guarantees that improve upon the best known rates. In +the course of our analysis, we introduce a natural linear algebraic condition +between the in- and out-of-sample covariates, which allows us to avoid +distributional assumptions for out-of-sample predictions. Our simulations +illustrate the importance of this condition for generalization, even under +covariate shifts. Accordingly, we construct a hypothesis test to check when +this conditions holds in practice. As a byproduct, our results also lead to +novel results for the synthetic controls literature, a leading approach for +policy evaluation. To the best of our knowledge, our prediction guarantees for +the fixed design setting have been elusive in both the high-dimensional +error-in-variables and synthetic controls literatures. + +
+
+
+
+
+ + ♻ ☆ Kernel Density Matrices for Probabilistic Deep Learning + + +
+ This paper introduces a novel approach to probabilistic deep learning, kernel +density matrices, which provide a simpler yet effective mechanism for +representing joint probability distributions of both continuous and discrete +random variables. In quantum mechanics, a density matrix is the most general +way to describe the state of a quantum system. This work extends the concept of +density matrices by allowing them to be defined in a reproducing kernel Hilbert +space. This abstraction allows the construction of differentiable models for +density estimation, inference, and sampling, and enables their integration into +end-to-end deep neural models. In doing so, we provide a versatile +representation of marginal and joint probability distributions that allows us +to develop a differentiable, compositional, and reversible inference procedure +that covers a wide range of machine learning tasks, including density +estimation, discriminative learning, and generative modeling. The broad +applicability of the framework is illustrated by two examples: an image +classification model that can be naturally transformed into a conditional +generative model, and a model for learning with label proportions that +demonstrates the framework's ability to deal with uncertainty in the training +samples. + +
+
+
+
+
+ + ♻ ☆ Federated Object Detection for Quality Inspection in Shared Production + + +
+ Federated learning (FL) has emerged as a promising approach for training +machine learning models on decentralized data without compromising data +privacy. In this paper, we propose a FL algorithm for object detection in +quality inspection tasks using YOLOv5 as the object detection algorithm and +Federated Averaging (FedAvg) as the FL algorithm. We apply this approach to a +manufacturing use-case where multiple factories/clients contribute data for +training a global object detection model while preserving data privacy on a +non-IID dataset. Our experiments demonstrate that our FL approach achieves +better generalization performance on the overall clients' test dataset and +generates improved bounding boxes around the objects compared to models trained +using local clients' datasets. This work showcases the potential of FL for +quality inspection tasks in the manufacturing industry and provides valuable +insights into the performance and feasibility of utilizing YOLOv5 and FedAvg +for federated object detection. + +
+
+ comment: Will submit it to an IEEE conference +
+
+
+
+
+ + ♻ ☆ Federated Ensemble YOLOv5 -- A Better Generalized Object Detection + Algorithm + + +
+ Federated learning (FL) has gained significant traction as a +privacy-preserving algorithm, but the underlying resemblances of federated +learning algorithms like Federated averaging (FedAvg) or Federated SGD (Fed +SGD) to ensemble learning algorithms have not been fully explored. The purpose +of this paper is to examine the application of FL to object detection as a +method to enhance generalizability, and to compare its performance against a +centralized training approach for an object detection algorithm. Specifically, +we investigate the performance of a YOLOv5 model trained using FL across +multiple clients and employ a random sampling strategy without replacement, so +each client holds a portion of the same dataset used for centralized training. +Our experimental results showcase the superior efficiency of the FL object +detector's global model in generating accurate bounding boxes for unseen +objects, with the test set being a mixture of objects from two distinct clients +not represented in the training dataset. These findings suggest that FL can be +viewed from an ensemble algorithm perspective, akin to a synergistic blend of +Bagging and Boosting techniques. As a result, FL can be seen not only as a +method to enhance privacy, but also as a method to enhance the performance of a +machine learning model. + +
+
+ comment: 8 pages and submitted to FLTA2023 symposium under IEEE +
+
+
+
+
+ + ♻ ☆ Cross-domain Transfer Learning and State Inference for Soft Robots via a + Semi-supervised Sequential Variational Bayes Framework ICRA + + +
+ Recently, data-driven models such as deep neural networks have shown to be +promising tools for modelling and state inference in soft robots. However, +voluminous amounts of data are necessary for deep models to perform +effectively, which requires exhaustive and quality data collection, +particularly of state labels. Consequently, obtaining labelled state data for +soft robotic systems is challenged for various reasons, including difficulty in +the sensorization of soft robots and the inconvenience of collecting data in +unstructured environments. To address this challenge, in this paper, we propose +a semi-supervised sequential variational Bayes (DSVB) framework for transfer +learning and state inference in soft robots with missing state labels on +certain robot configurations. Considering that soft robots may exhibit distinct +dynamics under different robot configurations, a feature space transfer +strategy is also incorporated to promote the adaptation of latent features +across multiple configurations. Unlike existing transfer learning approaches, +our proposed DSVB employs a recurrent neural network to model the nonlinear +dynamics and temporal coherence in soft robot data. The proposed framework is +validated on multiple setup configurations of a pneumatic-based soft robot +finger. Experimental results on four transfer scenarios demonstrate that DSVB +performs effective transfer learning and accurate state inference amidst +missing state labels. The data and code are available at +https://github.com/shageenderan/DSVB. + +
+
+ comment: Accepted at the International Conference on Robotics and Automation + (ICRA) 2023 +
+
+
+
+
+ + ♻ ☆ Diffusion Language Models Can Perform Many Tasks with Scaling and + Instruction-Finetuning + + +
+ The recent surge of generative AI has been fueled by the generative power of +diffusion probabilistic models and the scalable capabilities of large language +models. Despite their potential, it remains elusive whether diffusion language +models can solve general language tasks comparable to their autoregressive +counterparts. This paper demonstrates that scaling diffusion models w.r.t. +data, sizes, and tasks can effectively make them strong language learners. We +build competent diffusion language models at scale by first acquiring knowledge +from massive data via masked language modeling pretraining thanks to their +intrinsic connections. We then reprogram pretrained masked language models into +diffusion language models via diffusive adaptation, wherein task-specific +finetuning and instruction finetuning are explored to unlock their versatility +in solving general language tasks. Experiments show that scaling diffusion +language models consistently improves performance across downstream language +tasks. We further discover that instruction finetuning can elicit zero-shot and +few-shot in-context learning abilities that help tackle many unseen tasks by +following natural language instructions, and show promise in advanced and +challenging abilities such as reasoning. + +
+
+ comment: added references +
+
+
+
+
+ + ♻ ☆ Actuator Trajectory Planning for UAVs with Overhead Manipulator using + Reinforcement Learning + + +
+ In this paper, we investigate the operation of an aerial manipulator system, +namely an Unmanned Aerial Vehicle (UAV) equipped with a controllable arm with +two degrees of freedom to carry out actuation tasks on the fly. Our solution is +based on employing a Q-learning method to control the trajectory of the tip of +the arm, also called end-effector. More specifically, we develop a motion +planning model based on Time To Collision (TTC), which enables a quadrotor UAV +to navigate around obstacles while ensuring the manipulator's reachability. +Additionally, we utilize a model-based Q-learning model to independently track +and control the desired trajectory of the manipulator's end-effector, given an +arbitrary baseline trajectory for the UAV platform. Such a combination enables +a variety of actuation tasks such as high-altitude welding, structural +monitoring and repair, battery replacement, gutter cleaning, skyscrapper +cleaning, and power line maintenance in hard-to-reach and risky environments +while retaining compatibility with flight control firmware. Our RL-based +control mechanism results in a robust control strategy that can handle +uncertainties in the motion of the UAV, offering promising performance. +Specifically, our method achieves 92% accuracy in terms of average displacement +error (i.e. the mean distance between the target and obtained trajectory +points) using Q-learning with 15,000 episodes + +
+
+
+
+
+ + ♻ ☆ StepMix: A Python Package for Pseudo-Likelihood Estimation of + Generalized Mixture Models with External Variables + + +
+ StepMix is an open-source Python package for the pseudo-likelihood estimation +(one-, two- and three-step approaches) of generalized finite mixture models +(latent profile and latent class analysis) with external variables (covariates +and distal outcomes). In many applications in social sciences, the main +objective is not only to cluster individuals into latent classes, but also to +use these classes to develop more complex statistical models. These models +generally divide into a measurement model that relates the latent classes to +observed indicators, and a structural model that relates covariates and outcome +variables to the latent classes. The measurement and structural models can be +estimated jointly using the so-called one-step approach or sequentially using +stepwise methods, which present significant advantages for practitioners +regarding the interpretability of the estimated latent classes. In addition to +the one-step approach, StepMix implements the most important stepwise +estimation methods from the literature, including the bias-adjusted three-step +methods with BCH and ML corrections and the more recent two-step approach. +These pseudo-likelihood estimators are presented in this paper under a unified +framework as specific expectation-maximization subroutines. To facilitate and +promote their adoption among the data science community, StepMix follows the +object-oriented design of the scikit-learn library and provides an additional R +wrapper. + +
+
+ comment: Sacha Morin and Robin Legault contributed equally +
+
+
+
+
+ + ♻ ☆ Arrhythmia Classifier Based on Ultra-Lightweight Binary Neural Network + + +
+ Reasonably and effectively monitoring arrhythmias through ECG signals has +significant implications for human health. With the development of deep +learning, numerous ECG classification algorithms based on deep learning have +emerged. However, most existing algorithms trade off high accuracy for complex +models, resulting in high storage usage and power consumption. This also +inevitably increases the difficulty of implementation on wearable Artificial +Intelligence-of-Things (AIoT) devices with limited resources. In this study, we +proposed a universally applicable ultra-lightweight binary neural network(BNN) +that is capable of 5-class and 17-class arrhythmia classification based on ECG +signals. Our BNN achieves 96.90% (full precision 97.09%) and 97.50% (full +precision 98.00%) accuracy for 5-class and 17-class classification, +respectively, with state-of-the-art storage usage (3.76 KB and 4.45 KB). +Compared to other binarization works, our approach excels in supporting two +multi-classification modes while achieving the smallest known storage space. +Moreover, our model achieves optimal accuracy in 17-class classification and +boasts an elegantly simple network architecture. The algorithm we use is +optimized specifically for hardware implementation. Our research showcases the +potential of lightweight deep learning models in the healthcare industry, +specifically in wearable medical devices, which hold great promise for +improving patient outcomes and quality of life. Code is available on: +https://github.com/xpww/ECG_BNN_Net + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ SpeechGen: Unlocking the Generative Power of Speech Language Models with + Prompts + + +
+ Large language models (LLMs) have gained considerable attention for +Artificial Intelligence Generated Content (AIGC), particularly with the +emergence of ChatGPT. However, the direct adaptation of continuous speech to +LLMs that process discrete tokens remains an unsolved challenge, hindering the +application of LLMs for speech generation. The advanced speech LMs are in the +corner, as that speech signals encapsulate a wealth of information, including +speaker and emotion, beyond textual data alone. Prompt tuning has demonstrated +notable gains in parameter efficiency and competitive performance on some +speech classification tasks. However, the extent to which prompts can +effectively elicit generation tasks from speech LMs remains an open question. +In this paper, we present pioneering research that explores the application of +prompt tuning to stimulate speech LMs for various generation tasks, within a +unified framework called SpeechGen, with around 10M trainable parameters. The +proposed unified framework holds great promise for efficiency and +effectiveness, particularly with the imminent arrival of advanced speech LMs, +which will significantly enhance the capabilities of the framework. The code +and demos of SpeechGen will be available on the project website: +\url{https://ga642381.github.io/SpeechPrompt/speechgen} + +
+
+ comment: Work in progress. The first three authors contributed equally +
+
+
+
+
+ + ♻ ☆ Symmetry-Preserving Program Representations for Learning Code Semantics + + +
+ Large Language Models (LLMs) have shown promise in automated program +reasoning, a crucial aspect of many security tasks. However, existing LLM +architectures for code are often borrowed from other domains like natural +language processing, raising concerns about their generalization and robustness +to unseen code. A key generalization challenge is to incorporate the knowledge +of code semantics, including control and data flow, into the LLM architectures. + Drawing inspiration from examples of convolution layers exploiting +translation symmetry, we explore how code symmetries can enhance LLM +architectures for program analysis and modeling. We present a rigorous +group-theoretic framework that formally defines code symmetries as +semantics-preserving transformations and provides techniques for precisely +reasoning about symmetry preservation within LLM architectures. Using this +framework, we introduce a novel variant of self-attention that preserves +program symmetries, demonstrating its effectiveness in generalization and +robustness through detailed experimental evaluations across different binary +and source code analysis tasks. Overall, our code symmetry framework offers +rigorous and powerful reasoning techniques that can guide the future +development of specialized LLMs for code and advance LLM-guided program +reasoning tasks. + +
+
+
+
+
+ + ♻ ☆ Overcoming Adversarial Attacks for Human-in-the-Loop Applications ICML 2022 + + +
+ Including human analysis has the potential to positively affect the +robustness of Deep Neural Networks and is relatively unexplored in the +Adversarial Machine Learning literature. Neural network visual explanation maps +have been shown to be prone to adversarial attacks. Further research is needed +in order to select robust visualizations of explanations for the image analyst +to evaluate a given model. These factors greatly impact Human-In-The-Loop +(HITL) evaluation tools due to their reliance on adversarial images, including +explanation maps and measurements of robustness. We believe models of human +visual attention may improve interpretability and robustness of human-machine +imagery analysis systems. Our challenge remains, how can HITL evaluation be +robust in this adversarial landscape? + +
+
+ comment: New Frontiers in Adversarial Machine Learning, ICML 2022 +
+
+
+
+
+ + ♻ ☆ On the lifting and reconstruction of nonlinear systems with multiple + attractors + + +
+ The Koopman operator provides a linear perspective on non-linear dynamics by +focusing on the evolution of observables in an invariant subspace. Observables +of interest are typically linearly reconstructed from the Koopman +eigenfunctions. Despite the broad use of Koopman operators over the past few +years, there exist some misconceptions about the applicability of Koopman +operators to dynamical systems with more than one fixed point. In this work, an +explanation is provided for the mechanism of lifting for the Koopman operator +of nonlinear systems with multiple attractors. Considering the example of the +Duffing oscillator, we show that by exploiting the inherent symmetry between +the basins of attraction, a linear reconstruction with three degrees of freedom +in the Koopman observable space is sufficient to globally linearize the system. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Vectorized Scenario Description and Motion Prediction for Scenario-Based + Testing + + +
+ Automated vehicles (AVs) are tested in diverse scenarios, typically specified +by parameters such as velocities, distances, or curve radii. To describe +scenarios uniformly independent of such parameters, this paper proposes a +vectorized scenario description defined by the road geometry and vehicles' +trajectories. Data of this form are generated for three scenarios, merged, and +used to train the motion prediction model VectorNet, allowing to predict an +AV's trajectory for unseen scenarios. Predicting scenario evaluation metrics, +VectorNet partially achieves lower errors than regression models that +separately process the three scenarios' data. However, for comprehensive +generalization, sufficient variance in the training data must be ensured. Thus, +contrary to existing methods, our proposed method can merge diverse scenarios' +data and exploit spatial and temporal nuances in the vectorized scenario +description. As a result, data from specified test scenarios and real-world +scenarios can be compared and combined for (predictive) analyses and scenario +selection. + +
+
+ comment: 6 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ On marginal feature attributions of tree-based models + + +
+ Due to their power and ease of use, tree-based machine learning models, such +as random forests and gradient-boosted tree ensembles, have become very +popular. To interpret them, local feature attributions based on marginal +expectations, e.g. marginal (interventional) Shapley, Owen or Banzhaf values, +may be employed. Such methods are true to the model and implementation +invariant, i.e. dependent only on the input-output function of the model. We +contrast this with the popular TreeSHAP algorithm by presenting two +(statistically similar) decision trees that compute the exact same function for +which the "path-dependent" TreeSHAP yields different rankings of features, +whereas the marginal Shapley values coincide. Furthermore, we discuss how the +internal structure of tree-based models may be leveraged to help with computing +their marginal feature attributions according to a linear game value. One +important observation is that these are simple (piecewise-constant) functions +with respect to a certain grid partition of the input space determined by the +trained model. Another crucial observation, showcased by experiments with +XGBoost, LightGBM and CatBoost libraries, is that only a portion of all +features appears in a tree from the ensemble. Thus, the complexity of computing +marginal Shapley (or Owen or Banzhaf) feature attributions may be reduced. This +remains valid for a broader class of game values which we shall axiomatically +characterize. A prime example is the case of CatBoost models where the trees +are oblivious (symmetric) and the number of features in each of them is no +larger than the depth. We exploit the symmetry to derive an explicit formula, +with improved complexity and only in terms of the internal model parameters, +for marginal Shapley (and Banzhaf and Owen) values of CatBoost models. This +results in a fast, accurate algorithm for estimating these feature +attributions. + +
+
+ comment: Major revision. Notation is simplified, technical details are moved + to appendix, Algorithm 3.12 is rewritten, the complexity bound in Theorem 3.6 + is improved, {\S}4 on numerical experiments is expanded. Theorem 2.4 (a + classification result for game values) and the results of Appendix F + (generalizations of Theorem 3.6) are new. 29 pages+appendix (63 pages in + total), 9 figures +
+
+
+
+
+ + ♻ ☆ ChatMOF: An Autonomous AI System for Predicting and Generating + Metal-Organic Frameworks + + +
+ ChatMOF is an autonomous Artificial Intelligence (AI) system that is built to +predict and generate metal-organic frameworks (MOFs). By leveraging a +large-scale language model (GPT-4 and GPT-3.5-turbo), ChatMOF extracts key +details from textual inputs and delivers appropriate responses, thus +eliminating the necessity for rigid structured queries. The system is comprised +of three core components (i.e. an agent, a toolkit, and an evaluator) and it +forms a robust pipeline that manages a variety of tasks, including data +retrieval, property prediction, and structure generations. The study further +explores the merits and constraints of using large language models (LLMs) AI +system in material sciences using and showcases its transformative potential +for future advancements. + +
+
+
+
+
+ + ♻ ☆ Pathology Steered Stratification Network for Subtype Identification in + Alzheimer's Disease + + +
+ Alzheimer's disease (AD) is a heterogeneous, multifactorial neurodegenerative +disorder characterized by beta-amyloid, pathologic tau, and neurodegeneration. +There are no effective treatments for Alzheimer's disease at a late stage, +urging for early intervention. However, existing statistical inference +approaches of AD subtype identification ignore the pathological domain +knowledge, which could lead to ill-posed results that are sometimes +inconsistent with the essential neurological principles. Integrating systems +biology modeling with machine learning, we propose a novel pathology steered +stratification network (PSSN) that incorporates established domain knowledge in +AD pathology through a reaction-diffusion model, where we consider non-linear +interactions between major biomarkers and diffusion along brain structural +network. Trained on longitudinal multimodal neuroimaging data, the biological +model predicts long-term trajectories that capture individual progression +pattern, filling in the gaps between sparse imaging data available. A deep +predictive neural network is then built to exploit spatiotemporal dynamics, +link neurological examinations with clinical profiles, and generate subtype +assignment probability on an individual basis. We further identify an +evolutionary disease graph to quantify subtype transition probabilities through +extensive simulations. Our stratification achieves superior performance in both +inter-cluster heterogeneity and intra-cluster homogeneity of various clinical +scores. Applying our approach to enriched samples of aging populations, we +identify six subtypes spanning AD spectrum, where each subtype exhibits a +distinctive biomarker pattern that is consistent with its clinical outcome. +PSSN provides insights into pre-symptomatic diagnosis and practical guidance on +clinical treatments, which may be further generalized to other +neurodegenerative diseases. + +
+
+
+
+
+ + ♻ ☆ Resource-Adaptive Newton's Method for Distributed Learning + + +
+ Distributed stochastic optimization methods based on Newton's method offer +significant advantages over first-order methods by leveraging curvature +information for improved performance. However, the practical applicability of +Newton's method is hindered in large-scale and heterogeneous learning +environments due to challenges such as high computation and communication costs +associated with the Hessian matrix, sub-model diversity, staleness in training, +and data heterogeneity. To address these challenges, this paper introduces a +novel and efficient algorithm called RANL, which overcomes the limitations of +Newton's method by employing a simple Hessian initialization and adaptive +assignments of training regions. The algorithm demonstrates impressive +convergence properties, which are rigorously analyzed under standard +assumptions in stochastic optimization. The theoretical analysis establishes +that RANL achieves a linear convergence rate while effectively adapting to +available resources and maintaining high efficiency. Unlike traditional +first-order methods, RANL exhibits remarkable independence from the condition +number of the problem and eliminates the need for complex parameter tuning. +These advantages make RANL a promising approach for distributed stochastic +optimization in practical scenarios. + +
+
+
+
+
+ + ♻ ☆ LExecutor: Learning-Guided Execution + + +
+ Executing code is essential for various program analysis tasks, e.g., to +detect bugs that manifest through exceptions or to obtain execution traces for +further dynamic analysis. However, executing an arbitrary piece of code is +often difficult in practice, e.g., because of missing variable definitions, +missing user inputs, and missing third-party dependencies. This paper presents +LExecutor, a learning-guided approach for executing arbitrary code snippets in +an underconstrained way. The key idea is to let a neural model predict missing +values that otherwise would cause the program to get stuck, and to inject these +values into the execution. For example, LExecutor injects likely values for +otherwise undefined variables and likely return values of calls to otherwise +missing functions. We evaluate the approach on Python code from popular +open-source projects and on code snippets extracted from Stack Overflow. The +neural model predicts realistic values with an accuracy between 79.5% and +98.2%, allowing LExecutor to closely mimic real executions. As a result, the +approach successfully executes significantly more code than any available +technique, such as simply executing the code as-is. For example, executing the +open-source code snippets as-is covers only 4.1% of all lines, because the code +crashes early on, whereas LExecutor achieves a coverage of 51.6%. + +
+
+ comment: Accepted in research track of the ACM Joint European Software + Engineering Conference and Symposium on the Foundations of Software + Engineering (ESEC/FSE) 2023 +
+
+
+
+
+ + ♻ ☆ Characteristics of networks generated by kernel growing neural gas + + +
+ This research aims to develop kernel GNG, a kernelized version of the growing +neural gas (GNG) algorithm, and to investigate the features of the networks +generated by the kernel GNG. The GNG is an unsupervised artificial neural +network that can transform a dataset into an undirected graph, thereby +extracting the features of the dataset as a graph. The GNG is widely used in +vector quantization, clustering, and 3D graphics. Kernel methods are often used +to map a dataset to feature space, with support vector machines being the most +prominent application. This paper introduces the kernel GNG approach and +explores the characteristics of the networks generated by kernel GNG. Five +kernels, including Gaussian, Laplacian, Cauchy, inverse multiquadric, and log +kernels, are used in this study. The results of this study show that the +average degree and the average clustering coefficient decrease as the kernel +parameter increases for Gaussian, Laplacian, Cauchy, and IMQ kernels. If we +avoid more edges and a higher clustering coefficient (or more triangles), the +kernel GNG with a larger value of the parameter will be more appropriate. + +
+
+
+
+
+ + ♻ ☆ Experts Weights Averaging: A New General Training Scheme for Vision + Transformers + + +
+ Structural re-parameterization is a general training scheme for Convolutional +Neural Networks (CNNs), which achieves performance improvement without +increasing inference cost. As Vision Transformers (ViTs) are gradually +surpassing CNNs in various visual tasks, one may question: if a training scheme +specifically for ViTs exists that can also achieve performance improvement +without increasing inference cost? Recently, Mixture-of-Experts (MoE) has +attracted increasing attention, as it can efficiently scale up the capacity of +Transformers at a fixed cost through sparsely activated experts. Considering +that MoE can also be viewed as a multi-branch structure, can we utilize MoE to +implement a ViT training scheme similar to structural re-parameterization? In +this paper, we affirmatively answer these questions, with a new general +training strategy for ViTs. Specifically, we decouple the training and +inference phases of ViTs. During training, we replace some Feed-Forward +Networks (FFNs) of the ViT with specially designed, more efficient MoEs that +assign tokens to experts by random uniform partition, and perform Experts +Weights Averaging (EWA) on these MoEs at the end of each iteration. After +training, we convert each MoE into an FFN by averaging the experts, +transforming the model back into original ViT for inference. We further provide +a theoretical analysis to show why and how it works. Comprehensive experiments +across various 2D and 3D visual tasks, ViT architectures, and datasets validate +the effectiveness and generalizability of the proposed training scheme. +Besides, our training scheme can also be applied to improve performance when +fine-tuning ViTs. Lastly, but equally important, the proposed EWA technique can +significantly improve the effectiveness of naive MoE in various 2D visual small +datasets and 3D visual tasks. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Learning to Control Autonomous Fleets from Observation via Offline + Reinforcement Learning + + +
+ Autonomous Mobility-on-Demand (AMoD) systems are an evolving mode of +transportation in which a centrally coordinated fleet of self-driving vehicles +dynamically serves travel requests. The control of these systems is typically +formulated as a large network optimization problem, and reinforcement learning +(RL) has recently emerged as a promising approach to solve the open challenges +in this space. Recent centralized RL approaches focus on learning from online +data, ignoring the per-sample-cost of interactions within real-world +transportation systems. To address these limitations, we propose to formalize +the control of AMoD systems through the lens of offline reinforcement learning +and learn effective control strategies using solely offline data, which is +readily available to current mobility operators. We further investigate design +decisions and provide empirical evidence based on data from real-world mobility +systems showing how offline learning allows to recover AMoD control policies +that (i) exhibit performance on par with online methods, (ii) allow for +sample-efficient online fine-tuning and (iii) eliminate the need for complex +simulation environments. Crucially, this paper demonstrates that offline RL is +a promising paradigm for the application of RL-based solutions within +economically-critical systems, such as mobility systems. + +
+
+
+
+
+ + ♻ ☆ Early Stopping for Deep Image Prior + + +
+ Deep image prior (DIP) and its variants have showed remarkable potential for +solving inverse problems in computer vision, without any extra training data. +Practical DIP models are often substantially overparameterized. During the +fitting process, these models learn mostly the desired visual content first, +and then pick up the potential modeling and observational noise, i.e., +overfitting. Thus, the practicality of DIP often depends critically on good +early stopping (ES) that captures the transition period. In this regard, the +majority of DIP works for vision tasks only demonstrates the potential of the +models -- reporting the peak performance against the ground truth, but provides +no clue about how to operationally obtain near-peak performance without access +to the groundtruth. In this paper, we set to break this practicality barrier of +DIP, and propose an efficient ES strategy, which consistently detects near-peak +performance across several vision tasks and DIP variants. Based on a simple +measure of dispersion of consecutive DIP reconstructions, our ES method not +only outpaces the existing ones -- which only work in very narrow domains, but +also remains effective when combined with a number of methods that try to +mitigate the overfitting. The code is available at +https://github.com/sun-umn/Early_Stopping_for_DIP. + +
+
+
+
+
+ + ♻ ☆ Towards Learning and Explaining Indirect Causal Effects in Neural + Networks + + +
+ Recently, there has been a growing interest in learning and explaining causal +effects within Neural Network (NN) models. By virtue of NN architectures, +previous approaches consider only direct and total causal effects assuming +independence among input variables. We view an NN as a structural causal model +(SCM) and extend our focus to include indirect causal effects by introducing +feedforward connections among input neurons. We propose an ante-hoc method that +captures and maintains direct, indirect, and total causal effects during NN +model training. We also propose an algorithm for quantifying learned causal +effects in an NN model and efficient approximation strategies for quantifying +causal effects in high-dimensional data. Extensive experiments conducted on +synthetic and real-world datasets demonstrate that the causal effects learned +by our ante-hoc method better approximate the ground truth effects compared to +existing methods. + +
+
+
+
+
+ + ♻ ☆ Symbolic Relational Deep Reinforcement Learning based on Graph Neural + Networks and Autoregressive Policy Decomposition + + +
+ We focus on reinforcement learning (RL) in relational problems that are +naturally defined in terms of objects, their relations, and object-centric +actions. These problems are characterized by variable state and action spaces, +and finding a fixed-length representation, required by most existing RL +methods, is difficult, if not impossible. We present a deep RL framework based +on graph neural networks and auto-regressive policy decomposition that +naturally works with these problems and is completely domain-independent. We +demonstrate the framework's broad applicability in three distinct domains and +show impressive zero-shot generalization over different problem sizes. + +
+
+ comment: code available at https://github.com/jaromiru/sr-drl +
+
+
+
+
+ + ♻ ☆ E-commerce users' preferences for delivery options + + +
+ Many e-commerce marketplaces offer their users fast delivery options for free +to meet the increasing needs of users, imposing an excessive burden on city +logistics. Therefore, understanding e-commerce users' preference for delivery +options is a key to designing logistics policies. To this end, this study +designs a stated choice survey in which respondents are faced with choice tasks +among different delivery options and time slots, which was completed by 4,062 +users from the three major metropolitan areas in Japan. To analyze the data, +mixed logit models capturing taste heterogeneity as well as flexible +substitution patterns have been estimated. The model estimation results +indicate that delivery attributes including fee, time, and time slot size are +significant determinants of the delivery option choices. Associations between +users' preferences and socio-demographic characteristics, such as age, gender, +teleworking frequency and the presence of a delivery box, were also suggested. +Moreover, we analyzed two willingness-to-pay measures for delivery, namely, the +value of delivery time savings (VODT) and the value of time slot shortening +(VOTS), and applied a non-semiparametric approach to estimate their +distributions in a data-oriented manner. Although VODT has a large +heterogeneity among respondents, the estimated median VODT is 25.6 JPY/day, +implying that more than half of the respondents would wait an additional day if +the delivery fee were increased by only 26 JPY, that is, they do not +necessarily need a fast delivery option but often request it when cheap or +almost free. Moreover, VOTS was found to be low, distributed with the median of +5.0 JPY/hour; that is, users do not highly value the reduction in time slot +size in monetary terms. These findings on e-commerce users' preferences can +help in designing levels of service for last-mile delivery to significantly +improve its efficiency. + +
+
+ comment: Section 1 needs to be rewritten +
+
+
+
+
+ + ♻ ☆ AudioFormer: Audio Transformer learns audio feature representations from + discrete acoustic codes + + +
+ We propose a method named AudioFormer,which learns audio feature +representations through the acquisition of discrete acoustic codes and +subsequently fine-tunes them for audio classification tasks. Initially,we +introduce a novel perspective by considering the audio classification task as a +form of natural language understanding (NLU). Leveraging an existing neural +audio codec model,we generate discrete acoustic codes and utilize them to train +a masked language model (MLM),thereby obtaining audio feature representations. +Furthermore,we pioneer the integration of a Multi-Positive sample Contrastive +(MPC) learning approach. This method enables the learning of joint +representations among multiple discrete acoustic codes within the same audio +input. In our experiments,we treat discrete acoustic codes as textual data and +train a masked language model using a cloze-like methodology,ultimately +deriving high-quality audio representations. Notably,the MPC learning technique +effectively captures collaborative representations among distinct positive +samples. Our research outcomes demonstrate that AudioFormer attains +significantly improved performance compared to prevailing monomodal audio +classification models across multiple datasets,and even outperforms +audio-visual multimodal classification models on select datasets. +Specifically,our approach achieves remarkable results on datasets including +AudioSet (2M,20K),and FSD50K,with performance scores of 53.9,45.1,and +65.6,respectively. We have openly shared both the code and models: +https://github.com/LZH-0225/AudioFormer.git. + +
+
+ comment: Need to supplement more detailed experiments +
+
+
+
+
+ + ♻ ☆ A Neural-Network-Based Convex Regularizer for Inverse Problems + + +
+ The emergence of deep-learning-based methods to solve image-reconstruction +problems has enabled a significant increase in reconstruction quality. +Unfortunately, these new methods often lack reliability and explainability, and +there is a growing interest to address these shortcomings while retaining the +boost in performance. In this work, we tackle this issue by revisiting +regularizers that are the sum of convex-ridge functions. The gradient of such +regularizers is parameterized by a neural network that has a single hidden +layer with increasing and learnable activation functions. This neural network +is trained within a few minutes as a multistep Gaussian denoiser. The numerical +experiments for denoising, CT, and MRI reconstruction show improvements over +methods that offer similar reliability guarantees. + +
+
+
+
+
+ + ♻ ☆ Rethinking the Role of Pre-Trained Networks in Source-Free Domain + Adaptation ICCV 2023 + + +
+ Source-free domain adaptation (SFDA) aims to adapt a source model trained on +a fully-labeled source domain to an unlabeled target domain. Large-data +pre-trained networks are used to initialize source models during source +training, and subsequently discarded. However, source training can cause the +model to overfit to source data distribution and lose applicable target domain +knowledge. We propose to integrate the pre-trained network into the target +adaptation process as it has diversified features important for generalization +and provides an alternate view of features and classification decisions +different from the source model. We propose to distil useful target domain +information through a co-learning strategy to improve target pseudolabel +quality for finetuning the source model. Evaluation on 4 benchmark datasets +show that our proposed strategy improves adaptation performance and can be +successfully integrated with existing SFDA methods. Leveraging modern +pre-trained networks that have stronger representation learning ability in the +co-learning strategy further boosts performance. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Random Inverse Problems Over Graphs: Decentralized Online Learning + + +
+ We establish a framework of distributed random inverse problems over network +graphs with online measurements, and propose a decentralized online learning +algorithm. This unifies the distributed parameter estimation in Hilbert spaces +and the least mean square problem in reproducing kernel Hilbert spaces +(RKHS-LMS). We transform the convergence of the algorithm into the asymptotic +stability of a class of inhomogeneous random difference equations in Hilbert +spaces with L2-bounded martingale difference terms and develop the L2 +-asymptotic stability theory in Hilbert spaces. It is shown that if the network +graph is connected and the sequence of forward operators satisfies the +infinite-dimensional spatio-temporal persistence of excitation condition, then +the estimates of all nodes are mean square and almost surely strongly +consistent. Moreover, we propose a decentralized online learning algorithm in +RKHS based on non-stationary and non-independent online data streams, and prove +that the algorithm is mean square and almost surely strongly consistent if the +operators induced by the random input data satisfy the infinite-dimensional +spatio-temporal persistence of excitation condition. + +
+
+
+
+
+ + ♻ ☆ Match-And-Deform: Time Series Domain Adaptation through Optimal + Transport and Temporal Alignment + + +
+ While large volumes of unlabeled data are usually available, associated +labels are often scarce. The unsupervised domain adaptation problem aims at +exploiting labels from a source domain to classify data from a related, yet +different, target domain. When time series are at stake, new difficulties arise +as temporal shifts may appear in addition to the standard feature distribution +shift. In this paper, we introduce the Match-And-Deform (MAD) approach that +aims at finding correspondences between the source and target time series while +allowing temporal distortions. The associated optimization problem +simultaneously aligns the series thanks to an optimal transport loss and the +time stamps through dynamic time warping. When embedded into a deep neural +network, MAD helps learning new representations of time series that both align +the domains and maximize the discriminative power of the network. Empirical +studies on benchmark datasets and remote sensing data demonstrate that MAD +makes meaningful sample-to-sample pairing and time shift estimation, reaching +similar or better classification performance than state-of-the-art deep time +series domain adaptation strategies. + +
+
+
+
+
+ + ♻ ☆ DRIP: Deep Regularizers for Inverse Problems + + +
+ In this paper we consider inverse problems that are mathematically ill-posed. +That is, given some (noisy) data, there is more than one solution that +approximately fits the data. In recent years, deep neural techniques that find +the most appropriate solution, in the sense that it contains a-priori +information, were developed. However, they suffer from several shortcomings. +First, most techniques cannot guarantee that the solution fits the data at +inference. Second, while the derivation of the techniques is inspired by the +existence of a valid scalar regularization function, such techniques do not in +practice rely on such a function, and therefore veer away from classical +variational techniques. In this work we introduce a new family of neural +regularizers for the solution of inverse problems. These regularizers are based +on a variational formulation and are guaranteed to fit the data. We demonstrate +their use on a number of highly ill-posed problems, from image deblurring to +limited angle tomography. + +
+
+
+
+
+ + ♻ ☆ Beyond Sharing: Conflict-Aware Multivariate Time Series Anomaly + Detection + + +
+ Massive key performance indicators (KPIs) are monitored as multivariate time +series data (MTS) to ensure the reliability of the software applications and +service system. Accurately detecting the abnormality of MTS is very critical +for subsequent fault elimination. The scarcity of anomalies and manual labeling +has led to the development of various self-supervised MTS anomaly detection +(AD) methods, which optimize an overall objective/loss encompassing all +metrics' regression objectives/losses. However, our empirical study uncovers +the prevalence of conflicts among metrics' regression objectives, causing MTS +models to grapple with different losses. This critical aspect significantly +impacts detection performance but has been overlooked in existing approaches. +To address this problem, by mimicking the design of multi-gate +mixture-of-experts (MMoE), we introduce CAD, a Conflict-aware multivariate KPI +Anomaly Detection algorithm. CAD offers an exclusive structure for each metric +to mitigate potential conflicts while fostering inter-metric promotions. Upon +thorough investigation, we find that the poor performance of vanilla MMoE +mainly comes from the input-output misalignment settings of MTS formulation and +convergence issues arising from expansive tasks. To address these challenges, +we propose a straightforward yet effective task-oriented metric selection and +p&s (personalized and shared) gating mechanism, which establishes CAD as the +first practicable multi-task learning (MTL) based MTS AD model. Evaluations on +multiple public datasets reveal that CAD obtains an average F1-score of 0.943 +across three public datasets, notably outperforming state-of-the-art methods. +Our code is accessible at https://github.com/dawnvince/MTS_CAD. + +
+
+ comment: 11 pages, ESEC/FSE industry track 2023 +
+
+
+
+
+ + ♻ ☆ IntelliGraphs: Datasets for Benchmarking Knowledge Graph Generation + + +
+ Knowledge Graph Embedding (KGE) models are used to learn continuous +representations of entities and relations. A key task in the literature is +predicting missing links between entities. However, Knowledge Graphs are not +just sets of links but also have semantics underlying their structure. +Semantics is crucial in several downstream tasks, such as query answering or +reasoning. We introduce the subgraph inference task, where a model has to +generate likely and semantically valid subgraphs. We propose IntelliGraphs, a +set of five new Knowledge Graph datasets. The IntelliGraphs datasets contain +subgraphs with semantics expressed in logical rules for evaluating subgraph +inference. We also present the dataset generator that produced the synthetic +datasets. We designed four novel baseline models, which include three models +based on traditional KGEs. We evaluate their expressiveness and show that these +models cannot capture the semantics. We believe this benchmark will encourage +the development of machine learning models that emphasize semantic +understanding. + +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning for Artificial Upwelling Energy Management + + +
+ The potential of artificial upwelling (AU) as a means of lifting +nutrient-rich bottom water to the surface, stimulating seaweed growth, and +consequently enhancing ocean carbon sequestration, has been gaining increasing +attention in recent years. This has led to the development of the first +solar-powered and air-lifted AU system (AUS) in China. However, efficient +scheduling of air injection systems in complex marine environments remains a +crucial challenge in operating AUS, as it holds the potential to significantly +improve energy efficiency. To tackle this challenge, we propose a novel energy +management approach that utilizes deep reinforcement learning (DRL) algorithm +to develop efficient strategies for operating AUS. Specifically, we formulate +the problem of maximizing the energy efficiency of AUS as a Markov decision +process and integrate the quantile network in distributional reinforcement +learning (QR-DQN) with the deep dueling network to solve it. Through extensive +simulations, we evaluate the performance of our algorithm and demonstrate its +superior effectiveness over traditional rule-based approaches and other DRL +algorithms in reducing energy wastage while ensuring the stable and efficient +operation of AUS. Our findings suggest that a DRL-based approach offers a +promising way to improve the energy efficiency of AUS and enhance the +sustainability of seaweed cultivation and carbon sequestration in the ocean. + +
+
+ comment: 31 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Masked Feature Modelling: Feature Masking for the Unsupervised + Pre-training of a Graph Attention Network Block for Bottom-up Video Event + Recognition + + +
+ In this paper, we introduce Masked Feature Modelling (MFM), a novel approach +for the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM +utilizes a pretrained Visual Tokenizer to reconstruct masked features of +objects within a video, leveraging the MiniKinetics dataset. We then +incorporate the pre-trained GAT block into a state-of-the-art bottom-up +supervised video-event recognition architecture, ViGAT, to improve the model's +starting point and overall accuracy. Experimental evaluations on the YLI-MED +dataset demonstrate the effectiveness of MFM in improving event recognition +performance. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Distributed Graph Neural Network Training: A Survey + + +
+ Graph neural networks (GNNs) are a type of deep learning models that are +trained on graphs and have been successfully applied in various domains. +Despite the effectiveness of GNNs, it is still challenging for GNNs to +efficiently scale to large graphs. As a remedy, distributed computing becomes a +promising solution of training large-scale GNNs, since it is able to provide +abundant computing resources. However, the dependency of graph structure +increases the difficulty of achieving high-efficiency distributed GNN training, +which suffers from the massive communication and workload imbalance. In recent +years, many efforts have been made on distributed GNN training, and an array of +training algorithms and systems have been proposed. Yet, there is a lack of +systematic review on the optimization techniques for the distributed execution +of GNN training. In this survey, we analyze three major challenges in +distributed GNN training that are massive feature communication, the loss of +model accuracy and workload imbalance. Then we introduce a new taxonomy for the +optimization techniques in distributed GNN training that address the above +challenges. The new taxonomy classifies existing techniques into four +categories that are GNN data partition, GNN batch generation, GNN execution +model, and GNN communication protocol. We carefully discuss the techniques in +each category. In the end, we summarize existing distributed GNN systems for +multi-GPUs, GPU-clusters and CPU-clusters, respectively, and give a discussion +about the future direction on distributed GNN training. + +
+
+
+
+
+ + ♻ ☆ Scenario generation for market risk models using generative neural + networks + + +
+ In this research, we show how to expand existing approaches of using +generative adversarial networks (GANs) as economic scenario generators (ESG) to +a whole internal market risk model - with enough risk factors to model the full +band-width of investments for an insurance company and for a one year time +horizon as required in Solvency 2. We demonstrate that the results of a +GAN-based internal model are similar to regulatory approved internal models in +Europe. Therefore, GAN-based models can be seen as a data-driven alternative +way of market risk modeling. + +
+
+
+
+
+ + ♻ ☆ Frequentist Regret Bounds for Randomized Least-Squares Value Iteration + + +
+ We consider the exploration-exploitation dilemma in finite-horizon +reinforcement learning (RL). When the state space is large or continuous, +traditional tabular approaches are unfeasible and some form of function +approximation is mandatory. In this paper, we introduce an +optimistically-initialized variant of the popular randomized least-squares +value iteration (RLSVI), a model-free algorithm where exploration is induced by +perturbing the least-squares approximation of the action-value function. Under +the assumption that the Markov decision process has low-rank transition +dynamics, we prove that the frequentist regret of RLSVI is upper-bounded by +$\widetilde O(d^2 H^2 \sqrt{T})$ where $ d $ are the feature dimension, $ H $ +is the horizon, and $ T $ is the total number of steps. To the best of our +knowledge, this is the first frequentist regret analysis for randomized +exploration with function approximation. + +
+
+ comment: Minor bug fixes +
+
+
+
+
+ + ♻ ☆ NeuRI: Diversifying DNN Generation via Inductive Rule Inference + + +
+ Deep Learning (DL) is prevalently used in various industries to improve +decision-making and automate processes, driven by the ever-evolving DL +libraries and compilers. The correctness of DL systems is crucial for trust in +DL applications. As such, the recent wave of research has been studying the +automated synthesis of test-cases (i.e., DNN models and their inputs) for +fuzzing DL systems. However, existing model generators only subsume a limited +number of operators, lacking the ability to pervasively model operator +constraints. To address this challenge, we propose NeuRI, a fully automated +approach for generating valid and diverse DL models composed of hundreds of +types of operators. NeuRI adopts a three-step process: (i) collecting valid and +invalid API traces from various sources; (ii) applying inductive program +synthesis over the traces to infer the constraints for constructing valid +models; and (iii) performing hybrid model generation by incorporating both +symbolic and concrete operators concolically. Our evaluation shows that NeuRI +improves branch coverage of TensorFlow and PyTorch by 24% and 15% over the +state-of-the-art model-level fuzzers. NeuRI finds 100 new bugs for PyTorch and +TensorFlow in four months, with 81 already fixed or confirmed, and 8 +high-priority bugs labeled by PyTorch, constituting 10% of all high-priority +bugs of the period. Additionally, open-source developers regard error-inducing +models reported by us as "high-quality" and "common in practice". + +
+
+
+
+
+ + ♻ ☆ Feature Unlearning for Pre-trained GANs and VAEs + + +
+ We tackle the problem of feature unlearning from a pre-trained image +generative model: GANs and VAEs. Unlike a common unlearning task where an +unlearning target is a subset of the training set, we aim to unlearn a specific +feature, such as hairstyle from facial images, from the pre-trained generative +models. As the target feature is only presented in a local region of an image, +unlearning the entire image from the pre-trained model may result in losing +other details in the remaining region of the image. To specify which features +to unlearn, we collect randomly generated images that contain the target +features. We then identify a latent representation corresponding to the target +feature and then use the representation to fine-tune the pre-trained model. +Through experiments on MNIST and CelebA datasets, we show that target features +are successfully removed while keeping the fidelity of the original models. +Further experiments with an adversarial attack show that the unlearned model is +more robust under the presence of malicious parties. + +
+
+
+
+
+ + ♻ ☆ Why Does Little Robustness Help? Understanding and Improving Adversarial + Transferability from Surrogate Training + + +
+ Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs +that successfully fool white-box surrogate models can also deceive other +black-box models with different architectures. Although a bunch of empirical +studies have provided guidance on generating highly transferable AEs, many of +these findings lack explanations and even lead to inconsistent advice. In this +paper, we take a further step towards understanding adversarial +transferability, with a particular focus on surrogate aspects. Starting from +the intriguing little robustness phenomenon, where models adversarially trained +with mildly perturbed adversarial samples can serve as better surrogates, we +attribute it to a trade-off between two predominant factors: model smoothness +and gradient similarity. Our investigations focus on their joint effects, +rather than their separate correlations with transferability. Through a series +of theoretical and empirical analyses, we conjecture that the data distribution +shift in adversarial training explains the degradation of gradient similarity. +Building on these insights, we explore the impacts of data augmentation and +gradient regularization on transferability and identify that the trade-off +generally exists in the various training mechanisms, thus building a +comprehensive blueprint for the regulation mechanism behind transferability. +Finally, we provide a general route for constructing better surrogates to boost +transferability which optimizes both model smoothness and gradient similarity +simultaneously, e.g., the combination of input gradient regularization and +sharpness-aware minimization (SAM), validated by extensive experiments. In +summary, we call for attention to the united impacts of these two factors for +launching effective transfer attacks, rather than optimizing one while ignoring +the other, and emphasize the crucial role of manipulating surrogate models. + +
+
+ comment: Accepted by IEEE Symposium on Security and Privacy (Oakland) 2024; 21 + pages, 11 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ Comparative Study: Standalone IEEE 16-bit Floating-Point for Image + Classification + + +
+ Reducing the number of bits needed to encode the weights and activations of +neural networks is highly desirable as it speeds up their training and +inference time while reducing memory consumption. It is unsurprising that +considerable attention has been drawn to developing neural networks that employ +lower-precision computation. This includes IEEE 16-bit, Google bfloat16, 8-bit, +4-bit floating-point or fixed-point, 2-bit, and various mixed-precision +algorithms. Out of these low-precision formats, IEEE 16-bit stands out due to +its universal compatibility with contemporary GPUs. This accessibility +contrasts with bfloat16, which needs high-end GPUs, or other non-standard +fewer-bit designs, which typically require software simulation. This study +focuses on the widely accessible IEEE 16-bit format for comparative analysis. +This analysis involves an in-depth theoretical investigation of the factors +that lead to discrepancies between 16-bit and 32-bit models, including a +formalization of the concepts of floating-point error and tolerance to +understand the conditions under which a 16-bit model can approximate 32-bit +results. Contrary to literature that credits the success of noise-tolerated +neural networks to regularization effects, our study-supported by a series of +rigorous experiments-provides a quantitative explanation of why standalone IEEE +16-bit floating-point neural networks can perform on par with 32-bit and +mixed-precision networks in various image classification tasks. Because no +prior research has studied IEEE 16-bit as a standalone floating-point precision +in neural networks, we believe our findings will have significant impacts, +encouraging the adoption of standalone IEEE 16-bit networks in future neural +network applications. + +
+
+
+
+
+ + ♻ ☆ Augmenting Reinforcement Learning with Transformer-based Scene + Representation Learning for Decision-making of Autonomous Driving + + +
+ Decision-making for urban autonomous driving is challenging due to the +stochastic nature of interactive traffic participants and the complexity of +road structures. Although reinforcement learning (RL)-based decision-making +scheme is promising to handle urban driving scenarios, it suffers from low +sample efficiency and poor adaptability. In this paper, we propose Scene-Rep +Transformer to improve the RL decision-making capabilities with better scene +representation encoding and sequential predictive latent distillation. +Specifically, a multi-stage Transformer (MST) encoder is constructed to model +not only the interaction awareness between the ego vehicle and its neighbors +but also intention awareness between the agents and their candidate routes. A +sequential latent Transformer (SLT) with self-supervised learning objectives is +employed to distill the future predictive information into the latent scene +representation, in order to reduce the exploration space and speed up training. +The final decision-making module based on soft actor-critic (SAC) takes as +input the refined latent scene representation from the Scene-Rep Transformer +and outputs driving actions. The framework is validated in five challenging +simulated urban scenarios with dense traffic, and its performance is manifested +quantitatively by the substantial improvements in data efficiency and +performance in terms of success rate, safety, and efficiency. The qualitative +results reveal that our framework is able to extract the intentions of neighbor +agents to help make decisions and deliver more diversified driving behaviors. + +
+
+
+
+
+ + ♻ ☆ PGB: A PubMed Graph Benchmark for Heterogeneous Network Representation + Learning + + +
+ There has been rapid growth in biomedical literature, yet capturing the +heterogeneity of the bibliographic information of these articles remains +relatively understudied. Although graph mining research via heterogeneous graph +neural networks has taken center stage, it remains unclear whether these +approaches capture the heterogeneity of the PubMed database, a vast digital +repository containing over 33 million articles. We introduce PubMed Graph +Benchmark (PGB), a new benchmark dataset for evaluating heterogeneous graph +embeddings for biomedical literature. The benchmark contains rich metadata +including abstract, authors, citations, MeSH terms, MeSH hierarchy, and some +other information. The benchmark contains three different evaluation tasks +encompassing systematic reviews, node classification, and node clustering. In +PGB, we aggregate the metadata associated with the biomedical articles from +PubMed into a unified source and make the benchmark publicly available for any +future works. + +
+
+
+
+
+ + ♻ ☆ To Spike or Not To Spike: A Digital Hardware Perspective on Deep + Learning Acceleration + + +
+ As deep learning models scale, they become increasingly competitive from +domains spanning computer vision to natural language processing; however, this +happens at the expense of efficiency since they require increasingly more +memory and computing power. The power efficiency of the biological brain +outperforms the one of any large-scale deep learning (DL) model; thus, +neuromorphic computing tries to mimic the brain operations, such as spike-based +information processing, to improve the efficiency of DL models. Despite the +benefits of the brain, such as efficient information transmission, dense +neuronal interconnects, and the co-location of computation and memory, the +available biological substrate has severely constrained the evolution of +biological brains. Electronic hardware does not have the same constraints; +therefore, while modeling spiking neural networks (SNNs) might uncover one +piece of the puzzle, the design of efficient hardware backends for SNNs needs +further investigation, potentially taking inspiration from the available work +done on the artificial neural networks (ANN s) side. As such, when is it wise +to look at the brain while designing new hardware, and when should it be +ignored? To answer this question, we quantitatively compare the digital +hardware acceleration techniques and platforms of ANN s and SNNs. + +
+
+ comment: Replace with reviewed version. Submitted to JETCAS +
+
+
+
+
+ + ♻ ☆ Scale Federated Learning for Label Set Mismatch in Medical Image + Classification + + +
+ Federated learning (FL) has been introduced to the healthcare domain as a +decentralized learning paradigm that allows multiple parties to train a model +collaboratively without privacy leakage. However, most previous studies have +assumed that every client holds an identical label set. In reality, medical +specialists tend to annotate only diseases within their area of expertise or +interest. This implies that label sets in each client can be different and even +disjoint. In this paper, we propose the framework FedLSM to solve the problem +of Label Set Mismatch. FedLSM adopts different training strategies on data with +different uncertainty levels to efficiently utilize unlabeled or partially +labeled data as well as class-wise adaptive aggregation in the classification +layer to avoid inaccurate aggregation when clients have missing labels. We +evaluated FedLSM on two public real-world medical image datasets, including +chest X-ray (CXR) diagnosis with 112,120 CXR images and skin lesion diagnosis +with 10,015 dermoscopy images, and showed that it significantly outperformed +other state-of-the-art FL algorithms. The code can be found at +https://github.com/dzp2095/FedLSM. + +
+
+
+
+
+ + ♻ ☆ Bayesian polynomial neural networks and polynomial neural ordinary + differential equations + + +
+ Symbolic regression with polynomial neural networks and polynomial neural +ordinary differential equations (ODEs) are two recent and powerful approaches +for equation recovery of many science and engineering problems. However, these +methods provide point estimates for the model parameters and are currently +unable to accommodate noisy data. We address this challenge by developing and +validating the following Bayesian inference methods: the Laplace approximation, +Markov Chain Monte Carlo (MCMC) sampling methods, and variational inference. We +have found the Laplace approximation to be the best method for this class of +problems. Our work can be easily extended to the broader class of symbolic +neural networks to which the polynomial neural network belongs. + +
+
+
+
+
+ + ♻ ☆ How to Mask in Error Correction Code Transformer: Systematic and Double + Masking + + +
+ In communication and storage systems, error correction codes (ECCs) are +pivotal in ensuring data reliability. As deep learning's applicability has +broadened across diverse domains, there is a growing research focus on neural +network-based decoders that outperform traditional decoding algorithms. Among +these neural decoders, Error Correction Code Transformer (ECCT) has achieved +the state-of-the-art performance, outperforming other methods by large margins. +To further enhance the performance of ECCT, we propose two novel methods. +First, leveraging the systematic encoding technique of ECCs, we introduce a new +masking matrix for ECCT, aiming to improve the performance and reduce the +computational complexity. Second, we propose a novel transformer architecture +of ECCT called a double-masked ECCT. This architecture employs two different +mask matrices in a parallel manner to learn more diverse features of the +relationship between codeword bits in the masked self-attention blocks. +Extensive simulation results show that the proposed double-masked ECCT +outperforms the conventional ECCT, achieving the state-of-the-art decoding +performance with significant margins. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Implicit Regularization of Bregman Proximal Point Algorithm and Mirror + Descent on Separable Data + + +
+ Bregman proximal point algorithm (BPPA) has witnessed emerging machine +learning applications, yet its theoretical understanding has been largely +unexplored. We study the computational properties of BPPA through learning +linear classifiers with separable data, and demonstrate provable algorithmic +regularization of BPPA. For any BPPA instantiated with a fixed Bregman +divergence, we provide a lower bound of the margin obtained by BPPA with +respect to an arbitrarily chosen norm. The obtained margin lower bound differs +from the maximal margin by a multiplicative factor, which inversely depends on +the condition number of the distance-generating function measured in the dual +norm. We show that the dependence on the condition number is tight, thus +demonstrating the importance of divergence in affecting the quality of the +learned classifiers. We then extend our findings to mirror descent, for which +we establish similar connections between the margin and Bregman divergence, +together with a non-asymptotic analysis. Numerical experiments on both +synthetic and real-world datasets are provided to support our theoretical +findings. To the best of our knowledge, the aforementioned findings appear to +be new in the literature of algorithmic regularization. + +
+
+
+
+
+ + ♻ ☆ BubbleML: A Multi-Physics Dataset and Benchmarks for Machine Learning + + +
+ In the field of phase change phenomena, the lack of accessible and diverse +datasets suitable for machine learning (ML) training poses a significant +challenge. Existing experimental datasets are often restricted, with limited +availability and sparse ground truth data, impeding our understanding of this +complex multiphysics phenomena. To bridge this gap, we present the BubbleML +Dataset +\footnote{\label{git_dataset}\url{https://github.com/HPCForge/BubbleML}} which +leverages physics-driven simulations to provide accurate ground truth +information for various boiling scenarios, encompassing nucleate pool boiling, +flow boiling, and sub-cooled boiling. This extensive dataset covers a wide +range of parameters, including varying gravity conditions, flow rates, +sub-cooling levels, and wall superheat, comprising 79 simulations. BubbleML is +validated against experimental observations and trends, establishing it as an +invaluable resource for ML research. Furthermore, we showcase its potential to +facilitate exploration of diverse downstream tasks by introducing two +benchmarks: (a) optical flow analysis to capture bubble dynamics, and (b) +operator networks for learning temperature dynamics. The BubbleML dataset and +its benchmarks serve as a catalyst for advancements in ML-driven research on +multiphysics phase change phenomena, enabling the development and comparison of +state-of-the-art techniques and models. + +
+
+ comment: Submitted to Neurips Datasets and Benchmarks Track 2023 +
+
+
+
+
+ + ♻ ☆ Fix Fairness, Don't Ruin Accuracy: Performance Aware Fairness Repair + using AutoML + + +
+ Machine learning (ML) is increasingly being used in critical decision-making +software, but incidents have raised questions about the fairness of ML +predictions. To address this issue, new tools and methods are needed to +mitigate bias in ML-based software. Previous studies have proposed bias +mitigation algorithms that only work in specific situations and often result in +a loss of accuracy. Our proposed solution is a novel approach that utilizes +automated machine learning (AutoML) techniques to mitigate bias. Our approach +includes two key innovations: a novel optimization function and a +fairness-aware search space. By improving the default optimization function of +AutoML and incorporating fairness objectives, we are able to mitigate bias with +little to no loss of accuracy. Additionally, we propose a fairness-aware search +space pruning method for AutoML to reduce computational cost and repair time. +Our approach, built on the state-of-the-art Auto-Sklearn tool, is designed to +reduce bias in real-world scenarios. In order to demonstrate the effectiveness +of our approach, we evaluated our approach on four fairness problems and 16 +different ML models, and our results show a significant improvement over the +baseline and existing bias mitigation techniques. Our approach, Fair-AutoML, +successfully repaired 60 out of 64 buggy cases, while existing bias mitigation +techniques only repaired up to 44 out of 64 cases. + +
+
+ comment: In Proceedings of The 31st ACM Joint European Software Engineering + Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE + 2023) +
+
+
+
+
+ + ♻ ☆ Min-Max Optimization under Delays + + +
+ Delays and asynchrony are inevitable in large-scale machine-learning problems +where communication plays a key role. As such, several works have extensively +analyzed stochastic optimization with delayed gradients. However, as far as we +are aware, no analogous theory is available for min-max optimization, a topic +that has gained recent popularity due to applications in adversarial +robustness, game theory, and reinforcement learning. Motivated by this gap, we +examine the performance of standard min-max optimization algorithms with +delayed gradient updates. First, we show (empirically) that even small delays +can cause prominent algorithms like Extra-gradient (\texttt{EG}) to diverge on +simple instances for which \texttt{EG} guarantees convergence in the absence of +delays. Our empirical study thus suggests the need for a careful analysis of +delayed versions of min-max optimization algorithms. Accordingly, under +suitable technical assumptions, we prove that Gradient Descent-Ascent +(\texttt{GDA}) and \texttt{EG} with delayed updates continue to guarantee +convergence to saddle points for convex-concave and strongly convex-strongly +concave settings. Our complexity bounds reveal, in a transparent manner, the +slow-down in convergence caused by delays. + +
+
+
+
+
+ + ♻ ☆ Self-Deception: Reverse Penetrating the Semantic Firewall of Large + Language Models + + +
+ Large language models (LLMs), such as ChatGPT, have emerged with astonishing +capabilities approaching artificial general intelligence. While providing +convenience for various societal needs, LLMs have also lowered the cost of +generating harmful content. Consequently, LLM developers have deployed +semantic-level defenses to recognize and reject prompts that may lead to +inappropriate content. Unfortunately, these defenses are not foolproof, and +some attackers have crafted "jailbreak" prompts that temporarily hypnotize the +LLM into forgetting content defense rules and answering any improper questions. +To date, there is no clear explanation of the principles behind these +semantic-level attacks and defenses in both industry and academia. + This paper investigates the LLM jailbreak problem and proposes an automatic +jailbreak method for the first time. We propose the concept of a semantic +firewall and provide three technical implementation approaches. Inspired by the +attack that penetrates traditional firewalls through reverse tunnels, we +introduce a "self-deception" attack that can bypass the semantic firewall by +inducing LLM to generate prompts that facilitate jailbreak. We generated a +total of 2,520 attack payloads in six languages (English, Russian, French, +Spanish, Chinese, and Arabic) across seven virtual scenarios, targeting the +three most common types of violations: violence, hate, and pornography. The +experiment was conducted on two models, namely the GPT-3.5-Turbo and GPT-4. The +success rates on the two models were 86.2% and 67%, while the failure rates +were 4.7% and 2.2%, respectively. This highlighted the effectiveness of the +proposed attack method. All experimental code and raw data will be released as +open-source to inspire future research. We believe that manipulating AI +behavior through carefully crafted prompts will become an important research +direction in the future. + +
+
+ comment: Serious errors were found in the experiment, which may lead to the + overturning of the overall conclusions of the paper +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ Exploiting Diverse Feature for Multimodal Sentiment Analysis + + +
+ In this paper, we present our solution to the MuSe-Personalisation +sub-challenge in the MuSe 2023 Multimodal Sentiment Analysis Challenge. The +task of MuSe-Personalisation aims to predict the continuous arousal and valence +values of a participant based on their audio-visual, language, and +physiological signal modalities data. Considering different people have +personal characteristics, the main challenge of this task is how to build +robustness feature presentation for sentiment prediction. To address this +issue, we propose exploiting diverse features. Specifically, we proposed a +series of feature extraction methods to build a robust representation and model +ensemble. We empirically evaluate the performance of the utilized method on the +officially provided dataset. \textbf{As a result, we achieved 3rd place in the +MuSe-Personalisation sub-challenge.} Specifically, we achieve the results of +0.8492 and 0.8439 for MuSe-Personalisation in terms of arousal and valence CCC. + +
+
+
+
+
+ + ☆ Bridging the Gap: Fine-to-Coarse Sketch Interpolation Network for + High-Quality Animation Sketch Inbetweening + + +
+ The 2D animation workflow is typically initiated with the creation of +keyframes using sketch-based drawing. Subsequent inbetweens (i.e., intermediate +sketch frames) are crafted through manual interpolation for smooth animations, +which is a labor-intensive process. Thus, the prospect of automatic animation +sketch interpolation has become highly appealing. However, existing video +interpolation methods are generally hindered by two key issues for sketch +inbetweening: 1) limited texture and colour details in sketches, and 2) +exaggerated alterations between two sketch keyframes. To overcome these issues, +we propose a novel deep learning method, namely Fine-to-Coarse Sketch +Interpolation Network (FC-SIN). This approach incorporates multi-level guidance +that formulates region-level correspondence, sketch-level correspondence and +pixel-level dynamics. A multi-stream U-Transformer is then devised to +characterize sketch inbewteening patterns using these multi-level guides +through the integration of both self-attention and cross-attention mechanisms. +Additionally, to facilitate future research on animation sketch inbetweening, +we constructed a large-scale dataset - STD-12K, comprising 30 sketch animation +series in diverse artistic styles. Comprehensive experiments on this dataset +convincingly show that our proposed FC-SIN surpasses the state-of-the-art +interpolation methods. Our code and dataset will be publicly available. + +
+
+ comment: 7pages,6figures +
+
+
+
+
+ + ♻ ☆ Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language + Pretraining? + + +
+ The multimedia community has shown a significant interest in perceiving and +representing the physical world with multimodal pretrained neural network +models, and among them, the visual-language pertaining (VLP) is, currently, the +most captivating topic. However, there have been few endeavors dedicated to the +exploration of 1) whether essential linguistic knowledge (e.g., semantics and +syntax) can be extracted during VLP, and 2) how such linguistic knowledge +impact or enhance the multimodal alignment. In response, here we aim to +elucidate the impact of comprehensive linguistic knowledge, including semantic +expression and syntactic structure, on multimodal alignment. Specifically, we +design and release the SNARE, the first large-scale multimodal alignment +probing benchmark, to detect the vital linguistic components, e.g., lexical, +semantic, and syntax knowledge, containing four tasks: Semantic structure, +Negation logic, Attribute ownership, and Relationship composition. Based on our +proposed probing benchmarks, our holistic analyses of five advanced VLP models +illustrate that the VLP model: i) shows insensitivity towards complex syntax +structures and relies on content words for sentence comprehension; ii) +demonstrates limited comprehension of combinations between sentences and +negations; iii) faces challenges in determining the presence of actions or +spatial relationships within visual information and struggles with verifying +the correctness of triple combinations. We make our benchmark and code +available at \url{https://github.com/WangFei-2019/SNARE/}. + +
+
+ comment: [TL;DR] we design and release the SNARE, the first large-scale + multimodal alignment probing benchmark for current vision-language pretrained + models +
+
+
+
+
+ + ♻ ☆ Masked Feature Modelling: Feature Masking for the Unsupervised + Pre-training of a Graph Attention Network Block for Bottom-up Video Event + Recognition + + +
+ In this paper, we introduce Masked Feature Modelling (MFM), a novel approach +for the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM +utilizes a pretrained Visual Tokenizer to reconstruct masked features of +objects within a video, leveraging the MiniKinetics dataset. We then +incorporate the pre-trained GAT block into a state-of-the-art bottom-up +supervised video-event recognition architecture, ViGAT, to improve the model's +starting point and overall accuracy. Experimental evaluations on the YLI-MED +dataset demonstrate the effectiveness of MFM in improving event recognition +performance. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ VadCLIP: Adapting Vision-Language Models for Weakly Supervised Video + Anomaly Detection + + +
+ The recent contrastive language-image pre-training (CLIP) model has shown +great success in a wide range of image-level tasks, revealing remarkable +ability for learning powerful visual representations with rich semantics. An +open and worthwhile problem is efficiently adapting such a strong model to the +video domain and designing a robust video anomaly detector. In this work, we +propose VadCLIP, a new paradigm for weakly supervised video anomaly detection +(WSVAD) by leveraging the frozen CLIP model directly without any pre-training +and fine-tuning process. Unlike current works that directly feed extracted +features into the weakly supervised classifier for frame-level binary +classification, VadCLIP makes full use of fine-grained associations between +vision and language on the strength of CLIP and involves dual branch. One +branch simply utilizes visual features for coarse-grained binary +classification, while the other fully leverages the fine-grained language-image +alignment. With the benefit of dual branch, VadCLIP achieves both +coarse-grained and fine-grained video anomaly detection by transferring +pre-trained knowledge from CLIP to WSVAD task. We conduct extensive experiments +on two commonly-used benchmarks, demonstrating that VadCLIP achieves the best +performance on both coarse-grained and fine-grained WSVAD, surpassing the +state-of-the-art methods by a large margin. Specifically, VadCLIP achieves +84.51% AP and 88.02% AUC on XD-Violence and UCF-Crime, respectively. Code and +features will be released to facilitate future VAD research. + +
+
+ comment: Submitted +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 42 + +
+
+
+ + ☆ Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities + + +
+ We introduce the Qwen-VL series, a set of large-scale vision-language models +designed to perceive and understand both text and images. Comprising Qwen-VL +and Qwen-VL-Chat, these models exhibit remarkable performance in tasks like +image captioning, question answering, visual localization, and flexible +interaction. The evaluation covers a wide range of tasks including zero-shot +captioning, visual or document visual question answering, and grounding. We +demonstrate the Qwen-VL outperforms existing Large Vision Language Models +(LVLMs). We present their architecture, training, capabilities, and +performance, highlighting their contributions to advancing multimodal +artificial intelligence. Code, demo and models are available at +https://github.com/QwenLM/Qwen-VL. + +
+
+ comment: Code, demo and models are available at + https://github.com/QwenLM/Qwen-VL +
+
+
+
+
+ + ☆ Code Llama: Open Foundation Models for Code + + +
+ We release Code Llama, a family of large language models for code based on +Llama 2 providing state-of-the-art performance among open models, infilling +capabilities, support for large input contexts, and zero-shot instruction +following ability for programming tasks. We provide multiple flavors to cover a +wide range of applications: foundation models (Code Llama), Python +specializations (Code Llama - Python), and instruction-following models (Code +Llama - Instruct) with 7B, 13B and 34B parameters each. All models are trained +on sequences of 16k tokens and show improvements on inputs with up to 100k +tokens. 7B and 13B Code Llama and Code Llama - Instruct variants support +infilling based on surrounding content. Code Llama reaches state-of-the-art +performance among open models on several code benchmarks, with scores of up to +53% and 55% on HumanEval and MBPP, respectively. Notably, Code Llama - Python +7B outperforms Llama 2 70B on HumanEval and MBPP, and all our models outperform +every other publicly available model on MultiPL-E. We release Code Llama under +a permissive license that allows for both research and commercial use. + +
+
+
+
+
+ + ☆ Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language + Pretraining? + + +
+ The multimedia community has shown a significant interest in perceiving and +representing the physical world with multimodal pretrained neural network +models, and among them, the visual-language pertaining (VLP) is, currently, the +most captivating topic. However, there have been few endeavors dedicated to the +exploration of 1) whether essential linguistic knowledge (e.g., semantics and +syntax) can be extracted during VLP, and 2) how such linguistic knowledge +impact or enhance the multimodal alignment. In response, here we aim to +elucidate the impact of comprehensive linguistic knowledge, including semantic +expression and syntactic structure, on multimodal alignment. Specifically, we +design and release the SNARE, the first large-scale multimodal alignment +probing benchmark, to detect the vital linguistic components, e.g., lexical, +semantic, and syntax knowledge, containing four tasks: Semantic structure, +Negation logic, Attribute ownership, and Relationship composition. Based on our +proposed probing benchmarks, our holistic analyses of five advanced VLP models +illustrate that the VLP model: i) shows insensitivity towards complex syntax +structures and relies on content words for sentence comprehension; ii) +demonstrates limited comprehension of combinations between sentences and +negations; iii) faces challenges in determining the presence of actions or +spatial relationships within visual information and struggles with verifying +the correctness of triple combinations. We make our benchmark and code +available at \url{https://github.com/WangFei-2019/SNARE/}. + +
+
+
+
+
+ + ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking +closer to real-world applications, both in the nature of data tested ($X$: +multi-channel, multi-paged, multi-industry; $Y$: class distributions and label +set variety) and in classification tasks considered ($f$: multi-page document, +page stream, and document bundle classification, ...). We identify the lack of +public multi-page document classification datasets, formalize different +classification tasks arising in application scenarios, and motivate the value +of targeting efficient multi-page document representations. An experimental +study on proposed multi-page document classification datasets demonstrates that +current benchmarks have become irrelevant and need to be updated to evaluate +complete documents, as they naturally occur in practice. This reality check +also calls for more mature evaluation methodologies, covering calibration +evaluation, inference complexity (time-memory), and a range of realistic +distribution shifts (e.g., born-digital vs. scanning noise, shifting page +order). Our study ends on a hopeful note by recommending concrete avenues for +future improvements.} + +
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ☆ Large Language Models Vote: Prompting for Rare Disease Identification + + +
+ The emergence of generative Large Language Models (LLMs) emphasizes the need +for accurate and efficient prompting approaches. LLMs are often applied in +Few-Shot Learning (FSL) contexts, where tasks are executed with minimal +training data. FSL has become popular in many Artificial Intelligence (AI) +subdomains, including AI for health. Rare diseases, affecting a small fraction +of the population, inherently require FSL techniques due to limited data +availability, though manual data collection and annotation is costly and +time-consuming. In this paper, we propose Models-Vote Prompting (MVP), a +flexible prompting approach for improving the performance of LLM queries in FSL +settings. MVP works by prompting numerous LLMs to perform the same tasks and +then conducting a majority vote on the resulting outputs. This method achieves +improved results to any one model in the ensemble on one-shot rare disease +identification and classification tasks. We also release a novel rare disease +dataset for FSL, available to those who agreed to the MIMIC-IV Data Use +Agreement (DUA). Furthermore, in using MVP, each model is prompted multiple +times, substantially increasing the time needed for manual annotation, and to +address this, we assess the feasibility of using JSON for automating generative +LLM evaluation. + +
+
+
+
+
+ + ☆ Inducing Causal Structure for Abstractive Text Summarization + + +
+ The mainstream of data-driven abstractive summarization models tends to +explore the correlations rather than the causal relationships. Among such +correlations, there can be spurious ones which suffer from the language prior +learned from the training corpus and therefore undermine the overall +effectiveness of the learned model. To tackle this issue, we introduce a +Structural Causal Model (SCM) to induce the underlying causal structure of the +summarization data. We assume several latent causal factors and non-causal +factors, representing the content and style of the document and summary. +Theoretically, we prove that the latent factors in our SCM can be identified by +fitting the observed training data under certain conditions. On the basis of +this, we propose a Causality Inspired Sequence-to-Sequence model (CI-Seq2Seq) +to learn the causal representations that can mimic the causal factors, guiding +us to pursue causal information for summary generation. The key idea is to +reformulate the Variational Auto-encoder (VAE) to fit the joint distribution of +the document and summary variables from the training corpus. Experimental +results on two widely used text summarization datasets demonstrate the +advantages of our approach. + +
+
+
+
+
+ + ☆ Text Similarity from Image Contents using Statistical and Semantic + Analysis Techniques + + +
+ Plagiarism detection is one of the most researched areas among the Natural +Language Processing(NLP) community. A good plagiarism detection covers all the +NLP methods including semantics, named entities, paraphrases etc. and produces +detailed plagiarism reports. Detection of Cross Lingual Plagiarism requires +deep knowledge of various advanced methods and algorithms to perform effective +text similarity checking. Nowadays the plagiarists are also advancing +themselves from hiding the identity from being catch in such offense. The +plagiarists are bypassed from being detected with techniques like paraphrasing, +synonym replacement, mismatching citations, translating one language to +another. Image Content Plagiarism Detection (ICPD) has gained importance, +utilizing advanced image content processing to identify instances of plagiarism +to ensure the integrity of image content. The issue of plagiarism extends +beyond textual content, as images such as figures, graphs, and tables also have +the potential to be plagiarized. However, image content plagiarism detection +remains an unaddressed challenge. Therefore, there is a critical need to +develop methods and systems for detecting plagiarism in image content. In this +paper, the system has been implemented to detect plagiarism form contents of +Images such as Figures, Graphs, Tables etc. Along with statistical algorithms +such as Jaccard and Cosine, introducing semantic algorithms such as LSA, BERT, +WordNet outperformed in detecting efficient and accurate plagiarism. + +
+
+ comment: NLPTT2023 publication, 10 Pages +
+
+
+
+
+ + ☆ Use of LLMs for Illicit Purposes: Threats, Prevention Measures, and + Vulnerabilities + + +
+ Spurred by the recent rapid increase in the development and distribution of +large language models (LLMs) across industry and academia, much recent work has +drawn attention to safety- and security-related threats and vulnerabilities of +LLMs, including in the context of potentially criminal activities. +Specifically, it has been shown that LLMs can be misused for fraud, +impersonation, and the generation of malware; while other authors have +considered the more general problem of AI alignment. It is important that +developers and practitioners alike are aware of security-related problems with +such models. In this paper, we provide an overview of existing - predominantly +scientific - efforts on identifying and mitigating threats and vulnerabilities +arising from LLMs. We present a taxonomy describing the relationship between +threats caused by the generative capabilities of LLMs, prevention measures +intended to address such threats, and vulnerabilities arising from imperfect +prevention measures. With our work, we hope to raise awareness of the +limitations of LLMs in light of such security concerns, among both experienced +developers and novel users of such technologies. + +
+
+ comment: Pre-print +
+
+
+
+
+ + ☆ WavMark: Watermarking for Audio Generation + + +
+ Recent breakthroughs in zero-shot voice synthesis have enabled imitating a +speaker's voice using just a few seconds of recording while maintaining a high +level of realism. Alongside its potential benefits, this powerful technology +introduces notable risks, including voice fraud and speaker impersonation. +Unlike the conventional approach of solely relying on passive methods for +detecting synthetic data, watermarking presents a proactive and robust defence +mechanism against these looming risks. This paper introduces an innovative +audio watermarking framework that encodes up to 32 bits of watermark within a +mere 1-second audio snippet. The watermark is imperceptible to human senses and +exhibits strong resilience against various attacks. It can serve as an +effective identifier for synthesized voices and holds potential for broader +applications in audio copyright protection. Moreover, this framework boasts +high flexibility, allowing for the combination of multiple watermark segments +to achieve heightened robustness and expanded capacity. Utilizing 10 to +20-second audio as the host, our approach demonstrates an average Bit Error +Rate (BER) of 0.48\% across ten common attacks, a remarkable reduction of over +2800\% in BER compared to the state-of-the-art watermarking tool. See +https://aka.ms/wavmark for demos of our work. + +
+
+
+
+
+ + ☆ Real-time Detection of AI-Generated Speech for DeepFake Voice Conversion + + +
+ There are growing implications surrounding generative AI in the speech domain +that enable voice cloning and real-time voice conversion from one individual to +another. This technology poses a significant ethical threat and could lead to +breaches of privacy and misrepresentation, thus there is an urgent need for +real-time detection of AI-generated speech for DeepFake Voice Conversion. To +address the above emerging issues, the DEEP-VOICE dataset is generated in this +study, comprised of real human speech from eight well-known figures and their +speech converted to one another using Retrieval-based Voice Conversion. +Presenting as a binary classification problem of whether the speech is real or +AI-generated, statistical analysis of temporal audio features through t-testing +reveals that there are significantly different distributions. Hyperparameter +optimisation is implemented for machine learning models to identify the source +of speech. Following the training of 208 individual machine learning models +over 10-fold cross validation, it is found that the Extreme Gradient Boosting +model can achieve an average classification accuracy of 99.3% and can classify +speech in real-time, at around 0.004 milliseconds given one second of speech. +All data generated for this study is released publicly for future research on +AI speech detection. + +
+
+
+
+
+ + ☆ Harnessing the Power of David against Goliath: Exploring Instruction + Data Generation without Using Closed-Source Models + + +
+ Instruction tuning is instrumental in enabling Large Language Models~(LLMs) +to follow user instructions to complete various open-domain tasks. The success +of instruction tuning depends on the availability of high-quality instruction +data. Owing to the exorbitant cost and substandard quality of human annotation, +recent works have been deeply engaged in the exploration of the utilization of +powerful closed-source models to generate instruction data automatically. +However, these methods carry potential risks arising from the usage +requirements of powerful closed-source models, which strictly forbid the +utilization of their outputs to develop machine learning models. To deal with +this problem, in this work, we explore alternative approaches to generate +high-quality instruction data that do not rely on closed-source models. Our +exploration includes an investigation of various existing instruction +generation methods, culminating in the integration of the most efficient +variant with two novel strategies to enhance the quality further. Evaluation +results from two benchmarks and the GPT-4 model demonstrate the effectiveness +of our generated instruction data, which can outperform Alpaca, a method +reliant on closed-source models. We hope that more progress can be achieved in +generating high-quality instruction data without using closed-source models. + +
+
+
+
+
+ + ☆ Improving Translation Faithfulness of Large Language Models via + Augmenting Instructions + + +
+ Large Language Models (LLMs) present strong general capabilities, and a +current compelling challenge is stimulating their specialized capabilities, +such as machine translation, through low-cost instruction tuning. The standard +instruction-following data is sequentially organized as the concatenation of an +instruction, an input, and a response. As the attention mechanism of LLMs has +limitations on local focus, LLMs tend to focus more on the words or sentences +nearby at each position. This leads to a high risk of instruction forgetting +during decoding. To alleviate the above issues, We propose SWIE +(Segment-Weighted Instruction Embedding) and an instruction-following dataset +OVERMISS. SWIE improves the model instruction understanding by adding a global +instruction representation on the following input and response representations. +OVERMISS improves model faithfulness by comparing over-translation and +miss-translation results with the correct translation. We apply our methods to +two main-stream open-source LLMs, BLOOM and LLaMA. The experimental results +demonstrate significant improvements in translation performance with SWIE based +on BLOOMZ-3b, particularly in zero-shot and long text translations due to +reduced instruction forgetting risk. Additionally, OVERMISS outperforms the +baseline in translation performance (e.g. an increase in BLEU scores from 0.69 +to 3.12 and an average improvement of 0.48 percentage comet scores for +LLaMA-7b) with further enhancements seen in models combining OVERMISS and SWIE +(e.g. the BLUE scores increase up to 0.56 from English to German across three +different backbones), and both exhibit improvements in the faithfulness metric +based on word alignment. + +
+
+ comment: Our code and datasets are released in Github: + https://github.com/pppa2019/swie_overmiss_llm4mt +
+
+
+
+
+ + ☆ From Chatter to Matter: Addressing Critical Steps of Emotion Recognition + Learning in Task-oriented Dialogue SIGDIAL 2023 + + +
+ Emotion recognition in conversations (ERC) is a crucial task for building +human-like conversational agents. While substantial efforts have been devoted +to ERC for chit-chat dialogues, the task-oriented counterpart is largely left +unattended. Directly applying chit-chat ERC models to task-oriented dialogues +(ToDs) results in suboptimal performance as these models overlook key features +such as the correlation between emotions and task completion in ToDs. In this +paper, we propose a framework that turns a chit-chat ERC model into a +task-oriented one, addressing three critical aspects: data, features and +objective. First, we devise two ways of augmenting rare emotions to improve ERC +performance. Second, we use dialogue states as auxiliary features to +incorporate key information from the goal of the user. Lastly, we leverage a +multi-aspect emotion definition in ToDs to devise a multi-task learning +objective and a novel emotion-distance weighted loss function. Our framework +yields significant improvements for a range of chit-chat ERC models on EmoWOZ, +a large-scale dataset for user emotion in ToDs. We further investigate the +generalisability of the best resulting model to predict user satisfaction in +different ToD datasets. A comparison with supervised baselines shows a strong +zero-shot capability, highlighting the potential usage of our framework in +wider scenarios. + +
+
+ comment: Accepted by SIGDIAL 2023 +
+
+
+
+
+ + ☆ Probabilistic Method of Measuring Linguistic Productivity + + +
+ In this paper I propose a new way of measuring linguistic productivity that +objectively assesses the ability of an affix to be used to coin new complex +words and, unlike other popular measures, is not directly dependent upon token +frequency. Specifically, I suggest that linguistic productivity may be viewed +as the probability of an affix to combine with a random base. The advantages of +this approach include the following. First, token frequency does not dominate +the productivity measure but naturally influences the sampling of bases. +Second, we are not just counting attested word types with an affix but rather +simulating the construction of these types and then checking whether they are +attested in the corpus. Third, a corpus-based approach and randomised design +assure that true neologisms and words coined long ago have equal chances to be +selected. The proposed algorithm is evaluated both on English and Russian data. +The obtained results provide some valuable insights into the relation of +linguistic productivity to the number of types and tokens. It looks like +burgeoning linguistic productivity manifests itself in an increasing number of +types. However, this process unfolds in two stages: first comes the increase in +high-frequency items, and only then follows the increase in low-frequency +items. + +
+
+
+
+
+ + ☆ Advancing Hungarian Text Processing with HuSpaCy: Efficient and Accurate + NLP Pipelines + + +
+ This paper presents a set of industrial-grade text processing models for +Hungarian that achieve near state-of-the-art performance while balancing +resource efficiency and accuracy. Models have been implemented in the spaCy +framework, extending the HuSpaCy toolkit with several improvements to its +architecture. Compared to existing NLP tools for Hungarian, all of our +pipelines feature all basic text processing steps including tokenization, +sentence-boundary detection, part-of-speech tagging, morphological feature +tagging, lemmatization, dependency parsing and named entity recognition with +high accuracy and throughput. We thoroughly evaluated the proposed +enhancements, compared the pipelines with state-of-the-art tools and +demonstrated the competitive performance of the new models in all text +preprocessing steps. All experiments are reproducible and the pipelines are +freely available under a permissive license. + +
+
+ comment: Submitted to TSD 2023 Conference +
+
+
+
+
+ + ☆ PromptMRG: Diagnosis-Driven Prompts for Medical Report Generation + + +
+ Automatic medical report generation (MRG) is of great research value as it +has the potential to relieve radiologists from the heavy burden of report +writing. Despite recent advancements, accurate MRG remains challenging due to +the need for precise clinical understanding and the identification of clinical +findings. Moreover, the imbalanced distribution of diseases makes the challenge +even more pronounced, as rare diseases are underrepresented in training data, +making their diagnostic performance unreliable. To address these challenges, we +propose diagnosis-driven prompts for medical report generation (PromptMRG), a +novel framework that aims to improve the diagnostic accuracy of MRG with the +guidance of diagnosis-aware prompts. Specifically, PromptMRG is based on +encoder-decoder architecture with an extra disease classification branch. When +generating reports, the diagnostic results from the classification branch are +converted into token prompts to explicitly guide the generation process. To +further improve the diagnostic accuracy, we design cross-modal feature +enhancement, which retrieves similar reports from the database to assist the +diagnosis of a query image by leveraging the knowledge from a pre-trained CLIP. +Moreover, the disease imbalanced issue is addressed by applying an adaptive +logit-adjusted loss to the classification branch based on the individual +learning status of each disease, which overcomes the barrier of text decoder's +inability to manipulate disease distributions. Experiments on two MRG +benchmarks show the effectiveness of the proposed method, where it obtains +state-of-the-art clinical efficacy performance on both datasets. + +
+
+
+
+
+ + ☆ Mind vs. Mouth: On Measuring Re-judge Inconsistency of Social Bias in + Large Language Models + + +
+ Recent researches indicate that Pre-trained Large Language Models (LLMs) +possess cognitive constructs similar to those observed in humans, prompting +researchers to investigate the cognitive aspects of LLMs. This paper focuses on +explicit and implicit social bias, a distinctive two-level cognitive construct +in psychology. It posits that individuals' explicit social bias, which is their +conscious expression of bias in the statements, may differ from their implicit +social bias, which represents their unconscious bias. We propose a two-stage +approach and discover a parallel phenomenon in LLMs known as "re-judge +inconsistency" in social bias. In the initial stage, the LLM is tasked with +automatically completing statements, potentially incorporating implicit social +bias. However, in the subsequent stage, the same LLM re-judges the biased +statement generated by itself but contradicts it. We propose that this re-judge +inconsistency can be similar to the inconsistency between human's unaware +implicit social bias and their aware explicit social bias. Experimental +investigations on ChatGPT and GPT-4 concerning common gender biases examined in +psychology corroborate the highly stable nature of the re-judge inconsistency. +This finding may suggest that diverse cognitive constructs emerge as LLMs' +capabilities strengthen. Consequently, leveraging psychological theories can +provide enhanced insights into the underlying mechanisms governing the +expressions of explicit and implicit constructs in LLMs. + +
+
+
+
+
+ + ☆ A Small and Fast BERT for Chinese Medical Punctuation Restoration + + +
+ In clinical dictation, utterances after automatic speech recognition (ASR) +without explicit punctuation marks may lead to the misunderstanding of dictated +reports. To give a precise and understandable clinical report with ASR, +automatic punctuation restoration is required. Considering a practical +scenario, we propose a fast and light pre-trained model for Chinese medical +punctuation restoration based on 'pretraining and fine-tuning' paradigm. In +this work, we distill pre-trained models by incorporating supervised +contrastive learning and a novel auxiliary pre-training task (Punctuation Mark +Prediction) to make it well-suited for punctuation restoration. Our experiments +on various distilled models reveal that our model can achieve 95% performance +while 10% model size relative to state-of-the-art Chinese RoBERTa. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ CALM : A Multi-task Benchmark for Comprehensive Assessment of Language + Model Bias + + +
+ As language models (LMs) become increasingly powerful, it is important to +quantify and compare them for sociodemographic bias with potential for harm. +Prior bias measurement datasets are sensitive to perturbations in their +manually designed templates, therefore unreliable. To achieve reliability, we +introduce the Comprehensive Assessment of Language Model bias (CALM), a +benchmark dataset to quantify bias in LMs across three tasks. We integrate 16 +existing datasets across different domains, such as Wikipedia and news +articles, to filter 224 templates from which we construct a dataset of 78,400 +examples. We compare the diversity of CALM with prior datasets on metrics such +as average semantic similarity, and variation in template length, and test the +sensitivity to small perturbations. We show that our dataset is more diverse +and reliable than previous datasets, thus better capture the breadth of +linguistic variation required to reliably evaluate model bias. We evaluate 20 +large language models including six prominent families of LMs such as Llama-2. +In two LM series, OPT and Bloom, we found that larger parameter models are more +biased than lower parameter models. We found the T0 series of models to be the +least biased. Furthermore, we noticed a tradeoff between gender and racial bias +with increasing model size in some model series. The code is available at +https://github.com/vipulgupta1011/CALM. + +
+
+
+
+
+ + ☆ CARE: Co-Attention Network for Joint Entity and Relation Extraction + + +
+ Joint entity and relation extraction is the fundamental task of information +extraction, consisting of two subtasks: named entity recognition and relation +extraction. Most existing joint extraction methods suffer from issues of +feature confusion or inadequate interaction between two subtasks. In this work, +we propose a Co-Attention network for joint entity and Relation Extraction +(CARE). Our approach involves learning separate representations for each +subtask, aiming to avoid feature overlap. At the core of our approach is the +co-attention module that captures two-way interaction between two subtasks, +allowing the model to leverage entity information for relation prediction and +vice versa, thus promoting mutual enhancement. Extensive experiments on three +joint entity-relation extraction benchmark datasets (NYT, WebNLG and SciERC) +show that our proposed model achieves superior performance, surpassing existing +baseline models. + +
+
+
+
+
+ + ☆ Large Language Model as Autonomous Decision Maker + + +
+ While large language models (LLMs) exhibit impressive language understanding +and in-context learning abilities, their decision-making ability still heavily +relies on the guidance of task-specific expert knowledge when solving +real-world tasks. To unleash the potential of LLMs as autonomous decision +makers, this paper presents an approach JuDec to endow LLMs with the +self-judgment ability, enabling LLMs to achieve autonomous judgment and +exploration for decision making. Specifically, in JuDec, Elo-based +Self-Judgment Mechanism is designed to assign Elo scores to decision steps to +judge their values and utilities via pairwise comparisons between two solutions +and then guide the decision-searching process toward the optimal solution +accordingly. Experimental results on the ToolBench dataset demonstrate JuDec's +superiority over baselines, achieving over 10% improvement in Pass Rate on +diverse tasks. It offers higher-quality solutions and reduces costs (ChatGPT +API calls), highlighting its effectiveness and efficiency. + +
+
+ comment: Work in progess +
+
+
+
+
+ + ☆ MultiPA: a multi-task speech pronunciation assessment system for a + closed and open response scenario + + +
+ The design of automatic speech pronunciation assessment can be categorized +into closed and open response scenarios, each with strengths and limitations. A +system with the ability to function in both scenarios can cater to diverse +learning needs and provide a more precise and holistic assessment of +pronunciation skills. In this study, we propose a Multi-task Pronunciation +Assessment model called MultiPA. MultiPA provides an alternative to Kaldi-based +systems in that it has simpler format requirements and better compatibility +with other neural network models. Compared with previous open response systems, +MultiPA provides a wider range of evaluations, encompassing assessments at both +the sentence and word-level. Our experimental results show that MultiPA +achieves comparable performance when working in closed response scenarios and +maintains more robust performance when directly used for open responses. + +
+
+
+
+
+ + ☆ GPTEval: A Survey on Assessments of ChatGPT and GPT-4 + + +
+ The emergence of ChatGPT has generated much speculation in the press about +its potential to disrupt social and economic systems. Its astonishing language +ability has aroused strong curiosity among scholars about its performance in +different domains. There have been many studies evaluating the ability of +ChatGPT and GPT-4 in different tasks and disciplines. However, a comprehensive +review summarizing the collective assessment findings is lacking. The objective +of this survey is to thoroughly analyze prior assessments of ChatGPT and GPT-4, +focusing on its language and reasoning abilities, scientific knowledge, and +ethical considerations. Furthermore, an examination of the existing evaluation +methods is conducted, offering several recommendations for future research in +evaluating large language models. + +
+
+
+
+
+ + ☆ American Stories: A Large-Scale Structured Text Dataset of Historical + U.S. Newspapers + + +
+ Existing full text datasets of U.S. public domain newspapers do not recognize +the often complex layouts of newspaper scans, and as a result the digitized +content scrambles texts from articles, headlines, captions, advertisements, and +other layout regions. OCR quality can also be low. This study develops a novel, +deep learning pipeline for extracting full article texts from newspaper images +and applies it to the nearly 20 million scans in Library of Congress's public +domain Chronicling America collection. The pipeline includes layout detection, +legibility classification, custom OCR, and association of article texts +spanning multiple bounding boxes. To achieve high scalability, it is built with +efficient architectures designed for mobile phones. The resulting American +Stories dataset provides high quality data that could be used for pre-training +a large language model to achieve better understanding of historical English +and historical world knowledge. The dataset could also be added to the external +database of a retrieval-augmented language model to make historical information +- ranging from interpretations of political events to minutiae about the lives +of people's ancestors - more widely accessible. Furthermore, structured article +texts facilitate using transformer-based methods for popular social science +applications like topic classification, detection of reproduced content, and +news story clustering. Finally, American Stories provides a massive silver +quality dataset for innovating multimodal layout analysis models and other +multimodal applications. + +
+
+
+
+
+ + ☆ Sentence Embedding Models for Ancient Greek Using Multilingual Knowledge + Distillation + + +
+ Contextual language models have been trained on Classical languages, +including Ancient Greek and Latin, for tasks such as lemmatization, +morphological tagging, part of speech tagging, authorship attribution, and +detection of scribal errors. However, high-quality sentence embedding models +for these historical languages are significantly more difficult to achieve due +to the lack of training data. In this work, we use a multilingual knowledge +distillation approach to train BERT models to produce sentence embeddings for +Ancient Greek text. The state-of-the-art sentence embedding approaches for +high-resource languages use massive datasets, but our distillation approach +allows our Ancient Greek models to inherit the properties of these models while +using a relatively small amount of translated sentence data. We build a +parallel sentence dataset using a sentence-embedding alignment method to align +Ancient Greek documents with English translations, and use this dataset to +train our models. We evaluate our models on translation search, semantic +similarity, and semantic retrieval tasks and investigate translation bias. We +make our training and evaluation datasets freely available at +https://github.com/kevinkrahn/ancient-greek-datasets . + +
+
+ comment: Paper accepted for publication at the First Workshop on Ancient + Language Processing (ALP) 2023; 10 pages, 3 figures, 9 tables +
+
+
+
+
+ + ☆ Towards a Holistic Approach: Understanding Sociodemographic Biases in + NLP Models using an Interdisciplinary Lens + + +
+ The rapid growth in the usage and applications of Natural Language Processing +(NLP) in various sociotechnical solutions has highlighted the need for a +comprehensive understanding of bias and its impact on society. While research +on bias in NLP has expanded, several challenges persist that require attention. +These include the limited focus on sociodemographic biases beyond race and +gender, the narrow scope of analysis predominantly centered on models, and the +technocentric implementation approaches. This paper addresses these challenges +and advocates for a more interdisciplinary approach to understanding bias in +NLP. The work is structured into three facets, each exploring a specific aspect +of bias in NLP. + +
+
+
+
+
+ + ☆ Formal specification terminology for demographic agent-based models of + fixed-step single-clocked simulations + + +
+ This document presents adequate formal terminology for the mathematical +specification of a subset of Agent Based Models (ABMs) in the field of +Demography. The simulation of the targeted ABMs follows a fixed-step +single-clocked pattern. The proposed terminology further improves the model +understanding and can act as a stand-alone methodology for the specification +and optionally the documentation of a significant set of (demographic) ABMs. +Nevertheless, it is imaginable the this terminology probably with further +extensions can be merged with the largely-informal widely-used model +documentation and communication O.D.D. protocol [Grimm and et al., 2020, +Amouroux et al., 2010] to reduce many sources of ambiguity, hindering model +replications by other modelers. A published demographic model documentation, +largely simplified version of the Lone Parent Model [Gostoli and Silverman, +2020] is separately published in [Elsheikh, 2023b] as illustration for the +formal terminology. The model was implemented in the Julia language [Elsheikh, +2023a] based on the Agents.jl julia package [Datseris et al., 2022]. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2307.16548 +
+
+
+
+
+ + ☆ Lexical Diversity in Kinship Across Languages and Dialects + + +
+ Languages are known to describe the world in diverse ways. Across lexicons, +diversity is pervasive, appearing through phenomena such as lexical gaps and +untranslatability. However, in computational resources, such as multilingual +lexical databases, diversity is hardly ever represented. In this paper, we +introduce a method to enrich computational lexicons with content relating to +linguistic diversity. The method is verified through two large-scale case +studies on kinship terminology, a domain known to be diverse across languages +and cultures: one case study deals with seven Arabic dialects, while the other +one with three Indonesian languages. Our results, made available as browseable +and downloadable computational resources, extend prior linguistics research on +kinship terminology, and provide insight into the extent of diversity even +within linguistically and culturally close communities. + +
+
+
+
+
+ + ☆ Financial News Analytics Using Fine-Tuned Llama 2 GPT Model + + +
+ The paper considers the possibility to fine-tune Llama 2 Large Language Model +(LLM) for the multitask analysis of financial news. For fine-tuning, the +PEFT/LoRA based approach was used. In the study, the model was fine-tuned for +the following tasks: analysing a text from financial market perspectives, +highlighting main points of a text, summarizing a text and extracting named +entities with appropriate sentiments. The obtained results show that the +fine-tuned Llama 2 model can perform a multitask financial news analysis with a +specified structure of response, part of response can be a structured text and +another part of data can have JSON format for further processing. Extracted +sentiments for named entities can be considered as predictive features in +supervised machine learning models with quantitative target variables. + +
+
+
+
+
+ + ♻ ☆ Halo: Estimation and Reduction of Hallucinations in Open-Source Weak + Large Language Models + + +
+ Large Language Models (LLMs) have revolutionized Natural Language Processing +(NLP). Although convenient for research and practical applications, open-source +LLMs with fewer parameters often suffer from severe hallucinations compared to +their larger counterparts. This paper focuses on measuring and reducing +hallucinations in BLOOM 7B, a representative of such weaker open-source LLMs +that are publicly available for research and commercial applications. We +introduce HaloCheck, a lightweight BlackBox knowledge-free framework designed +to quantify the severity of hallucinations in LLMs. Additionally, we explore +techniques like knowledge injection and teacher-student approaches to alleviate +hallucinations in low-parameter LLMs. Our experiments effectively demonstrate +the reduction of hallucinations in challenging domains for these LLMs. + +
+
+
+
+
+ + ♻ ☆ Evaluation of ChatGPT on Biomedical Tasks: A Zero-Shot Comparison with + Fine-Tuned Generative Transformers ACL 2023 + + +
+ ChatGPT is a large language model developed by OpenAI. Despite its impressive +performance across various tasks, no prior work has investigated its capability +in the biomedical domain yet. To this end, this paper aims to evaluate the +performance of ChatGPT on various benchmark biomedical tasks, such as relation +extraction, document classification, question answering, and summarization. To +the best of our knowledge, this is the first work that conducts an extensive +evaluation of ChatGPT in the biomedical domain. Interestingly, we find based on +our evaluation that in biomedical datasets that have smaller training sets, +zero-shot ChatGPT even outperforms the state-of-the-art fine-tuned generative +transformer models, such as BioGPT and BioBART. This suggests that ChatGPT's +pre-training on large text corpora makes it quite specialized even in the +biomedical domain. Our findings demonstrate that ChatGPT has the potential to +be a valuable tool for various tasks in the biomedical domain that lack large +annotated data. + +
+
+ comment: Accepted by BioNLP@ACL 2023 +
+
+
+
+
+ + ♻ ☆ A Survey of Controllable Text Generation using Transformer-based + Pre-trained Language Models + + +
+ Controllable Text Generation (CTG) is emerging area in the field of natural +language generation (NLG). It is regarded as crucial for the development of +advanced text generation technologies that better meet the specific constraints +in practical applications. In recent years, methods using large-scale +pre-trained language models (PLMs), in particular the widely used +transformer-based PLMs, have become a new paradigm of NLG, allowing generation +of more diverse and fluent text. However, due to the limited level of +interpretability of deep neural networks, the controllability of these methods +need to be guaranteed. To this end, controllable text generation using +transformer-based PLMs has become a rapidly growing yet challenging new +research hotspot. A diverse range of approaches have emerged in the recent 3-4 +years, targeting different CTG tasks that require different types of controlled +constraints. In this paper, we present a systematic critical review on the +common tasks, main approaches, and evaluation methods in this area. Finally, we +discuss the challenges that the field is facing, and put forward various +promising future directions. To the best of our knowledge, this is the first +survey paper to summarize the state-of-the-art CTG techniques from the +perspective of Transformer-based PLMs. We hope it can help researchers and +practitioners in the related fields to quickly track the academic and +technological frontier, providing them with a landscape of the area and a +roadmap for future research. + +
+
+ comment: Accpeted by ACM Computing Surveys Journal +
+
+
+
+
+ + ♻ ☆ Improving Semantic Matching through Dependency-Enhanced Pre-trained + Model with Adaptive Fusion EMNLP 2022 + + +
+ Transformer-based pre-trained models like BERT have achieved great progress +on Semantic Sentence Matching. Meanwhile, dependency prior knowledge has also +shown general benefits in multiple NLP tasks. However, how to efficiently +integrate dependency prior structure into pre-trained models to better model +complex semantic matching relations is still unsettled. In this paper, we +propose the \textbf{D}ependency-Enhanced \textbf{A}daptive \textbf{F}usion +\textbf{A}ttention (\textbf{DAFA}), which explicitly introduces dependency +structure into pre-trained models and adaptively fuses it with semantic +information. Specifically, \textbf{\emph{(i)}} DAFA first proposes a +structure-sensitive paradigm to construct a dependency matrix for calibrating +attention weights. It adopts an adaptive fusion module to integrate the +obtained dependency information and the original semantic signals. Moreover, +DAFA reconstructs the attention calculation flow and provides better +interpretability. By applying it on BERT, our method achieves state-of-the-art +or competitive performance on 10 public datasets, demonstrating the benefits of +adaptively fusing dependency structure in semantic matching task. + +
+
+ comment: Accepted by Findings of EMNLP 2022 +
+
+
+
+
+ + ♻ ☆ PromptBench: Towards Evaluating the Robustness of Large Language Models + on Adversarial Prompts + + +
+ The increasing reliance on Large Language Models (LLMs) across academia and +industry necessitates a comprehensive understanding of their robustness to +prompts. In response to this vital need, we introduce PromptBench, a robustness +benchmark designed to measure LLMs' resilience to adversarial prompts. This +study uses a plethora of adversarial textual attacks targeting prompts across +multiple levels: character, word, sentence, and semantic. These prompts are +then employed in diverse tasks, such as sentiment analysis, natural language +inference, reading comprehension, machine translation, and math +problem-solving. Our study generates 4,032 adversarial prompts, meticulously +evaluated over 8 tasks and 13 datasets, with 567,084 test samples in total. Our +findings demonstrate that contemporary LLMs are vulnerable to adversarial +prompts. Furthermore, we present comprehensive analysis to understand the +mystery behind prompt robustness and its transferability. We then offer +insightful robustness analysis and pragmatic recommendations for prompt +composition, beneficial to both researchers and everyday users. We make our +code, prompts, and methodologies to generate adversarial prompts publicly +accessible, thereby enabling and encouraging collaborative exploration in this +pivotal field: https://github.com/microsoft/promptbench. + +
+
+ comment: Technical report; updated with new experiments and related work; 27 + pages; code is at: https://github.com/microsoft/promptbench +
+
+
+
+
+ + ♻ ☆ Structure-CLIP: Towards Scene Graph Knowledge to Enhance Multi-modal + Structured Representations + + +
+ Large-scale vision-language pre-training has achieved significant performance +in multi-modal understanding and generation tasks. However, existing methods +often perform poorly on image-text matching tasks that require structured +representations, i.e., representations of objects, attributes, and relations. +Previous models cannot make a distinction between ``An astronaut rides a horse" +and ``A horse rides an astronaut". This is because they fail to fully leverage +structured knowledge when learning representations in multi-modal scenarios. In +this paper, we present an end-to-end framework Structure-CLIP, which integrates +Scene Graph Knowledge (SGK) to enhance multi-modal structured representations. +Firstly, we use scene graphs to guide the construction of semantic negative +examples, which results in an increased emphasis on learning structured +representations. Moreover, a Knowledge-Enhance Encoder (KEE) is proposed to +leverage SGK as input to further enhance structured representations. To verify +the effectiveness of the proposed framework, we pre-train our model with the +aforementioned approaches and conduct experiments on downstream tasks. +Experimental results demonstrate that Structure-CLIP achieves state-of-the-art +(SOTA) performance on VG-Attribution and VG-Relation datasets, with 12.5% and +4.1% ahead of the multi-modal SOTA model respectively. Meanwhile, the results +on MSCOCO indicate that Structure-CLIP significantly enhances the structured +representations while maintaining the ability of general representations. Our +code will be available soon. + +
+
+ comment: Version 2.0. Improve grammar and experiments +
+
+
+
+
+ + ♻ ☆ Natural Language is All a Graph Needs + + +
+ The emergence of large-scale pre-trained language models, such as ChatGPT, +has revolutionized various research fields in artificial intelligence. +Transformers-based large language models (LLMs) have gradually replaced CNNs +and RNNs to unify fields of computer vision and natural language processing. +Compared with the data that exists relatively independently such as images, +videos or texts, graph is a type of data that contains rich structural and +relational information. Meanwhile, natural language, as one of the most +expressive mediums, excels in describing complex structures. However, existing +work on incorporating graph learning problems into the generative language +modeling framework remains very limited. As the importance of large language +models continues to grow, it becomes essential to explore whether LLMs can also +replace GNNs as the foundation model for graphs. In this paper, we propose +InstructGLM (Instruction-finetuned Graph Language Model), systematically design +highly scalable prompts based on natural language instructions, and use natural +language to describe the geometric structure and node features of the graph for +instruction tuning an LLM to perform learning and inference on graphs in a +generative manner. Our method exceeds all competitive GNN baselines on +ogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of +our method and sheds light on generative large language models as the +foundation model for graph machine learning. + +
+
+ comment: 21 pages, 2 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ A Massive Scale Semantic Similarity Dataset of Historical English + + +
+ A diversity of tasks use language models trained on semantic similarity data. +While there are a variety of datasets that capture semantic similarity, they +are either constructed from modern web data or are relatively small datasets +created in the past decade by human annotators. This study utilizes a novel +source, newly digitized articles from off-copyright, local U.S. newspapers, to +assemble a massive-scale semantic similarity dataset spanning 70 years from +1920 to 1989 and containing nearly 400M positive semantic similarity pairs. +Historically, around half of articles in U.S. local newspapers came from +newswires like the Associated Press. While local papers reproduced articles +from the newswire, they wrote their own headlines, which form abstractive +summaries of the associated articles. We associate articles and their headlines +by exploiting document layouts and language understanding. We then use deep +neural methods to detect which articles are from the same underlying source, in +the presence of substantial noise and abridgement. The headlines of reproduced +articles form positive semantic similarity pairs. The resulting publicly +available HEADLINES dataset is significantly larger than most existing semantic +similarity datasets and covers a much longer span of time. It will facilitate +the application of contrastively trained semantic similarity models to a +variety of tasks, including the study of semantic change across space and time. + +
+
+
+
+
+ + ♻ ☆ Region-Aware Pretraining for Open-Vocabulary Object Detection with + Vision Transformers CVPR 2023 + + +
+ We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a +contrastive image-text pretraining recipe to bridge the gap between image-level +pretraining and open-vocabulary object detection. At the pretraining phase, we +propose to randomly crop and resize regions of positional embeddings instead of +using the whole image positional embeddings. This better matches the use of +positional embeddings at region-level in the detection finetuning phase. In +addition, we replace the common softmax cross entropy loss in contrastive +learning with focal loss to better learn the informative yet difficult +examples. Finally, we leverage recent advances in novel object proposals to +improve open-vocabulary detection finetuning. We evaluate our full model on the +LVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer. +RO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best +existing approach by +7.8 points in addition to competitive zero-shot transfer +detection. Surprisingly, RO-ViT improves the image-level representation as well +and achieves the state of the art on 9 out of 12 metrics on COCO and Flickr +image-text retrieval benchmarks, outperforming competitive approaches with +larger models. + +
+
+ comment: CVPR 2023 Highlight (https://github.com/mcahny/rovit); adds LAION-2B + result +
+
+
+
+
+ + ♻ ☆ Can Authorship Representation Learning Capture Stylistic Features? ACL 2023 + + +
+ Automatically disentangling an author's style from the content of their +writing is a longstanding and possibly insurmountable problem in computational +linguistics. At the same time, the availability of large text corpora furnished +with author labels has recently enabled learning authorship representations in +a purely data-driven manner for authorship attribution, a task that ostensibly +depends to a greater extent on encoding writing style than encoding content. +However, success on this surrogate task does not ensure that such +representations capture writing style since authorship could also be correlated +with other latent variables, such as topic. In an effort to better understand +the nature of the information these representations convey, and specifically to +validate the hypothesis that they chiefly encode writing style, we +systematically probe these representations through a series of targeted +experiments. The results of these experiments suggest that representations +learned for the surrogate authorship prediction task are indeed sensitive to +writing style. As a consequence, authorship representations may be expected to +be robust to certain kinds of data shift, such as topic drift over time. +Additionally, our findings may open the door to downstream applications that +require stylistic representations, such as style transfer. + +
+
+ comment: appearing at TACL 2023 +
+
+
+
+
+ + ♻ ☆ Domain-specific ChatBots for Science using Embeddings + + +
+ Large language models (LLMs) have emerged as powerful machine-learning +systems capable of handling a myriad of tasks. Tuned versions of these systems +have been turned into chatbots that can respond to user queries on a vast +diversity of topics, providing informative and creative replies. However, their +application to physical science research remains limited owing to their +incomplete knowledge in these areas, contrasted with the needs of rigor and +sourcing in science domains. Here, we demonstrate how existing methods and +software tools can be easily combined to yield a domain-specific chatbot. The +system ingests scientific documents in existing formats, and uses text +embedding lookup to provide the LLM with domain-specific contextual information +when composing its reply. We similarly demonstrate that existing image +embedding methods can be used for search and retrieval across publication +figures. These results confirm that LLMs are already suitable for use by +physical scientists in accelerating their research efforts. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with + GPT and Prototype Guidance ICCV 2023 + + +
+ Understanding 3D scenes from multi-view inputs has been proven to alleviate +the view discrepancy issue in 3D visual grounding. However, existing methods +normally neglect the view cues embedded in the text modality and fail to weigh +the relative importance of different views. In this paper, we propose +ViewRefer, a multi-view framework for 3D visual grounding exploring how to +grasp the view knowledge from both text and 3D modalities. For the text branch, +ViewRefer leverages the diverse linguistic knowledge of large-scale language +models, e.g., GPT, to expand a single grounding text to multiple +geometry-consistent descriptions. Meanwhile, in the 3D modality, a transformer +fusion module with inter-view attention is introduced to boost the interaction +of objects across views. On top of that, we further present a set of learnable +multi-view prototypes, which memorize scene-agnostic knowledge for different +views, and enhance the framework from two perspectives: a view-guided attention +module for more robust text features, and a view-guided scoring strategy during +the final prediction. With our designed paradigm, ViewRefer achieves superior +performance on three benchmarks and surpasses the second-best by +2.8%, +1.5%, +and +1.35% on Sr3D, Nr3D, and ScanRefer. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Grammar-Based Grounded Lexicon Learning NeurIPS 2021 + + +
+ We present Grammar-Based Grounded Lexicon Learning (G2L2), a lexicalist +approach toward learning a compositional and grounded meaning representation of +language from grounded data, such as paired images and texts. At the core of +G2L2 is a collection of lexicon entries, which map each word to a tuple of a +syntactic type and a neuro-symbolic semantic program. For example, the word +shiny has a syntactic type of adjective; its neuro-symbolic semantic program +has the symbolic form {\lambda}x. filter(x, SHINY), where the concept SHINY is +associated with a neural network embedding, which will be used to classify +shiny objects. Given an input sentence, G2L2 first looks up the lexicon entries +associated with each token. It then derives the meaning of the sentence as an +executable neuro-symbolic program by composing lexical meanings based on +syntax. The recovered meaning programs can be executed on grounded inputs. To +facilitate learning in an exponentially-growing compositional space, we +introduce a joint parsing and expected execution algorithm, which does local +marginalization over derivations to reduce the training time. We evaluate G2L2 +on two domains: visual reasoning and language-driven navigation. Results show +that G2L2 can generalize from small amounts of data to novel compositions of +words. + +
+
+ comment: Minor typo fixes. NeurIPS 2021. Project page: + https://g2l2.csail.mit.edu/ +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 153 + +
+
+
+ + ☆ ROAM: Robust and Object-aware Motion Generation using Neural Pose + Descriptors + + +
+ Existing automatic approaches for 3D virtual character motion synthesis +supporting scene interactions do not generalise well to new objects outside +training distributions, even when trained on extensive motion capture datasets +with diverse objects and annotated interactions. This paper addresses this +limitation and shows that robustness and generalisation to novel scene objects +in 3D object-aware character synthesis can be achieved by training a motion +model with as few as one reference object. We leverage an implicit feature +representation trained on object-only datasets, which encodes an +SE(3)-equivariant descriptor field around the object. Given an unseen object +and a reference pose-object pair, we optimise for the object-aware pose that is +closest in the feature space to the reference pose. Finally, we use l-NSM, +i.e., our motion generation model that is trained to seamlessly transition from +locomotion to object interaction with the proposed bidirectional pose blending +scheme. Through comprehensive numerical comparisons to state-of-the-art methods +and in a user study, we demonstrate substantial improvements in 3D virtual +character motion and interaction quality and robustness to scenarios with +unseen objects. Our project page is available at +https://vcai.mpi-inf.mpg.de/projects/ROAM/. + +
+
+ comment: 12 pages, 10 figures; project page: + https://vcai.mpi-inf.mpg.de/projects/ROAM/ +
+
+
+
+
+ + ☆ NeO 360: Neural Fields for Sparse View Synthesis of Outdoor Scenes ICCV + + +
+ Recent implicit neural representations have shown great results for novel +view synthesis. However, existing methods require expensive per-scene +optimization from many views hence limiting their application to real-world +unbounded urban settings where the objects of interest or backgrounds are +observed from very few views. To mitigate this challenge, we introduce a new +approach called NeO 360, Neural fields for sparse view synthesis of outdoor +scenes. NeO 360 is a generalizable method that reconstructs 360{\deg} scenes +from a single or a few posed RGB images. The essence of our approach is in +capturing the distribution of complex real-world outdoor 3D scenes and using a +hybrid image-conditional triplanar representation that can be queried from any +world point. Our representation combines the best of both voxel-based and +bird's-eye-view (BEV) representations and is more effective and expressive than +each. NeO 360's representation allows us to learn from a large collection of +unbounded 3D scenes while offering generalizability to new views and novel +scenes from as few as a single image during inference. We demonstrate our +approach on the proposed challenging 360{\deg} unbounded dataset, called NeRDS +360, and show that NeO 360 outperforms state-of-the-art generalizable methods +for novel view synthesis while also offering editing and composition +capabilities. Project page: +https://zubair-irshad.github.io/projects/neo360.html + +
+
+ comment: Accepted to International Conference on Computer Vision (ICCV), 2023. + Project page: https://zubair-irshad.github.io/projects/neo360.html +
+
+
+
+
+ + ☆ Scenimefy: Learning to Craft Anime Scene via Semi-Supervised + Image-to-Image Translation ICCV 2023 + + +
+ Automatic high-quality rendering of anime scenes from complex real-world +images is of significant practical value. The challenges of this task lie in +the complexity of the scenes, the unique features of anime style, and the lack +of high-quality datasets to bridge the domain gap. Despite promising attempts, +previous efforts are still incompetent in achieving satisfactory results with +consistent semantic preservation, evident stylization, and fine details. In +this study, we propose Scenimefy, a novel semi-supervised image-to-image +translation framework that addresses these challenges. Our approach guides the +learning with structure-consistent pseudo paired data, simplifying the pure +unsupervised setting. The pseudo data are derived uniquely from a +semantic-constrained StyleGAN leveraging rich model priors like CLIP. We +further apply segmentation-guided data selection to obtain high-quality pseudo +supervision. A patch-wise contrastive style loss is introduced to improve +stylization and fine details. Besides, we contribute a high-resolution anime +scene dataset to facilitate future research. Our extensive experiments +demonstrate the superiority of our method over state-of-the-art baselines in +terms of both perceptual quality and quantitative performance. + +
+
+ comment: ICCV 2023. The first two authors contributed equally. Code: + https://github.com/Yuxinn-J/Scenimefy Project page: + https://yuxinn-j.github.io/projects/Scenimefy.html +
+
+
+
+
+ + ☆ Qwen-VL: A Frontier Large Vision-Language Model with Versatile Abilities + + +
+ We introduce the Qwen-VL series, a set of large-scale vision-language models +designed to perceive and understand both text and images. Comprising Qwen-VL +and Qwen-VL-Chat, these models exhibit remarkable performance in tasks like +image captioning, question answering, visual localization, and flexible +interaction. The evaluation covers a wide range of tasks including zero-shot +captioning, visual or document visual question answering, and grounding. We +demonstrate the Qwen-VL outperforms existing Large Vision Language Models +(LVLMs). We present their architecture, training, capabilities, and +performance, highlighting their contributions to advancing multimodal +artificial intelligence. Code, demo and models are available at +https://github.com/QwenLM/Qwen-VL. + +
+
+ comment: Code, demo and models are available at + https://github.com/QwenLM/Qwen-VL +
+
+
+
+
+ + ☆ POCO: 3D Pose and Shape Estimation with Confidence + + +
+ The regression of 3D Human Pose and Shape (HPS) from an image is becoming +increasingly accurate. This makes the results useful for downstream tasks like +human action recognition or 3D graphics. Yet, no regressor is perfect, and +accuracy can be affected by ambiguous image evidence or by poses and appearance +that are unseen during training. Most current HPS regressors, however, do not +report the confidence of their outputs, meaning that downstream tasks cannot +differentiate accurate estimates from inaccurate ones. To address this, we +develop POCO, a novel framework for training HPS regressors to estimate not +only a 3D human body, but also their confidence, in a single feed-forward pass. +Specifically, POCO estimates both the 3D body pose and a per-sample variance. +The key idea is to introduce a Dual Conditioning Strategy (DCS) for regressing +uncertainty that is highly correlated to pose reconstruction quality. The POCO +framework can be applied to any HPS regressor and here we evaluate it by +modifying HMR, PARE, and CLIFF. In all cases, training the network to reason +about uncertainty helps it learn to more accurately estimate 3D pose. While +this was not our goal, the improvement is modest but consistent. Our main +motivation is to provide uncertainty estimates for downstream tasks; we +demonstrate this in two ways: (1) We use the confidence estimates to bootstrap +HPS training. Given unlabelled image data, we take the confident estimates of a +POCO-trained regressor as pseudo ground truth. Retraining with this +automatically-curated data improves accuracy. (2) We exploit uncertainty in +video pose estimation by automatically identifying uncertain frames (e.g. due +to occlusion) and inpainting these from confident frames. Code and models will +be available for research at https://poco.is.tue.mpg.de. + +
+
+
+
+
+ + ☆ Dense Text-to-Image Generation with Attention Modulation ICCV2023 + + +
+ Existing text-to-image diffusion models struggle to synthesize realistic +images given dense captions, where each text prompt provides a detailed +description for a specific image region. To address this, we propose +DenseDiffusion, a training-free method that adapts a pre-trained text-to-image +model to handle such dense captions while offering control over the scene +layout. We first analyze the relationship between generated images' layouts and +the pre-trained model's intermediate attention maps. Next, we develop an +attention modulation method that guides objects to appear in specific regions +according to layout guidance. Without requiring additional fine-tuning or +datasets, we improve image generation performance given dense captions +regarding both automatic and human evaluation scores. In addition, we achieve +similar-quality visual results with models specifically trained with layout +conditions. + +
+
+ comment: Accepted by ICCV2023. Code and data are available at + https://github.com/naver-ai/DenseDiffusion +
+
+
+
+
+ + ☆ MapPrior: Bird's-Eye View Map Layout Estimation with Generative Models + + +
+ Despite tremendous advancements in bird's-eye view (BEV) perception, existing +models fall short in generating realistic and coherent semantic map layouts, +and they fail to account for uncertainties arising from partial sensor +information (such as occlusion or limited coverage). In this work, we introduce +MapPrior, a novel BEV perception framework that combines a traditional +discriminative BEV perception model with a learned generative model for +semantic map layouts. Our MapPrior delivers predictions with better accuracy, +realism, and uncertainty awareness. We evaluate our model on the large-scale +nuScenes benchmark. At the time of submission, MapPrior outperforms the +strongest competing method, with significantly improved MMD and ECE scores in +camera- and LiDAR-based BEV perception. + +
+
+
+
+
+ + ☆ Motion-Guided Masking for Spatiotemporal Representation Learning ICCV 2023 + + +
+ Several recent works have directly extended the image masked autoencoder +(MAE) with random masking into video domain, achieving promising results. +However, unlike images, both spatial and temporal information are important for +video understanding. This suggests that the random masking strategy that is +inherited from the image MAE is less effective for video MAE. This motivates +the design of a novel masking algorithm that can more efficiently make use of +video saliency. Specifically, we propose a motion-guided masking algorithm +(MGM) which leverages motion vectors to guide the position of each mask over +time. Crucially, these motion-based correspondences can be directly obtained +from information stored in the compressed format of the video, which makes our +method efficient and scalable. On two challenging large-scale video benchmarks +(Kinetics-400 and Something-Something V2), we equip video MAE with our MGM and +achieve up to +$1.3\%$ improvement compared to previous state-of-the-art +methods. Additionally, our MGM achieves equivalent performance to previous +video MAE using up to $66\%$ fewer training epochs. Lastly, we show that MGM +generalizes better to downstream transfer learning and domain adaptation tasks +on the UCF101, HMDB51, and Diving48 datasets, achieving up to +$4.9\%$ +improvement compared to baseline methods. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Less is More: Towards Efficient Few-shot 3D Semantic Segmentation via + Training-free Networks + + +
+ To reduce the reliance on large-scale datasets, recent works in 3D +segmentation resort to few-shot learning. Current 3D few-shot semantic +segmentation methods first pre-train the models on `seen' classes, and then +evaluate their generalization performance on `unseen' classes. However, the +prior pre-training stage not only introduces excessive time overhead, but also +incurs a significant domain gap on `unseen' classes. To tackle these issues, we +propose an efficient Training-free Few-shot 3D Segmentation netwrok, TFS3D, and +a further training-based variant, TFS3D-T. Without any learnable parameters, +TFS3D extracts dense representations by trigonometric positional encodings, and +achieves comparable performance to previous training-based methods. Due to the +elimination of pre-training, TFS3D can alleviate the domain gap issue and save +a substantial amount of time. Building upon TFS3D, TFS3D-T only requires to +train a lightweight query-support transferring attention (QUEST), which +enhances the interaction between the few-shot query and support data. +Experiments demonstrate TFS3D-T improves previous state-of-the-art methods by ++6.93% and +17.96% mIoU respectively on S3DIS and ScanNet, while reducing the +training time by -90%, indicating superior effectiveness and efficiency. + +
+
+ comment: Code is available at https://github.com/yangyangyang127/TFS3D +
+
+
+
+
+ + ☆ Towards Realistic Zero-Shot Classification via Self Structural Semantic + Alignment + + +
+ Large-scale pre-trained Vision Language Models (VLMs) have proven effective +for zero-shot classification. Despite the success, most traditional VLMs-based +methods are restricted by the assumption of partial source supervision or ideal +vocabularies, which rarely satisfy the open-world scenario. In this paper, we +aim at a more challenging setting, Realistic Zero-Shot Classification, which +assumes no annotation but instead a broad vocabulary. To address this +challenge, we propose the Self Structural Semantic Alignment (S^3A) framework, +which extracts the structural semantic information from unlabeled data while +simultaneously self-learning. Our S^3A framework adopts a unique +Cluster-Vote-Prompt-Realign (CVPR) algorithm, which iteratively groups +unlabeled data to derive structural semantics for pseudo-supervision. Our CVPR +process includes iterative clustering on images, voting within each cluster to +identify initial class candidates from the vocabulary, generating +discriminative prompts with large language models to discern confusing +candidates, and realigning images and the vocabulary as structural semantic +alignment. Finally, we propose to self-learn the CLIP image encoder with both +individual and structural semantic alignment through a teacher-student learning +strategy. Our comprehensive experiments across various generic and fine-grained +benchmarks demonstrate that the S^3A method offers substantial improvements +over existing VLMs-based approaches, achieving a more than 15% accuracy +improvement over CLIP on average. Our codes, models, and prompts are publicly +released at https://github.com/sheng-eatamath/S3A. + +
+
+ comment: submission at 24 Aug +
+
+
+
+
+ + ☆ DLIP: Distilling Language-Image Pre-training + + +
+ Vision-Language Pre-training (VLP) shows remarkable progress with the +assistance of extremely heavy parameters, which challenges deployment in real +applications. Knowledge distillation is well recognized as the essential +procedure in model compression. However, existing knowledge distillation +techniques lack an in-depth investigation and analysis of VLP, and practical +guidelines for VLP-oriented distillation are still not yet explored. In this +paper, we present DLIP, a simple yet efficient Distilling Language-Image +Pre-training framework, through which we investigate how to distill a light VLP +model. Specifically, we dissect the model distillation from multiple +dimensions, such as the architecture characteristics of different modules and +the information transfer of different modalities. We conduct comprehensive +experiments and provide insights on distilling a light but performant VLP +model. Experimental results reveal that DLIP can achieve a state-of-the-art +accuracy/efficiency trade-off across diverse cross-modal tasks, e.g., +image-text retrieval, image captioning and visual question answering. For +example, DLIP compresses BLIP by 1.9x, from 213M to 108M parameters, while +achieving comparable or better performance. Furthermore, DLIP succeeds in +retaining more than 95% of the performance with 22.4% parameters and 24.8% +FLOPs compared to the teacher model and accelerates inference speed by 2.7x. + +
+
+
+
+
+ + ☆ Label Budget Allocation in Multi-Task Learning + + +
+ The cost of labeling data often limits the performance of machine learning +systems. In multi-task learning, related tasks provide information to each +other and improve overall performance, but the label cost can vary among tasks. +How should the label budget (i.e. the amount of money spent on labeling) be +allocated among different tasks to achieve optimal multi-task performance? We +are the first to propose and formally define the label budget allocation +problem in multi-task learning and to empirically show that different budget +allocation strategies make a big difference to its performance. We propose a +Task-Adaptive Budget Allocation algorithm to robustly generate the optimal +budget allocation adaptive to different multi-task learning settings. +Specifically, we estimate and then maximize the extent of new information +obtained from the allocated budget as a proxy for multi-task learning +performance. Experiments on PASCAL VOC and Taskonomy demonstrate the efficacy +of our approach over other widely used heuristic labeling strategies. + +
+
+
+
+
+ + ☆ Perspective-aware Convolution for Monocular 3D Object Detection + + +
+ Monocular 3D object detection is a crucial and challenging task for +autonomous driving vehicle, while it uses only a single camera image to infer +3D objects in the scene. To address the difficulty of predicting depth using +only pictorial clue, we propose a novel perspective-aware convolutional layer +that captures long-range dependencies in images. By enforcing convolutional +kernels to extract features along the depth axis of every image pixel, we +incorporates perspective information into network architecture. We integrate +our perspective-aware convolutional layer into a 3D object detector and +demonstrate improved performance on the KITTI3D dataset, achieving a 23.9\% +average precision in the easy benchmark. These results underscore the +importance of modeling scene clues for accurate depth inference and highlight +the benefits of incorporating scene structure in network design. Our +perspective-aware convolutional layer has the potential to enhance object +detection accuracy by providing more precise and context-aware feature +extraction. + +
+
+
+
+
+ + ☆ Panoptic-Depth Color Map for Combination of Depth and Image Segmentation + + +
+ Image segmentation and depth estimation are crucial tasks in computer vision, +especially in autonomous driving scenarios. Although these tasks are typically +addressed separately, we propose an innovative approach to combine them in our +novel deep learning network, Panoptic-DepthLab. By incorporating an additional +depth estimation branch into the segmentation network, it can predict the depth +of each instance segment. Evaluating on Cityscape dataset, we demonstrate the +effectiveness of our method in achieving high-quality segmentation results with +depth and visualize it with a color map. Our proposed method demonstrates a new +possibility of combining different tasks and networks to generate a more +comprehensive image recognition result to facilitate the safety of autonomous +driving vehicles. + +
+
+
+
+
+ + ☆ Towards Realistic Unsupervised Fine-tuning with CLIP + + +
+ The emergence of vision-language models (VLMs), such as CLIP, has spurred a +significant research effort towards their application for downstream supervised +learning tasks. Although some previous studies have explored the unsupervised +fine-tuning of CLIP, they often rely on prior knowledge in the form of class +names associated with ground truth labels. In this paper, we delve into a +realistic unsupervised fine-tuning scenario by assuming that the unlabeled data +might contain out-of-distribution samples from unknown classes. Furthermore, we +emphasize the importance of simultaneously enhancing out-of-distribution +detection capabilities alongside the recognition of instances associated with +predefined class labels. + To tackle this problem, we present a simple, efficient, and effective +fine-tuning approach called Universal Entropy Optimization (UEO). UEO leverages +sample-level confidence to approximately minimize the conditional entropy of +confident instances and maximize the marginal entropy of less confident +instances. Apart from optimizing the textual prompts, UEO also incorporates +optimization of channel-wise affine transformations within the visual branch of +CLIP. Through extensive experiments conducted across 15 domains and 4 different +types of prior knowledge, we demonstrate that UEO surpasses baseline methods in +terms of both generalization and out-of-distribution detection. + +
+
+
+
+
+ + ☆ Robot Pose Nowcasting: Forecast the Future to Improve the Present + + +
+ In recent years, the effective and safe collaboration between humans and +machines has gained significant importance, particularly in the Industry 4.0 +scenario. A critical prerequisite for realizing this collaborative paradigm is +precisely understanding the robot's 3D pose within its environment. Therefore, +in this paper, we introduce a novel vision-based system leveraging depth data +to accurately establish the 3D locations of robotic joints. Specifically, we +prove the ability of the proposed system to enhance its current pose estimation +accuracy by jointly learning to forecast future poses. Indeed, we introduce the +concept of Pose Nowcasting, denoting the capability of a system to exploit the +learned knowledge of the future to improve the estimation of the present. The +experimental evaluation is conducted on two different datasets, providing +state-of-the-art and real-time performance and confirming the validity of the +proposed method on both the robotic and human scenarios. + +
+
+
+
+
+ + ☆ SCoRD: Subject-Conditional Relation Detection with Text-Augmented Data + + +
+ We propose Subject-Conditional Relation Detection SCoRD, where conditioned on +an input subject, the goal is to predict all its relations to other objects in +a scene along with their locations. Based on the Open Images dataset, we +propose a challenging OIv6-SCoRD benchmark such that the training and testing +splits have a distribution shift in terms of the occurrence statistics of +$\langle$subject, relation, object$\rangle$ triplets. To solve this problem, we +propose an auto-regressive model that given a subject, it predicts its +relations, objects, and object locations by casting this output as a sequence +of tokens. First, we show that previous scene-graph prediction methods fail to +produce as exhaustive an enumeration of relation-object pairs when conditioned +on a subject on this benchmark. Particularly, we obtain a recall@3 of 83.8% for +our relation-object predictions compared to the 49.75% obtained by a recent +scene graph detector. Then, we show improved generalization on both +relation-object and object-box predictions by leveraging during training +relation-object pairs obtained automatically from textual captions and for +which no object-box annotations are available. Particularly, for +$\langle$subject, relation, object$\rangle$ triplets for which no object +locations are available during training, we are able to obtain a recall@3 of +42.59% for relation-object pairs and 32.27% for their box locations. + +
+
+
+
+
+ + ☆ CDAN: Convolutional Dense Attention-guided Network for Low-light Image + Enhancement + + +
+ Low-light images, characterized by inadequate illumination, pose challenges +of diminished clarity, muted colors, and reduced details. Low-light image +enhancement, an essential task in computer vision, aims to rectify these issues +by improving brightness, contrast, and overall perceptual quality, thereby +facilitating accurate analysis and interpretation. This paper introduces the +Convolutional Dense Attention-guided Network (CDAN), a novel solution for +enhancing low-light images. CDAN integrates an autoencoder-based architecture +with convolutional and dense blocks, complemented by an attention mechanism and +skip connections. This architecture ensures efficient information propagation +and feature learning. Furthermore, a dedicated post-processing phase refines +color balance and contrast. Our approach demonstrates notable progress compared +to state-of-the-art results in low-light image enhancement, showcasing its +robustness across a wide range of challenging scenarios. Our model performs +remarkably on benchmark datasets, effectively mitigating under-exposure and +proficiently restoring textures and colors in diverse low-light scenarios. This +achievement underscores CDAN's potential for diverse computer vision tasks, +notably enabling robust object detection and recognition in challenging +low-light conditions. + +
+
+ comment: 18 pages, 13 figures +
+
+
+
+
+ + ☆ Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language + Pretraining? + + +
+ The multimedia community has shown a significant interest in perceiving and +representing the physical world with multimodal pretrained neural network +models, and among them, the visual-language pertaining (VLP) is, currently, the +most captivating topic. However, there have been few endeavors dedicated to the +exploration of 1) whether essential linguistic knowledge (e.g., semantics and +syntax) can be extracted during VLP, and 2) how such linguistic knowledge +impact or enhance the multimodal alignment. In response, here we aim to +elucidate the impact of comprehensive linguistic knowledge, including semantic +expression and syntactic structure, on multimodal alignment. Specifically, we +design and release the SNARE, the first large-scale multimodal alignment +probing benchmark, to detect the vital linguistic components, e.g., lexical, +semantic, and syntax knowledge, containing four tasks: Semantic structure, +Negation logic, Attribute ownership, and Relationship composition. Based on our +proposed probing benchmarks, our holistic analyses of five advanced VLP models +illustrate that the VLP model: i) shows insensitivity towards complex syntax +structures and relies on content words for sentence comprehension; ii) +demonstrates limited comprehension of combinations between sentences and +negations; iii) faces challenges in determining the presence of actions or +spatial relationships within visual information and struggles with verifying +the correctness of triple combinations. We make our benchmark and code +available at \url{https://github.com/WangFei-2019/SNARE/}. + +
+
+
+
+
+ + ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking +closer to real-world applications, both in the nature of data tested ($X$: +multi-channel, multi-paged, multi-industry; $Y$: class distributions and label +set variety) and in classification tasks considered ($f$: multi-page document, +page stream, and document bundle classification, ...). We identify the lack of +public multi-page document classification datasets, formalize different +classification tasks arising in application scenarios, and motivate the value +of targeting efficient multi-page document representations. An experimental +study on proposed multi-page document classification datasets demonstrates that +current benchmarks have become irrelevant and need to be updated to evaluate +complete documents, as they naturally occur in practice. This reality check +also calls for more mature evaluation methodologies, covering calibration +evaluation, inference complexity (time-memory), and a range of realistic +distribution shifts (e.g., born-digital vs. scanning noise, shifting page +order). Our study ends on a hopeful note by recommending concrete avenues for +future improvements.} + +
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ☆ Boosting Semantic Segmentation from the Perspective of Explicit Class + Embeddings + + +
+ Semantic segmentation is a computer vision task that associates a label with +each pixel in an image. Modern approaches tend to introduce class embeddings +into semantic segmentation for deeply utilizing category semantics, and regard +supervised class masks as final predictions. In this paper, we explore the +mechanism of class embeddings and have an insight that more explicit and +meaningful class embeddings can be generated based on class masks purposely. +Following this observation, we propose ECENet, a new segmentation paradigm, in +which class embeddings are obtained and enhanced explicitly during interacting +with multi-stage image features. Based on this, we revisit the traditional +decoding process and explore inverted information flow between segmentation +masks and class embeddings. Furthermore, to ensure the discriminability and +informativity of features from backbone, we propose a Feature Reconstruction +module, which combines intrinsic and diverse branches together to ensure the +concurrence of diversity and redundancy in features. Experiments show that our +ECENet outperforms its counterparts on the ADE20K dataset with much less +computational cost and achieves new state-of-the-art results on PASCAL-Context +dataset. The code will be released at https://gitee.com/mindspore/models and +https://github.com/Carol-lyh/ECENet. + +
+
+
+
+
+ + ☆ Multi-stage feature decorrelation constraints for improving CNN + classification performance + + +
+ For the convolutional neural network (CNN) used for pattern classification, +the training loss function is usually applied to the final output of the +network, except for some regularization constraints on the network parameters. +However, with the increasing of the number of network layers, the influence of +the loss function on the network front layers gradually decreases, and the +network parameters tend to fall into local optimization. At the same time, it +is found that the trained network has significant information redundancy at all +stages of features, which reduces the effectiveness of feature mapping at all +stages and is not conducive to the change of the subsequent parameters of the +network in the direction of optimality. Therefore, it is possible to obtain a +more optimized solution of the network and further improve the classification +accuracy of the network by designing a loss function for restraining the front +stage features and eliminating the information redundancy of the front stage +features .For CNN, this article proposes a multi-stage feature decorrelation +loss (MFD Loss), which refines effective features and eliminates information +redundancy by constraining the correlation of features at all stages. +Considering that there are many layers in CNN, through experimental comparison +and analysis, MFD Loss acts on multiple front layers of CNN, constrains the +output features of each layer and each channel, and performs supervision +training jointly with classification loss function during network training. +Compared with the single Softmax Loss supervised learning, the experiments on +several commonly used datasets on several typical CNNs prove that the +classification performance of Softmax Loss+MFD Loss is significantly better. +Meanwhile, the comparison experiments before and after the combination of MFD +Loss and some other typical loss functions verify its good universality. + +
+
+
+
+
+ + ☆ VNI-Net: Vector Neurons-based Rotation-Invariant Descriptor for LiDAR + Place Recognition + + +
+ LiDAR-based place recognition plays a crucial role in Simultaneous +Localization and Mapping (SLAM) and LiDAR localization. + Despite the emergence of various deep learning-based and hand-crafting-based +methods, rotation-induced place recognition failure remains a critical +challenge. + Existing studies address this limitation through specific training strategies +or network structures. + However, the former does not produce satisfactory results, while the latter +focuses mainly on the reduced problem of SO(2) rotation invariance. Methods +targeting SO(3) rotation invariance suffer from limitations in discrimination +capability. + In this paper, we propose a new method that employs Vector Neurons Network +(VNN) to achieve SO(3) rotation invariance. + We first extract rotation-equivariant features from neighboring points and +map low-dimensional features to a high-dimensional space through VNN. + Afterwards, we calculate the Euclidean and Cosine distance in the +rotation-equivariant feature space as rotation-invariant feature descriptors. + Finally, we aggregate the features using GeM pooling to obtain global +descriptors. + To address the significant information loss when formulating +rotation-invariant descriptors, we propose computing distances between features +at different layers within the Euclidean space neighborhood. + This greatly improves the discriminability of the point cloud descriptors +while ensuring computational efficiency. + Experimental results on public datasets show that our approach significantly +outperforms other baseline methods implementing rotation invariance, while +achieving comparable results with current state-of-the-art place recognition +methods that do not consider rotation issues. + +
+
+
+
+
+ + ☆ ToonTalker: Cross-Domain Face Reenactment + + +
+ We target cross-domain face reenactment in this paper, i.e., driving a +cartoon image with the video of a real person and vice versa. Recently, many +works have focused on one-shot talking face generation to drive a portrait with +a real video, i.e., within-domain reenactment. Straightforwardly applying those +methods to cross-domain animation will cause inaccurate expression transfer, +blur effects, and even apparent artifacts due to the domain shift between +cartoon and real faces. Only a few works attempt to settle cross-domain face +reenactment. The most related work AnimeCeleb requires constructing a dataset +with pose vector and cartoon image pairs by animating 3D characters, which +makes it inapplicable anymore if no paired data is available. In this paper, we +propose a novel method for cross-domain reenactment without paired data. +Specifically, we propose a transformer-based framework to align the motions +from different domains into a common latent space where motion transfer is +conducted via latent code addition. Two domain-specific motion encoders and two +learnable motion base memories are used to capture domain properties. A source +query transformer and a driving one are exploited to project domain-specific +motion to the canonical space. The edited motion is projected back to the +domain of the source with a transformer. Moreover, since no paired data is +provided, we propose a novel cross-domain training scheme using data from two +domains with the designed analogy constraint. Besides, we contribute a cartoon +dataset in Disney style. Extensive evaluations demonstrate the superiority of +our method over competing methods. + +
+
+
+
+
+ + ☆ SkipcrossNets: Adaptive Skip-cross Fusion for Road Detection + + +
+ Multi-modal fusion is increasingly being used for autonomous driving tasks, +as images from different modalities provide unique information for feature +extraction. However, the existing two-stream networks are only fused at a +specific network layer, which requires a lot of manual attempts to set up. As +the CNN goes deeper, the two modal features become more and more advanced and +abstract, and the fusion occurs at the feature level with a large gap, which +can easily hurt the performance. In this study, we propose a novel fusion +architecture called skip-cross networks (SkipcrossNets), which combines +adaptively LiDAR point clouds and camera images without being bound to a +certain fusion epoch. Specifically, skip-cross connects each layer to each +layer in a feed-forward manner, and for each layer, the feature maps of all +previous layers are used as input and its own feature maps are used as input to +all subsequent layers for the other modality, enhancing feature propagation and +multi-modal features fusion. This strategy facilitates selection of the most +similar feature layers from two data pipelines, providing a complementary +effect for sparse point cloud features during fusion processes. The network is +also divided into several blocks to reduce the complexity of feature fusion and +the number of model parameters. The advantages of skip-cross fusion were +demonstrated through application to the KITTI and A2D2 datasets, achieving a +MaxF score of 96.85% on KITTI and an F1 score of 84.84% on A2D2. The model +parameters required only 2.33 MB of memory at a speed of 68.24 FPS, which could +be viable for mobile terminals and embedded devices. + +
+
+
+
+
+ + ☆ Learned Local Attention Maps for Synthesising Vessel Segmentations + + +
+ Magnetic resonance angiography (MRA) is an imaging modality for visualising +blood vessels. It is useful for several diagnostic applications and for +assessing the risk of adverse events such as haemorrhagic stroke (resulting +from the rupture of aneurysms in blood vessels). However, MRAs are not acquired +routinely, hence, an approach to synthesise blood vessel segmentations from +more routinely acquired MR contrasts such as T1 and T2, would be useful. We +present an encoder-decoder model for synthesising segmentations of the main +cerebral arteries in the circle of Willis (CoW) from only T2 MRI. We propose a +two-phase multi-objective learning approach, which captures both global and +local features. It uses learned local attention maps generated by dilating the +segmentation labels, which forces the network to only extract information from +the T2 MRI relevant to synthesising the CoW. Our synthetic vessel segmentations +generated from only T2 MRI achieved a mean Dice score of $0.79 \pm 0.03$ in +testing, compared to state-of-the-art segmentation networks such as transformer +U-Net ($0.71 \pm 0.04$) and nnU-net($0.68 \pm 0.05$), while using only a +fraction of the parameters. The main qualitative difference between our +synthetic vessel segmentations and the comparative models was in the sharper +resolution of the CoW vessel segments, especially in the posterior circulation. + +
+
+
+
+
+ + ☆ Implicit Obstacle Map-driven Indoor Navigation Model for Robust Obstacle + Avoidance ACM MM 2023 + + +
+ Robust obstacle avoidance is one of the critical steps for successful +goal-driven indoor navigation tasks.Due to the obstacle missing in the visual +image and the possible missed detection issue, visual image-based obstacle +avoidance techniques still suffer from unsatisfactory robustness. To mitigate +it, in this paper, we propose a novel implicit obstacle map-driven indoor +navigation framework for robust obstacle avoidance, where an implicit obstacle +map is learned based on the historical trial-and-error experience rather than +the visual image. In order to further improve the navigation efficiency, a +non-local target memory aggregation module is designed to leverage a non-local +network to model the intrinsic relationship between the target semantic and the +target orientation clues during the navigation process so as to mine the most +target-correlated object clues for the navigation decision. Extensive +experimental results on AI2-Thor and RoboTHOR benchmarks verify the excellent +obstacle avoidance and navigation efficiency of our proposed method. The core +source code is available at https://github.com/xwaiyy123/object-navigation. + +
+
+ comment: 9 pages, 7 figures, 43 references. This paper has been accepted for + ACM MM 2023 +
+
+
+
+
+ + ☆ FaceTouch: Detecting hand-to-face touch with supervised contrastive + learning to assist in tracing infectious disease + + +
+ Through our respiratory system, many viruses and diseases frequently spread +and pass from one person to another. Covid-19 served as an example of how +crucial it is to track down and cut back on contacts to stop its spread. There +is a clear gap in finding automatic methods that can detect hand-to-face +contact in complex urban scenes or indoors. In this paper, we introduce a +computer vision framework, called FaceTouch, based on deep learning. It +comprises deep sub-models to detect humans and analyse their actions. FaceTouch +seeks to detect hand-to-face touches in the wild, such as through video chats, +bus footage, or CCTV feeds. Despite partial occlusion of faces, the introduced +system learns to detect face touches from the RGB representation of a given +scene by utilising the representation of the body gestures such as arm +movement. This has been demonstrated to be useful in complex urban scenarios +beyond simply identifying hand movement and its closeness to faces. Relying on +Supervised Contrastive Learning, the introduced model is trained on our +collected dataset, given the absence of other benchmark datasets. The framework +shows a strong validation in unseen datasets which opens the door for potential +deployment. + +
+
+ comment: Set to be published in the PLoS ONE Journal +
+
+
+
+
+ + ☆ EFormer: Enhanced Transformer towards Semantic-Contour Features of + Foreground for Portraits Matting + + +
+ The portrait matting task aims to extract an alpha matte with complete +semantics and finely-detailed contours. In comparison to CNN-based approaches, +transformers with self-attention allow a larger receptive field, enabling it to +better capture long-range dependencies and low-frequency semantic information +of a portrait. However, the recent research shows that self-attention mechanism +struggle with modeling high-frequency information and capturing fine contour +details, which can lead to bias while predicting the portrait's contours. To +address the problem, we propose EFormer to enhance the model's attention +towards semantic and contour features. Especially the latter, which is +surrounded by a large amount of high-frequency details. We build a semantic and +contour detector (SCD) to accurately capture the distribution of semantic and +contour features. And we further design contour-edge extraction branch and +semantic extraction branch for refining contour features and complete semantic +information. Finally, we fuse the two kinds of features and leverage the +segmentation head to generate the predicted portrait matte. Remarkably, EFormer +is an end-to-end trimap-free method and boasts a simple structure. Experiments +conducted on VideoMatte240K-JPEGSD and AIM datasets demonstrate that EFormer +outperforms previous portrait matte methods. + +
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ☆ Robotic Scene Segmentation with Memory Network for Runtime Surgical + Context Inference IROS + + +
+ Surgical context inference has recently garnered significant attention in +robot-assisted surgery as it can facilitate workflow analysis, skill +assessment, and error detection. However, runtime context inference is +challenging since it requires timely and accurate detection of the interactions +among the tools and objects in the surgical scene based on the segmentation of +video data. On the other hand, existing state-of-the-art video segmentation +methods are often biased against infrequent classes and fail to provide +temporal consistency for segmented masks. This can negatively impact the +context inference and accurate detection of critical states. In this study, we +propose a solution to these challenges using a Space Time Correspondence +Network (STCN). STCN is a memory network that performs binary segmentation and +minimizes the effects of class imbalance. The use of a memory bank in STCN +allows for the utilization of past image and segmentation information, thereby +ensuring consistency of the masks. Our experiments using the publicly available +JIGSAWS dataset demonstrate that STCN achieves superior segmentation +performance for objects that are difficult to segment, such as needle and +thread, and improves context inference compared to the state-of-the-art. We +also demonstrate that segmentation and context inference can be performed at +runtime without compromising performance. + +
+
+ comment: accepted at The IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS) 2023 +
+
+
+
+
+ + ☆ On Offline Evaluation of 3D Object Detection for Autonomous Driving ICCV'23 + + +
+ Prior work in 3D object detection evaluates models using offline metrics like +average precision since closed-loop online evaluation on the downstream driving +task is costly. However, it is unclear how indicative offline results are of +driving performance. In this work, we perform the first empirical evaluation +measuring how predictive different detection metrics are of driving performance +when detectors are integrated into a full self-driving stack. We conduct +extensive experiments on urban driving in the CARLA simulator using 16 object +detection models. We find that the nuScenes Detection Score has a higher +correlation to driving performance than the widely used average precision +metric. In addition, our results call for caution on the exclusive reliance on +the emerging class of `planner-centric' metrics. + +
+
+ comment: Appears in: IEEE International Conference on Computer Vision + (ICCV'23) Workshops +
+
+
+
+
+ + ☆ LISTER: Neighbor Decoding for Length-Insensitive Scene Text Recognition ICCV 2023 + + +
+ The diversity in length constitutes a significant characteristic of text. Due +to the long-tail distribution of text lengths, most existing methods for scene +text recognition (STR) only work well on short or seen-length text, lacking the +capability of recognizing longer text or performing length extrapolation. This +is a crucial issue, since the lengths of the text to be recognized are usually +not given in advance in real-world applications, but it has not been adequately +investigated in previous works. Therefore, we propose in this paper a method +called Length-Insensitive Scene TExt Recognizer (LISTER), which remedies the +limitation regarding the robustness to various text lengths. Specifically, a +Neighbor Decoder is proposed to obtain accurate character attention maps with +the assistance of a novel neighbor matrix regardless of the text lengths. +Besides, a Feature Enhancement Module is devised to model the long-range +dependency with low computation cost, which is able to perform iterations with +the neighbor decoder to enhance the feature map progressively. To the best of +our knowledge, we are the first to achieve effective length-insensitive scene +text recognition. Extensive experiments demonstrate that the proposed LISTER +algorithm exhibits obvious superiority on long text recognition and the ability +for length extrapolation, while comparing favourably with the previous +state-of-the-art methods on standard benchmarks for STR (mainly short text). + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ IP-UNet: Intensity Projection UNet Architecture for 3D Medical Volume + Segmentation + + +
+ CNNs have been widely applied for medical image analysis. However, limited +memory capacity is one of the most common drawbacks of processing +high-resolution 3D volumetric data. 3D volumes are usually cropped or downsized +first before processing, which can result in a loss of resolution, increase +class imbalance, and affect the performance of the segmentation algorithms. In +this paper, we propose an end-to-end deep learning approach called IP-UNet. +IP-UNet is a UNet-based model that performs multi-class segmentation on +Intensity Projection (IP) of 3D volumetric data instead of the memory-consuming +3D volumes. IP-UNet uses limited memory capability for training without losing +the original 3D image resolution. We compare the performance of three models in +terms of segmentation accuracy and computational cost: 1) Slice-by-slice 2D +segmentation of the CT scan images using a conventional 2D UNet model. 2) +IP-UNet that operates on data obtained by merging the extracted Maximum +Intensity Projection (MIP), Closest Vessel Projection (CVP), and Average +Intensity Projection (AvgIP) representations of the source 3D volumes, then +applying the UNet model on the output IP images. 3) 3D-UNet model directly +reads the 3D volumes constructed from a series of CT scan images and outputs +the 3D volume of the predicted segmentation. We test the performance of these +methods on 3D volumetric images for automatic breast calcification detection. +Experimental results show that IP-Unet can achieve similar segmentation +accuracy with 3D-Unet but with much better performance. It reduces the training +time by 70\% and memory consumption by 92\%. + +
+
+
+
+
+ + ☆ PartSeg: Few-shot Part Segmentation via Part-aware Prompt Learning + + +
+ In this work, we address the task of few-shot part segmentation, which aims +to segment the different parts of an unseen object using very few labeled +examples. It is found that leveraging the textual space of a powerful +pre-trained image-language model (such as CLIP) can be beneficial in learning +visual features. Therefore, we develop a novel method termed PartSeg for +few-shot part segmentation based on multimodal learning. Specifically, we +design a part-aware prompt learning method to generate part-specific prompts +that enable the CLIP model to better understand the concept of ``part'' and +fully utilize its textual space. Furthermore, since the concept of the same +part under different object categories is general, we establish relationships +between these parts during the prompt learning process. We conduct extensive +experiments on the PartImageNet and Pascal$\_$Part datasets, and the +experimental results demonstrated that our proposed method achieves +state-of-the-art performance. + +
+
+
+
+
+ + ☆ Learning Heavily-Degraded Prior for Underwater Object Detection + + +
+ Underwater object detection suffers from low detection performance because +the distance and wavelength dependent imaging process yield evident image +quality degradations such as haze-like effects, low visibility, and color +distortions. Therefore, we commit to resolving the issue of underwater object +detection with compounded environmental degradations. Typical approaches +attempt to develop sophisticated deep architecture to generate high-quality +images or features. However, these methods are only work for limited ranges +because imaging factors are either unstable, too sensitive, or compounded. +Unlike these approaches catering for high-quality images or features, this +paper seeks transferable prior knowledge from detector-friendly images. The +prior guides detectors removing degradations that interfere with detection. It +is based on statistical observations that, the heavily degraded regions of +detector-friendly (DFUI) and underwater images have evident feature +distribution gaps while the lightly degraded regions of them overlap each +other. Therefore, we propose a residual feature transference module (RFTM) to +learn a mapping between deep representations of the heavily degraded patches of +DFUI- and underwater- images, and make the mapping as a heavily degraded prior +(HDP) for underwater detection. Since the statistical properties are +independent to image content, HDP can be learned without the supervision of +semantic labels and plugged into popular CNNbased feature extraction networks +to improve their performance on underwater object detection. Without bells and +whistles, evaluations on URPC2020 and UODD show that our methods outperform +CNN-based detectors by a large margin. Our method with higher speeds and less +parameters still performs better than transformer-based detectors. Our code and +DFUI dataset can be found in +https://github.com/xiaoDetection/Learning-Heavily-Degraed-Prior. + +
+
+
+
+
+ + ☆ Asymmetric Co-Training with Explainable Cell Graph Ensembling for + Histopathological Image Classification + + +
+ Convolutional neural networks excel in histopathological image +classification, yet their pixel-level focus hampers explainability. Conversely, +emerging graph convolutional networks spotlight cell-level features and medical +implications. However, limited by their shallowness and suboptimal use of +high-dimensional pixel data, GCNs underperform in multi-class histopathological +image classification. To make full use of pixel-level and cell-level features +dynamically, we propose an asymmetric co-training framework combining a deep +graph convolutional network and a convolutional neural network for multi-class +histopathological image classification. To improve the explainability of the +entire framework by embedding morphological and topological distribution of +cells, we build a 14-layer deep graph convolutional network to handle cell +graph data. For the further utilization and dynamic interactions between +pixel-level and cell-level information, we also design a co-training strategy +to integrate the two asymmetric branches. Notably, we collect a private +clinically acquired dataset termed LUAD7C, including seven subtypes of lung +adenocarcinoma, which is rare and more challenging. We evaluated our approach +on the private LUAD7C and public colorectal cancer datasets, showcasing its +superior performance, explainability, and generalizability in multi-class +histopathological image classification. + +
+
+
+
+
+ + ☆ FastSurfer-HypVINN: Automated sub-segmentation of the hypothalamus and + adjacent structures on high-resolutional brain MRI + + +
+ The hypothalamus plays a crucial role in the regulation of a broad range of +physiological, behavioural, and cognitive functions. However, despite its +importance, only a few small-scale neuroimaging studies have investigated its +substructures, likely due to the lack of fully automated segmentation tools to +address scalability and reproducibility issues of manual segmentation. While +the only previous attempt to automatically sub-segment the hypothalamus with a +neural network showed promise for 1.0 mm isotropic T1-weighted (T1w) MRI, there +is a need for an automated tool to sub-segment also high-resolutional (HiRes) +MR scans, as they are becoming widely available, and include structural detail +also from multi-modal MRI. We, therefore, introduce a novel, fast, and fully +automated deep learning method named HypVINN for sub-segmentation of the +hypothalamus and adjacent structures on 0.8 mm isotropic T1w and T2w brain MR +images that is robust to missing modalities. We extensively validate our model +with respect to segmentation accuracy, generalizability, in-session test-retest +reliability, and sensitivity to replicate hypothalamic volume effects (e.g. +sex-differences). The proposed method exhibits high segmentation performance +both for standalone T1w images as well as for T1w/T2w image pairs. Even with +the additional capability to accept flexible inputs, our model matches or +exceeds the performance of state-of-the-art methods with fixed inputs. We, +further, demonstrate the generalizability of our method in experiments with 1.0 +mm MR scans from both the Rhineland Study and the UK Biobank. Finally, HypVINN +can perform the segmentation in less than a minute (GPU) and will be available +in the open source FastSurfer neuroimaging software suite, offering a +validated, efficient, and scalable solution for evaluating imaging-derived +phenotypes of the hypothalamus. + +
+
+ comment: Submitted to Imaging Neuroscience +
+
+
+
+
+ + ☆ DeepLOC: Deep Learning-based Bone Pathology Localization and + Classification in Wrist X-ray Images + + +
+ In recent years, computer-aided diagnosis systems have shown great potential +in assisting radiologists with accurate and efficient medical image analysis. +This paper presents a novel approach for bone pathology localization and +classification in wrist X-ray images using a combination of YOLO (You Only Look +Once) and the Shifted Window Transformer (Swin) with a newly proposed block. +The proposed methodology addresses two critical challenges in wrist X-ray +analysis: accurate localization of bone pathologies and precise classification +of abnormalities. The YOLO framework is employed to detect and localize bone +pathologies, leveraging its real-time object detection capabilities. +Additionally, the Swin, a transformer-based module, is utilized to extract +contextual information from the localized regions of interest (ROIs) for +accurate classification. + +
+
+ comment: AIST-2023 accepted paper +
+
+
+
+
+ + ☆ VIGC: Visual Instruction Generation and Correction + + +
+ The integration of visual encoders and large language models (LLMs) has +driven recent progress in multimodal large language models (MLLMs). However, +the scarcity of high-quality instruction-tuning data for vision-language tasks +remains a challenge. The current leading paradigm, such as LLaVA, relies on +language-only GPT-4 to generate data, which requires pre-annotated image +captions and detection bounding boxes, suffering from understanding image +details. A practical solution to this problem would be to utilize the available +multimodal large language models (MLLMs) to generate instruction data for +vision-language tasks. However, it's worth noting that the currently accessible +MLLMs are not as powerful as their LLM counterparts, as they tend to produce +inadequate responses and generate false information. As a solution for +addressing the current issue, this paper proposes the Visual Instruction +Generation and Correction (VIGC) framework that enables multimodal large +language models to generate instruction-tuning data and progressively enhance +its quality on-the-fly. Specifically, Visual Instruction Generation (VIG) +guides the vision-language model to generate diverse instruction-tuning data. +To ensure generation quality, Visual Instruction Correction (VIC) adopts an +iterative update mechanism to correct any inaccuracies in data produced by VIG, +effectively reducing the risk of hallucination. Leveraging the diverse, +high-quality data generated by VIGC, we finetune mainstream models and validate +data quality based on various evaluations. Experimental results demonstrate +that VIGC not only compensates for the shortcomings of language-only data +generation methods, but also effectively enhances the benchmark performance. +The models, datasets, and code will be made publicly available. + +
+
+
+
+
+ + ☆ Ground-to-Aerial Person Search: Benchmark Dataset and Approach ACM MM 2023 + + +
+ In this work, we construct a large-scale dataset for Ground-to-Aerial Person +Search, named G2APS, which contains 31,770 images of 260,559 annotated bounding +boxes for 2,644 identities appearing in both of the UAVs and ground +surveillance cameras. To our knowledge, this is the first dataset for +cross-platform intelligent surveillance applications, where the UAVs could work +as a powerful complement for the ground surveillance cameras. To more +realistically simulate the actual cross-platform Ground-to-Aerial surveillance +scenarios, the surveillance cameras are fixed about 2 meters above the ground, +while the UAVs capture videos of persons at different location, with a variety +of view-angles, flight attitudes and flight modes. Therefore, the dataset has +the following unique characteristics: 1) drastic view-angle changes between +query and gallery person images from cross-platform cameras; 2) diverse +resolutions, poses and views of the person images under 9 rich real-world +scenarios. On basis of the G2APS benchmark dataset, we demonstrate detailed +analysis about current two-step and end-to-end person search methods, and +further propose a simple yet effective knowledge distillation scheme on the +head of the ReID network, which achieves state-of-the-art performances on both +of the G2APS and the previous two public person search datasets, i.e., PRW and +CUHK-SYSU. The dataset and source code available on +\url{https://github.com/yqc123456/HKD_for_person_search}. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ A Parse-Then-Place Approach for Generating Graphic Layouts from Textual + Descriptions ICCV2023 + + +
+ Creating layouts is a fundamental step in graphic design. In this work, we +propose to use text as the guidance to create graphic layouts, i.e., +Text-to-Layout, aiming to lower the design barriers. Text-to-Layout is a +challenging task, because it needs to consider the implicit, combined, and +incomplete layout constraints from text, each of which has not been studied in +previous work. To address this, we present a two-stage approach, named +parse-then-place. The approach introduces an intermediate representation (IR) +between text and layout to represent diverse layout constraints. With IR, +Text-to-Layout is decomposed into a parse stage and a place stage. The parse +stage takes a textual description as input and generates an IR, in which the +implicit constraints from the text are transformed into explicit ones. The +place stage generates layouts based on the IR. To model combined and incomplete +constraints, we use a Transformer-based layout generation model and carefully +design a way to represent constraints and layouts as sequences. Besides, we +adopt the pretrain-then-finetune strategy to boost the performance of the +layout generation model with large-scale unlabeled layouts. To evaluate our +approach, we construct two Text-to-Layout datasets and conduct experiments on +them. Quantitative results, qualitative analysis, and user studies demonstrate +the effectiveness of our approach. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ A Continual Learning Approach for Cross-Domain White Blood Cell + Classification MICCAI 2023 + + +
+ Accurate classification of white blood cells in peripheral blood is essential +for diagnosing hematological diseases. Due to constantly evolving clinical +settings, data sources, and disease classifications, it is necessary to update +machine learning classification models regularly for practical real-world use. +Such models significantly benefit from sequentially learning from incoming data +streams without forgetting previously acquired knowledge. However, models can +suffer from catastrophic forgetting, causing a drop in performance on previous +tasks when fine-tuned on new data. Here, we propose a rehearsal-based continual +learning approach for class incremental and domain incremental scenarios in +white blood cell classification. To choose representative samples from previous +tasks, we employ exemplar set selection based on the model's predictions. This +involves selecting the most confident samples and the most challenging samples +identified through uncertainty estimation of the model. We thoroughly evaluated +our proposed approach on three white blood cell classification datasets that +differ in color, resolution, and class composition, including scenarios where +new domains or new classes are introduced to the model with every task. We also +test a long class incremental experiment with both new domains and new classes. +Our results demonstrate that our approach outperforms established baselines in +continual learning, including existing iCaRL and EWC methods for classifying +white blood cells in cross-domain environments. + +
+
+ comment: Accepted for publication at workshop on Domain Adaptation and + Representation Transfer (DART) in International Conference on Medical Image + Computing and Computer Assisted Intervention (MICCAI 2023) +
+
+
+
+
+ + ☆ A Study of Age and Sex Bias in Multiple Instance Learning based + Classification of Acute Myeloid Leukemia Subtypes MICCAI 2023 + + +
+ Accurate classification of Acute Myeloid Leukemia (AML) subtypes is crucial +for clinical decision-making and patient care. In this study, we investigate +the potential presence of age and sex bias in AML subtype classification using +Multiple Instance Learning (MIL) architectures. To that end, we train multiple +MIL models using different levels of sex imbalance in the training set and +excluding certain age groups. To assess the sex bias, we evaluate the +performance of the models on male and female test sets. For age bias, models +are tested against underrepresented age groups in the training data. We find a +significant effect of sex and age bias on the performance of the model for AML +subtype classification. Specifically, we observe that females are more likely +to be affected by sex imbalance dataset and certain age groups, such as +patients with 72 to 86 years of age with the RUNX1::RUNX1T1 genetic subtype, +are significantly affected by an age bias present in the training data. +Ensuring inclusivity in the training data is thus essential for generating +reliable and equitable outcomes in AML genetic subtype classification, +ultimately benefiting diverse patient populations. + +
+
+ comment: Accepted for publication at workshop on Fairness of AI in Medical + Imaging in International Conference on Medical Image Computing and Computer + Assisted Intervention (MICCAI 2023) +
+
+
+
+
+ + ☆ Masked Feature Modelling: Feature Masking for the Unsupervised + Pre-training of a Graph Attention Network Block for Bottom-up Video Event + Recognition + + +
+ In this paper, we introduce Masked Feature Modelling (MFM), a novel approach +for the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM +utilizes a pretrained Visual Tokenizer to reconstruct masked features of +objects within a video, leveraging the MiniKinetics dataset. We then +incorporate the pre-trained GAT block into a state-of-the-art bottom-up +supervised video-event recognition architecture, ViGAT, to improve the model's +starting point and overall accuracy. Experimental evaluations on the YLI-MED +dataset demonstrate the effectiveness of MFM in improving event recognition +performance. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Don't Look into the Sun: Adversarial Solarization Attacks on Image + Classifiers + + +
+ Assessing the robustness of deep neural networks against out-of-distribution +inputs is crucial, especially in safety-critical domains like autonomous +driving, but also in safety systems where malicious actors can digitally alter +inputs to circumvent safety guards. However, designing effective +out-of-distribution tests that encompass all possible scenarios while +preserving accurate label information is a challenging task. Existing +methodologies often entail a compromise between variety and constraint levels +for attacks and sometimes even both. In a first step towards a more holistic +robustness evaluation of image classification models, we introduce an attack +method based on image solarization that is conceptually straightforward yet +avoids jeopardizing the global structure of natural images independent of the +intensity. Through comprehensive evaluations of multiple ImageNet models, we +demonstrate the attack's capacity to degrade accuracy significantly, provided +it is not integrated into the training augmentations. Interestingly, even then, +no full immunity to accuracy deterioration is achieved. In other settings, the +attack can often be simplified into a black-box attack with model-independent +parameters. Defenses against other corruptions do not consistently extend to be +effective against our specific attack. + Project website: https://github.com/paulgavrikov/adversarial_solarization + +
+
+
+
+
+ + ☆ An All Deep System for Badminton Game Analysis IJCAI + + +
+ The CoachAI Badminton 2023 Track1 initiative aim to automatically detect +events within badminton match videos. Detecting small objects, especially the +shuttlecock, is of quite importance and demands high precision within the +challenge. Such detection is crucial for tasks like hit count, hitting time, +and hitting location. However, even after revising the well-regarded +shuttlecock detecting model, TrackNet, our object detection models still fall +short of the desired accuracy. To address this issue, we've implemented various +deep learning methods to tackle the problems arising from noisy detectied data, +leveraging diverse data types to improve precision. In this report, we detail +the detection model modifications we've made and our approach to the 11 tasks. +Notably, our system garnered a score of 0.78 out of 1.0 in the challenge. + +
+
+ comment: Golden Award for IJCAI CoachAI Challenge 2023: Team NTNUEE AIoTLab +
+
+
+
+
+ + ☆ Tag-Based Annotation for Avatar Face Creation + + +
+ Currently, digital avatars can be created manually using human images as +reference. Systems such as Bitmoji are excellent producers of detailed avatar +designs, with hundreds of choices for customization. A supervised learning +model could be trained to generate avatars automatically, but the hundreds of +possible options create difficulty in securing non-noisy data to train a model. +As a solution, we train a model to produce avatars from human images using +tag-based annotations. This method provides better annotator agreement, leading +to less noisy data and higher quality model predictions. Our contribution is an +application of tag-based annotation to train a model for avatar face creation. +We design tags for 3 different facial facial features offered by Bitmoji, and +train a model using tag-based annotation to predict the nose. + +
+
+ comment: 9 pages, 5 figures, 18 tables +
+
+
+
+
+ + ☆ Towards Hierarchical Regional Transformer-based Multiple Instance + Learning ICCV 2023 + + +
+ The classification of gigapixel histopathology images with deep multiple +instance learning models has become a critical task in digital pathology and +precision medicine. In this work, we propose a Transformer-based multiple +instance learning approach that replaces the traditional learned attention +mechanism with a regional, Vision Transformer inspired self-attention +mechanism. We present a method that fuses regional patch information to derive +slide-level predictions and show how this regional aggregation can be stacked +to hierarchically process features on different distance levels. To increase +predictive accuracy, especially for datasets with small, local morphological +features, we introduce a method to focus the image processing on high attention +regions during inference. Our approach is able to significantly improve +performance over the baseline on two histopathology datasets and points towards +promising directions for further research. + +
+
+ comment: To be published as ICCV 2023 workshop paper +
+
+
+
+
+ + ☆ Cross-Video Contextual Knowledge Exploration and Exploitation for + Ambiguity Reduction in Weakly Supervised Temporal Action Localization + + +
+ Weakly supervised temporal action localization (WSTAL) aims to localize +actions in untrimmed videos using video-level labels. Despite recent advances, +existing approaches mainly follow a localization-by-classification pipeline, +generally processing each segment individually, thereby exploiting only limited +contextual information. As a result, the model will lack a comprehensive +understanding (e.g. appearance and temporal structure) of various action +patterns, leading to ambiguity in classification learning and temporal +localization. Our work addresses this from a novel perspective, by exploring +and exploiting the cross-video contextual knowledge within the dataset to +recover the dataset-level semantic structure of action instances via weak +labels only, thereby indirectly improving the holistic understanding of +fine-grained action patterns and alleviating the aforementioned ambiguities. +Specifically, an end-to-end framework is proposed, including a Robust +Memory-Guided Contrastive Learning (RMGCL) module and a Global Knowledge +Summarization and Aggregation (GKSA) module. First, the RMGCL module explores +the contrast and consistency of cross-video action features, assisting in +learning more structured and compact embedding space, thus reducing ambiguity +in classification learning. Further, the GKSA module is used to efficiently +summarize and propagate the cross-video representative action knowledge in a +learnable manner to promote holistic action patterns understanding, which in +turn allows the generation of high-confidence pseudo-labels for self-learning, +thus alleviating ambiguity in temporal localization. Extensive experiments on +THUMOS14, ActivityNet1.3, and FineAction demonstrate that our method +outperforms the state-of-the-art methods, and can be easily plugged into other +WSTAL methods. + +
+
+ comment: Submitted to TCSVT. 14 pages and 7 figures +
+
+
+
+
+ + ☆ HR-Pro: Point-supervised Temporal Action Localization via Hierarchical + Reliability Propagation + + +
+ Point-supervised Temporal Action Localization (PSTAL) is an emerging research +direction for label-efficient learning. However, current methods mainly focus +on optimizing the network either at the snippet-level or the instance-level, +neglecting the inherent reliability of point annotations at both levels. In +this paper, we propose a Hierarchical Reliability Propagation (HR-Pro) +framework, which consists of two reliability-aware stages: Snippet-level +Discrimination Learning and Instance-level Completeness Learning, both stages +explore the efficient propagation of high-confidence cues in point annotations. +For snippet-level learning, we introduce an online-updated memory to store +reliable snippet prototypes for each class. We then employ a Reliability-aware +Attention Block to capture both intra-video and inter-video dependencies of +snippets, resulting in more discriminative and robust snippet representation. +For instance-level learning, we propose a point-based proposal generation +approach as a means of connecting snippets and instances, which produces +high-confidence proposals for further optimization at the instance level. +Through multi-level reliability-aware learning, we obtain more reliable +confidence scores and more accurate temporal boundaries of predicted proposals. +Our HR-Pro achieves state-of-the-art performance on multiple challenging +benchmarks, including an impressive average mAP of 60.3% on THUMOS14. Notably, +our HR-Pro largely surpasses all previous point-supervised methods, and even +outperforms several competitive fully supervised methods. Code will be +available at https://github.com/pipixin321/HR-Pro. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ APLA: Additional Perturbation for Latent Noise with Adversarial Training + Enables Consistency + + +
+ Diffusion models have exhibited promising progress in video generation. +However, they often struggle to retain consistent details within local regions +across frames. One underlying cause is that traditional diffusion models +approximate Gaussian noise distribution by utilizing predictive noise, without +fully accounting for the impact of inherent information within the input +itself. Additionally, these models emphasize the distinction between +predictions and references, neglecting information intrinsic to the videos. To +address this limitation, inspired by the self-attention mechanism, we propose a +novel text-to-video (T2V) generation network structure based on diffusion +models, dubbed Additional Perturbation for Latent noise with Adversarial +training (APLA). Our approach only necessitates a single video as input and +builds upon pre-trained stable diffusion networks. Notably, we introduce an +additional compact network, known as the Video Generation Transformer (VGT). +This auxiliary component is designed to extract perturbations from the inherent +information contained within the input, thereby refining inconsistent pixels +during temporal predictions. We leverage a hybrid architecture of transformers +and convolutions to compensate for temporal intricacies, enhancing consistency +between different frames within the video. Experiments demonstrate a noticeable +improvement in the consistency of the generated videos both qualitatively and +quantitatively. + +
+
+
+
+
+ + ☆ PromptMRG: Diagnosis-Driven Prompts for Medical Report Generation + + +
+ Automatic medical report generation (MRG) is of great research value as it +has the potential to relieve radiologists from the heavy burden of report +writing. Despite recent advancements, accurate MRG remains challenging due to +the need for precise clinical understanding and the identification of clinical +findings. Moreover, the imbalanced distribution of diseases makes the challenge +even more pronounced, as rare diseases are underrepresented in training data, +making their diagnostic performance unreliable. To address these challenges, we +propose diagnosis-driven prompts for medical report generation (PromptMRG), a +novel framework that aims to improve the diagnostic accuracy of MRG with the +guidance of diagnosis-aware prompts. Specifically, PromptMRG is based on +encoder-decoder architecture with an extra disease classification branch. When +generating reports, the diagnostic results from the classification branch are +converted into token prompts to explicitly guide the generation process. To +further improve the diagnostic accuracy, we design cross-modal feature +enhancement, which retrieves similar reports from the database to assist the +diagnosis of a query image by leveraging the knowledge from a pre-trained CLIP. +Moreover, the disease imbalanced issue is addressed by applying an adaptive +logit-adjusted loss to the classification branch based on the individual +learning status of each disease, which overcomes the barrier of text decoder's +inability to manipulate disease distributions. Experiments on two MRG +benchmarks show the effectiveness of the proposed method, where it obtains +state-of-the-art clinical efficacy performance on both datasets. + +
+
+
+
+
+ + ☆ PoseSync: Robust pose based video synchronization + + +
+ Pose based video sychronization can have applications in multiple domains +such as gameplay performance evaluation, choreography or guiding athletes. The +subject's actions could be compared and evaluated against those performed by +professionals side by side. In this paper, we propose an end to end pipeline +for synchronizing videos based on pose. The first step crops the region where +the person present in the image followed by pose detection on the cropped +image. This is followed by application of Dynamic Time Warping(DTW) on angle/ +distance measures between the pose keypoints leading to a scale and shift +invariant pose matching pipeline. + +
+
+
+
+
+ + ☆ Logic-induced Diagnostic Reasoning for Semi-supervised Semantic + Segmentation ICCV 2023 + + +
+ Recent advances in semi-supervised semantic segmentation have been heavily +reliant on pseudo labeling to compensate for limited labeled data, disregarding +the valuable relational knowledge among semantic concepts. To bridge this gap, +we devise LogicDiag, a brand new neural-logic semi-supervised learning +framework. Our key insight is that conflicts within pseudo labels, identified +through symbolic knowledge, can serve as strong yet commonly ignored learning +signals. LogicDiag resolves such conflicts via reasoning with logic-induced +diagnoses, enabling the recovery of (potentially) erroneous pseudo labels, +ultimately alleviating the notorious error accumulation problem. We showcase +the practical application of LogicDiag in the data-hungry segmentation +scenario, where we formalize the structured abstraction of semantic concepts as +a set of logic rules. Extensive experiments on three standard semi-supervised +semantic segmentation benchmarks demonstrate the effectiveness and generality +of LogicDiag. Moreover, LogicDiag highlights the promising opportunities +arising from the systematic integration of symbolic reasoning into the +prevalent statistical, neural learning approaches. + +
+
+ comment: Accepted to ICCV 2023; Code: https://github.com/leonnnop/LogicDiag +
+
+
+
+
+ + ☆ Self-supervised Learning of Implicit Shape Representation with Dense + Correspondence for Deformable Objects ICCV 2023 + + +
+ Learning 3D shape representation with dense correspondence for deformable +objects is a fundamental problem in computer vision. Existing approaches often +need additional annotations of specific semantic domain, e.g., skeleton poses +for human bodies or animals, which require extra annotation effort and suffer +from error accumulation, and they are limited to specific domain. In this +paper, we propose a novel self-supervised approach to learn neural implicit +shape representation for deformable objects, which can represent shapes with a +template shape and dense correspondence in 3D. Our method does not require the +priors of skeleton and skinning weight, and only requires a collection of +shapes represented in signed distance fields. To handle the large deformation, +we constrain the learned template shape in the same latent space with the +training shapes, design a new formulation of local rigid constraint that +enforces rigid transformation in local region and addresses local reflection +issue, and present a new hierarchical rigid constraint to reduce the ambiguity +due to the joint learning of template shape and correspondences. Extensive +experiments show that our model can represent shapes with large deformations. +We also show that our shape representation can support two typical +applications, such as texture transfer and shape editing, with competitive +performance. The code and models are available at +https://iscas3dv.github.io/deformshape + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Grounded Entity-Landmark Adaptive Pre-training for Vision-and-Language + Navigation ICCV 2023 + + +
+ Cross-modal alignment is one key challenge for Vision-and-Language Navigation +(VLN). Most existing studies concentrate on mapping the global instruction or +single sub-instruction to the corresponding trajectory. However, another +critical problem of achieving fine-grained alignment at the entity level is +seldom considered. To address this problem, we propose a novel Grounded +Entity-Landmark Adaptive (GELA) pre-training paradigm for VLN tasks. To achieve +the adaptive pre-training paradigm, we first introduce grounded entity-landmark +human annotations into the Room-to-Room (R2R) dataset, named GEL-R2R. +Additionally, we adopt three grounded entity-landmark adaptive pre-training +objectives: 1) entity phrase prediction, 2) landmark bounding box prediction, +and 3) entity-landmark semantic alignment, which explicitly supervise the +learning of fine-grained cross-modal alignment between entity phrases and +environment landmarks. Finally, we validate our model on two downstream +benchmarks: VLN with descriptive instructions (R2R) and dialogue instructions +(CVDN). The comprehensive experiments show that our GELA model achieves +state-of-the-art results on both tasks, demonstrating its effectiveness and +generalizability. + +
+
+ comment: ICCV 2023 Oral +
+
+
+
+
+ + ☆ LORD: Leveraging Open-Set Recognition with Unknown Data ICCV 2023 + + +
+ Handling entirely unknown data is a challenge for any deployed classifier. +Classification models are typically trained on a static pre-defined dataset and +are kept in the dark for the open unassigned feature space. As a result, they +struggle to deal with out-of-distribution data during inference. Addressing +this task on the class-level is termed open-set recognition (OSR). However, +most OSR methods are inherently limited, as they train closed-set classifiers +and only adapt the downstream predictions to OSR. This work presents LORD, a +framework to Leverage Open-set Recognition by exploiting unknown Data. LORD +explicitly models open space during classifier training and provides a +systematic evaluation for such approaches. We identify three model-agnostic +training strategies that exploit background data and applied them to +well-established classifiers. Due to LORD's extensive evaluation protocol, we +consistently demonstrate improved recognition of unknown data. The benchmarks +facilitate in-depth analysis across various requirement levels. To mitigate +dependency on extensive and costly background datasets, we explore mixup as an +off-the-shelf data generation technique. Our experiments highlight mixup's +effectiveness as a substitute for background datasets. Lightweight constraints +on mixup synthesis further improve OSR performance. + +
+
+ comment: Accepted at ICCV 2023 Workshop (Out-Of-Distribution Generalization in + Computer Vision) +
+
+
+
+
+ + ☆ REB: Reducing Biases in Representation for Industrial Anomaly Detection + + +
+ Existing K-nearest neighbor (KNN) retrieval-based methods usually conduct +industrial anomaly detection in two stages: obtain feature representations with +a pre-trained CNN model and perform distance measures for defect detection. +However, the features are not fully exploited as they ignore domain bias and +the difference of local density in feature space, which limits the detection +performance. In this paper, we propose Reducing Biases (REB) in representation +by considering the domain bias of the pre-trained model and building a +self-supervised learning task for better domain adaption with a defect +generation strategy (DefectMaker) imitating the natural defects. Additionally, +we propose a local density KNN (LDKNN) to reduce the local density bias and +obtain effective anomaly detection. We achieve a promising result of 99.5\% +AUROC on the widely used MVTec AD benchmark. We also achieve 88.0\% AUROC on +the challenging MVTec LOCO AD dataset and bring an improvement of 4.7\% AUROC +to the state-of-the-art result. All results are obtained with smaller backbone +networks such as Vgg11 and Resnet18, which indicates the effectiveness and +efficiency of REB for practical industrial applications. + +
+
+ comment: 11 pages, 5 figures, 5 tables +
+
+
+
+
+ + ☆ StreamMapNet: Streaming Mapping Network for Vectorized Online HD Map + Construction + + +
+ High-Definition (HD) maps are essential for the safety of autonomous driving +systems. While existing techniques employ camera images and onboard sensors to +generate vectorized high-precision maps, they are constrained by their reliance +on single-frame input. This approach limits their stability and performance in +complex scenarios such as occlusions, largely due to the absence of temporal +information. Moreover, their performance diminishes when applied to broader +perception ranges. In this paper, we present StreamMapNet, a novel online +mapping pipeline adept at long-sequence temporal modeling of videos. +StreamMapNet employs multi-point attention and temporal information which +empowers the construction of large-range local HD maps with high stability and +further addresses the limitations of existing methods. Furthermore, we +critically examine widely used online HD Map construction benchmark and +datasets, Argoverse2 and nuScenes, revealing significant bias in the existing +evaluation protocols. We propose to resplit the benchmarks according to +geographical spans, promoting fair and precise evaluations. Experimental +results validate that StreamMapNet significantly outperforms existing methods +across all settings while maintaining an online inference speed of $14.2$ FPS. + +
+
+
+
+
+ + ☆ NOVA: NOvel View Augmentation for Neural Composition of Dynamic Objects ICCV + + +
+ We propose a novel-view augmentation (NOVA) strategy to train NeRFs for +photo-realistic 3D composition of dynamic objects in a static scene. Compared +to prior work, our framework significantly reduces blending artifacts when +inserting multiple dynamic objects into a 3D scene at novel views and times; +achieves comparable PSNR without the need for additional ground truth +modalities like optical flow; and overall provides ease, flexibility, and +scalability in neural composition. Our codebase is on GitHub. + +
+
+ comment: Accepted for publication in ICCV Computer Vision for Metaverse + Workshop 2023 (code is available at https://github.com/dakshitagrawal/NoVA) +
+
+
+
+
+ + ☆ Hyperbolic Audio-visual Zero-shot Learning ICCV 2023 + + +
+ Audio-visual zero-shot learning aims to classify samples consisting of a pair +of corresponding audio and video sequences from classes that are not present +during training. An analysis of the audio-visual data reveals a large degree of +hyperbolicity, indicating the potential benefit of using a hyperbolic +transformation to achieve curvature-aware geometric learning, with the aim of +exploring more complex hierarchical data structures for this task. The proposed +approach employs a novel loss function that incorporates cross-modality +alignment between video and audio features in the hyperbolic space. +Additionally, we explore the use of multiple adaptive curvatures for hyperbolic +projections. The experimental results on this very challenging task demonstrate +that our proposed hyperbolic approach for zero-shot learning outperforms the +SOTA method on three datasets: VGGSound-GZSL, UCF-GZSL, and ActivityNet-GZSL +achieving a harmonic mean (HM) improvement of around 3.0%, 7.0%, and 5.3%, +respectively. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Synchronize Feature Extracting and Matching: A Single Branch Framework + for 3D Object Tracking ICCV 2023 + + +
+ Siamese network has been a de facto benchmark framework for 3D LiDAR object +tracking with a shared-parametric encoder extracting features from template and +search region, respectively. This paradigm relies heavily on an additional +matching network to model the cross-correlation/similarity of the template and +search region. In this paper, we forsake the conventional Siamese paradigm and +propose a novel single-branch framework, SyncTrack, synchronizing the feature +extracting and matching to avoid forwarding encoder twice for template and +search region as well as introducing extra parameters of matching network. The +synchronization mechanism is based on the dynamic affinity of the Transformer, +and an in-depth analysis of the relevance is provided theoretically. Moreover, +based on the synchronization, we introduce a novel Attentive Points-Sampling +strategy into the Transformer layers (APST), replacing the random/Farthest +Points Sampling (FPS) method with sampling under the supervision of attentive +relations between the template and search region. It implies connecting +point-wise sampling with the feature learning, beneficial to aggregating more +distinctive and geometric features for tracking with sparse points. Extensive +experiments on two benchmark datasets (KITTI and NuScenes) show that SyncTrack +achieves state-of-the-art performance in real-time tracking. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Hybrid Models for Facial Emotion Recognition in Children + + +
+ This paper focuses on the use of emotion recognition techniques to assist +psychologists in performing children's therapy through remotely robot operated +sessions. In the field of psychology, the use of agent-mediated therapy is +growing increasingly given recent advances in robotics and computer science. +Specifically, the use of Embodied Conversational Agents (ECA) as an +intermediary tool can help professionals connect with children who face social +challenges such as Attention Deficit Hyperactivity Disorder (ADHD), Autism +Spectrum Disorder (ASD) or even who are physically unavailable due to being in +regions of armed conflict, natural disasters, or other circumstances. In this +context, emotion recognition represents an important feedback for the +psychotherapist. In this article, we initially present the result of a +bibliographical research associated with emotion recognition in children. This +research revealed an initial overview on algorithms and datasets widely used by +the community. Then, based on the analysis carried out on the results of the +bibliographical research, we used the technique of dense optical flow features +to improve the ability of identifying emotions in children in uncontrolled +environments. From the output of a hybrid model of Convolutional Neural +Network, two intermediary features are fused before being processed by a final +classifier. The proposed architecture was called HybridCNNFusion. Finally, we +present the initial results achieved in the recognition of children's emotions +using a dataset of Brazilian children. + +
+
+
+
+
+ + ☆ Mutual-Guided Dynamic Network for Image Fusion + + +
+ Image fusion aims to generate a high-quality image from multiple images +captured under varying conditions. The key problem of this task is to preserve +complementary information while filtering out irrelevant information for the +fused result. However, existing methods address this problem by leveraging +static convolutional neural networks (CNNs), suffering two inherent limitations +during feature extraction, i.e., being unable to handle spatial-variant +contents and lacking guidance from multiple inputs. In this paper, we propose a +novel mutual-guided dynamic network (MGDN) for image fusion, which allows for +effective information utilization across different locations and inputs. +Specifically, we design a mutual-guided dynamic filter (MGDF) for adaptive +feature extraction, composed of a mutual-guided cross-attention (MGCA) module +and a dynamic filter predictor, where the former incorporates additional +guidance from different inputs and the latter generates spatial-variant kernels +for different locations. In addition, we introduce a parallel feature fusion +(PFF) module to effectively fuse local and global information of the extracted +features. To further reduce the redundancy among the extracted features while +simultaneously preserving their shared structural information, we devise a +novel loss function that combines the minimization of normalized mutual +information (NMI) with an estimated gradient mask. Experimental results on five +benchmark datasets demonstrate that our proposed method outperforms existing +methods on four image fusion tasks. The code and model are publicly available +at: https://github.com/Guanys-dar/MGDN. + +
+
+ comment: ACMMM 2023 accepted +
+
+
+
+
+ + ☆ HuBo-VLM: Unified Vision-Language Model designed for HUman roBOt + interaction tasks + + +
+ Human robot interaction is an exciting task, which aimed to guide robots +following instructions from human. Since huge gap lies between human natural +language and machine codes, end to end human robot interaction models is fair +challenging. Further, visual information receiving from sensors of robot is +also a hard language for robot to perceive. In this work, HuBo-VLM is proposed +to tackle perception tasks associated with human robot interaction including +object detection and visual grounding by a unified transformer based vision +language model. Extensive experiments on the Talk2Car benchmark demonstrate the +effectiveness of our approach. Code would be publicly available in +https://github.com/dzcgaara/HuBo-VLM. + +
+
+
+
+
+ + ☆ SCP: Spherical-Coordinate-based Learned Point Cloud Compression + + +
+ In recent years, the task of learned point cloud compression has gained +prominence. An important type of point cloud, the spinning LiDAR point cloud, +is generated by spinning LiDAR on vehicles. This process results in numerous +circular shapes and azimuthal angle invariance features within the point +clouds. However, these two features have been largely overlooked by previous +methodologies. In this paper, we introduce a model-agnostic method called +Spherical-Coordinate-based learned Point cloud compression (SCP), designed to +leverage the aforementioned features fully. Additionally, we propose a +multi-level Octree for SCP to mitigate the reconstruction error for distant +areas within the Spherical-coordinate-based Octree. SCP exhibits excellent +universality, making it applicable to various learned point cloud compression +techniques. Experimental results demonstrate that SCP surpasses previous +state-of-the-art methods by up to 29.14% in point-to-point PSNR BD-Rate. + +
+
+
+
+
+ + ☆ Channel and Spatial Relation-Propagation Network for RGB-Thermal + Semantic Segmentation + + +
+ RGB-Thermal (RGB-T) semantic segmentation has shown great potential in +handling low-light conditions where RGB-based segmentation is hindered by poor +RGB imaging quality. The key to RGB-T semantic segmentation is to effectively +leverage the complementarity nature of RGB and thermal images. Most existing +algorithms fuse RGB and thermal information in feature space via concatenation, +element-wise summation, or attention operations in either unidirectional +enhancement or bidirectional aggregation manners. However, they usually +overlook the modality gap between RGB and thermal images during feature fusion, +resulting in modality-specific information from one modality contaminating the +other. In this paper, we propose a Channel and Spatial Relation-Propagation +Network (CSRPNet) for RGB-T semantic segmentation, which propagates only +modality-shared information across different modalities and alleviates the +modality-specific information contamination issue. Our CSRPNet first performs +relation-propagation in channel and spatial dimensions to capture the +modality-shared features from the RGB and thermal features. CSRPNet then +aggregates the modality-shared features captured from one modality with the +input feature from the other modality to enhance the input feature without the +contamination issue. While being fused together, the enhanced RGB and thermal +features will be also fed into the subsequent RGB or thermal feature extraction +layers for interactive feature fusion, respectively. We also introduce a +dual-path cascaded feature refinement module that aggregates multi-layer +features to produce two refined features for semantic and boundary prediction. +Extensive experimental results demonstrate that CSRPNet performs favorably +against state-of-the-art algorithms. + +
+
+
+
+
+ + ☆ FedSoL: Bridging Global Alignment and Local Generality in Federated + Learning + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +client data distributions are heterogeneous. Many previous FL algorithms have +addressed this issue by introducing various proximal restrictions. These +restrictions aim to encourage global alignment by constraining the deviation of +local learning from the global objective. However, they inherently limit local +learning by interfering with the original local objectives. Recently, an +alternative approach has emerged to improve local learning generality. By +obtaining local models within a smooth loss landscape, this approach mitigates +conflicts among different local objectives of the clients. Yet, it does not +ensure stable global alignment, as local learning does not take the global +objective into account. In this study, we propose Federated Stability on +Learning (FedSoL), which combines both the concepts of global alignment and +local generality. In FedSoL, the local learning seeks a parameter region robust +against proximal perturbations. This strategy introduces an implicit proximal +restriction effect in local learning while maintaining the original local +objective for parameter update. Our experiments show that FedSoL consistently +achieves state-of-the-art performance on various setups. + +
+
+
+
+
+ + ☆ SieveNet: Selecting Point-Based Features for Mesh Networks + + +
+ Meshes are widely used in 3D computer vision and graphics, but their +irregular topology poses challenges in applying them to existing neural network +architectures. Recent advances in mesh neural networks turn to remeshing and +push the boundary of pioneer methods that solely take the raw meshes as input. +Although the remeshing offers a regular topology that significantly facilitates +the design of mesh network architectures, features extracted from such remeshed +proxies may struggle to retain the underlying geometry faithfully, limiting the +subsequent neural network's capacity. To address this issue, we propose +SieveNet, a novel paradigm that takes into account both the regular topology +and the exact geometry. Specifically, this method utilizes structured mesh +topology from remeshing and accurate geometric information from +distortion-aware point sampling on the surface of the original mesh. +Furthermore, our method eliminates the need for hand-crafted feature +engineering and can leverage off-the-shelf network architectures such as the +vision transformer. Comprehensive experimental results on classification and +segmentation tasks well demonstrate the effectiveness and superiority of our +method. + +
+
+ comment: The project homepage is https://sievenet.github.io/ +
+
+
+
+
+ + ☆ Uniformly Distributed Category Prototype-Guided Vision-Language + Framework for Long-Tail Recognition + + +
+ Recently, large-scale pre-trained vision-language models have presented +benefits for alleviating class imbalance in long-tailed recognition. However, +the long-tailed data distribution can corrupt the representation space, where +the distance between head and tail categories is much larger than the distance +between two tail categories. This uneven feature space distribution causes the +model to exhibit unclear and inseparable decision boundaries on the uniformly +distributed test set, which lowers its performance. To address these +challenges, we propose the uniformly category prototype-guided vision-language +framework to effectively mitigate feature space bias caused by data imbalance. +Especially, we generate a set of category prototypes uniformly distributed on a +hypersphere. Category prototype-guided mechanism for image-text matching makes +the features of different classes converge to these distinct and uniformly +distributed category prototypes, which maintain a uniform distribution in the +feature space, and improve class boundaries. Additionally, our proposed +irrelevant text filtering and attribute enhancement module allows the model to +ignore irrelevant noisy text and focus more on key attribute information, +thereby enhancing the robustness of our framework. In the image recognition +fine-tuning stage, to address the positive bias problem of the learnable +classifier, we design the class feature prototype-guided classifier, which +compensates for the performance of tail classes while maintaining the +performance of head classes. Our method outperforms previous vision-language +methods for long-tailed learning work by a large margin and achieves +state-of-the-art performance. + +
+
+ comment: 11pages, 5figures +
+
+
+
+
+ + ☆ I3DOD: Towards Incremental 3D Object Detection via Prompting + + +
+ 3D object detection has achieved significant performance in many fields, +e.g., robotics system, autonomous driving, and augmented reality. However, most +existing methods could cause catastrophic forgetting of old classes when +performing on the class-incremental scenarios. Meanwhile, the current +class-incremental 3D object detection methods neglect the relationships between +the object localization information and category semantic information and +assume all the knowledge of old model is reliable. To address the above +challenge, we present a novel Incremental 3D Object Detection framework with +the guidance of prompting, i.e., I3DOD. Specifically, we propose a task-shared +prompts mechanism to learn the matching relationships between the object +localization information and category semantic information. After training on +the current task, these prompts will be stored in our prompt pool, and perform +the relationship of old classes in the next task. Moreover, we design a +reliable distillation strategy to transfer knowledge from two aspects: a +reliable dynamic distillation is developed to filter out the negative knowledge +and transfer the reliable 3D knowledge to new detection model; the relation +feature is proposed to capture the responses relation in feature space and +protect plasticity of the model when learning novel 3D classes. To the end, we +conduct comprehensive experiments on two benchmark datasets and our method +outperforms the state-of-the-art object detection methods by 0.6% - 2.7% in +terms of mAP@0.25. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Masked Autoencoders are Efficient Class Incremental Learners ICCV 2023 + + +
+ Class Incremental Learning (CIL) aims to sequentially learn new classes while +avoiding catastrophic forgetting of previous knowledge. We propose to use +Masked Autoencoders (MAEs) as efficient learners for CIL. MAEs were originally +designed to learn useful representations through reconstructive unsupervised +learning, and they can be easily integrated with a supervised loss for +classification. Moreover, MAEs can reliably reconstruct original input images +from randomly selected patches, which we use to store exemplars from past tasks +more efficiently for CIL. We also propose a bilateral MAE framework to learn +from image-level and embedding-level fusion, which produces better-quality +reconstructed images and more stable representations. Our experiments confirm +that our approach performs better than the state-of-the-art on CIFAR-100, +ImageNet-Subset, and ImageNet-Full. The code is available at +https://github.com/scok30/MAE-CIL . + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Parameter-Efficient Transfer Learning for Remote Sensing Image-Text + Retrieval + + +
+ Vision-and-language pre-training (VLP) models have experienced a surge in +popularity recently. By fine-tuning them on specific datasets, significant +performance improvements have been observed in various tasks. However, full +fine-tuning of VLP models not only consumes a significant amount of +computational resources but also has a significant environmental impact. +Moreover, as remote sensing (RS) data is constantly being updated, full +fine-tuning may not be practical for real-world applications. To address this +issue, in this work, we investigate the parameter-efficient transfer learning +(PETL) method to effectively and efficiently transfer visual-language knowledge +from the natural domain to the RS domain on the image-text retrieval task. To +this end, we make the following contributions. 1) We construct a novel and +sophisticated PETL framework for the RS image-text retrieval (RSITR) task, +which includes the pretrained CLIP model, a multimodal remote sensing adapter, +and a hybrid multi-modal contrastive (HMMC) learning objective; 2) To deal with +the problem of high intra-modal similarity in RS data, we design a simple yet +effective HMMC loss; 3) We provide comprehensive empirical studies for +PETL-based RS image-text retrieval. Our results demonstrate that the proposed +method is promising and of great potential for practical applications. 4) We +benchmark extensive state-of-the-art PETL methods on the RSITR task. Our +proposed model only contains 0.16M training parameters, which can achieve a +parameter reduction of 98.9% compared to full fine-tuning, resulting in +substantial savings in training costs. Our retrieval performance exceeds +traditional methods by 7-13% and achieves comparable or better performance than +full fine-tuning. This work can provide new ideas and useful insights for RS +vision-language tasks. + +
+
+
+
+
+ + ☆ FFEINR: Flow Feature-Enhanced Implicit Neural Representation for + Spatio-temporal Super-Resolution + + +
+ Large-scale numerical simulations are capable of generating data up to +terabytes or even petabytes. As a promising method of data reduction, +super-resolution (SR) has been widely studied in the scientific visualization +community. However, most of them are based on deep convolutional neural +networks (CNNs) or generative adversarial networks (GANs) and the scale factor +needs to be determined before constructing the network. As a result, a single +training session only supports a fixed factor and has poor generalization +ability. To address these problems, this paper proposes a Feature-Enhanced +Implicit Neural Representation (FFEINR) for spatio-temporal super-resolution of +flow field data. It can take full advantage of the implicit neural +representation in terms of model structure and sampling resolution. The neural +representation is based on a fully connected network with periodic activation +functions, which enables us to obtain lightweight models. The learned +continuous representation can decode the low-resolution flow field input data +to arbitrary spatial and temporal resolutions, allowing for flexible +upsampling. The training process of FFEINR is facilitated by introducing +feature enhancements for the input layer, which complements the contextual +information of the flow field.To demonstrate the effectiveness of the proposed +method, a series of experiments are conducted on different datasets by setting +different hyperparameters. The results show that FFEINR achieves significantly +better results than the trilinear interpolation method. + +
+
+
+
+
+ + ☆ DD-GCN: Directed Diffusion Graph Convolutional Network for + Skeleton-based Human Action Recognition + + +
+ Graph Convolutional Networks (GCNs) have been widely used in skeleton-based +human action recognition. In GCN-based methods, the spatio-temporal graph is +fundamental for capturing motion patterns. However, existing approaches ignore +the physical dependency and synchronized spatio-temporal correlations between +joints, which limits the representation capability of GCNs. To solve these +problems, we construct the directed diffusion graph for action modeling and +introduce the activity partition strategy to optimize the weight sharing +mechanism of graph convolution kernels. In addition, we present the +spatio-temporal synchronization encoder to embed synchronized spatio-temporal +semantics. Finally, we propose Directed Diffusion Graph Convolutional Network +(DD-GCN) for action recognition, and the experiments on three public datasets: +NTU-RGB+D, NTU-RGB+D 120, and NW-UCLA, demonstrate the state-of-the-art +performance of our method. + +
+
+ comment: ICEM 2023 +
+
+
+
+
+ + ☆ Source-Free Collaborative Domain Adaptation via Multi-Perspective + Feature Enrichment for Functional MRI Analysis + + +
+ Resting-state functional MRI (rs-fMRI) is increasingly employed in multi-site +research to aid neurological disorder analysis. Existing studies usually suffer +from significant cross-site/domain data heterogeneity caused by site effects +such as differences in scanners/protocols. Many methods have been proposed to +reduce fMRI heterogeneity between source and target domains, heavily relying on +the availability of source data. But acquiring source data is challenging due +to privacy concerns and/or data storage burdens in multi-site studies. To this +end, we design a source-free collaborative domain adaptation (SCDA) framework +for fMRI analysis, where only a pretrained source model and unlabeled target +data are accessible. Specifically, a multi-perspective feature enrichment +method (MFE) is developed for target fMRI analysis, consisting of multiple +collaborative branches to dynamically capture fMRI features of unlabeled target +data from multiple views. Each branch has a data-feeding module, a +spatiotemporal feature encoder, and a class predictor. A mutual-consistency +constraint is designed to encourage pair-wise consistency of latent features of +the same input generated from these branches for robust representation +learning. To facilitate efficient cross-domain knowledge transfer without +source data, we initialize MFE using parameters of a pretrained source model. +We also introduce an unsupervised pretraining strategy using 3,806 unlabeled +fMRIs from three large-scale auxiliary databases, aiming to obtain a general +feature encoder. Experimental results on three public datasets and one private +dataset demonstrate the efficacy of our method in cross-scanner and cross-study +prediction tasks. The model pretrained on large-scale rs-fMRI data has been +released to the public. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ MOFA: A Model Simplification Roadmap for Image Restoration on Mobile + Devices ICCV + + +
+ Image restoration aims to restore high-quality images from degraded +counterparts and has seen significant advancements through deep learning +techniques. The technique has been widely applied to mobile devices for tasks +such as mobile photography. Given the resource limitations on mobile devices, +such as memory constraints and runtime requirements, the efficiency of models +during deployment becomes paramount. Nevertheless, most previous works have +primarily concentrated on analyzing the efficiency of single modules and +improving them individually. This paper examines the efficiency across +different layers. We propose a roadmap that can be applied to further +accelerate image restoration models prior to deployment while simultaneously +increasing PSNR (Peak Signal-to-Noise Ratio) and SSIM (Structural Similarity +Index). The roadmap first increases the model capacity by adding more +parameters to partial convolutions on FLOPs non-sensitive layers. Then, it +applies partial depthwise convolution coupled with decoupling +upsampling/downsampling layers to accelerate the model speed. Extensive +experiments demonstrate that our approach decreases runtime by up to 13% and +reduces the number of parameters by up to 23%, while increasing PSNR and SSIM +on several image restoration datasets. Source Code of our method is available +at \href{https://github.com/xiangyu8/MOFA}{https://github.com/xiangyu8/MOFA}. + +
+
+ comment: Accepted by 2023 ICCV Workshop (RCV) +
+
+
+
+
+ + ☆ American Stories: A Large-Scale Structured Text Dataset of Historical + U.S. Newspapers + + +
+ Existing full text datasets of U.S. public domain newspapers do not recognize +the often complex layouts of newspaper scans, and as a result the digitized +content scrambles texts from articles, headlines, captions, advertisements, and +other layout regions. OCR quality can also be low. This study develops a novel, +deep learning pipeline for extracting full article texts from newspaper images +and applies it to the nearly 20 million scans in Library of Congress's public +domain Chronicling America collection. The pipeline includes layout detection, +legibility classification, custom OCR, and association of article texts +spanning multiple bounding boxes. To achieve high scalability, it is built with +efficient architectures designed for mobile phones. The resulting American +Stories dataset provides high quality data that could be used for pre-training +a large language model to achieve better understanding of historical English +and historical world knowledge. The dataset could also be added to the external +database of a retrieval-augmented language model to make historical information +- ranging from interpretations of political events to minutiae about the lives +of people's ancestors - more widely accessible. Furthermore, structured article +texts facilitate using transformer-based methods for popular social science +applications like topic classification, detection of reproduced content, and +news story clustering. Finally, American Stories provides a massive silver +quality dataset for innovating multimodal layout analysis models and other +multimodal applications. + +
+
+
+
+
+ + ☆ CompaCT: Fractal-Based Heuristic Pixel Segmentation for Lossless + Compression of High-Color DICOM Medical Images + + +
+ Medical image compression is a widely studied field of data processing due to +its prevalence in modern digital databases. This domain requires a high color +depth of 12 bits per pixel component for accurate analysis by physicians, +primarily in the DICOM format. Standard raster-based compression of images via +filtering is well-known; however, it remains suboptimal in the medical domain +due to non-specialized implementations. This study proposes a lossless medical +image compression algorithm, CompaCT, that aims to target spatial features and +patterns of pixel concentration for dynamically enhanced data processing. The +algorithm employs fractal pixel traversal coupled with a novel approach of +segmentation and meshing between pixel blocks for preprocessing. Furthermore, +delta and entropy coding are applied to this concept for a complete compression +pipeline. The proposal demonstrates that the data compression achieved via +fractal segmentation preprocessing yields enhanced image compression results +while remaining lossless in its reconstruction accuracy. CompaCT is evaluated +in its compression ratios on 3954 high-color CT scans against the efficiency of +industry-standard compression techniques (i.e., JPEG2000, RLE, ZIP, PNG). Its +reconstruction performance is assessed with error metrics to verify lossless +image recovery after decompression. The results demonstrate that CompaCT can +compress and losslessly reconstruct medical images, being 37% more +space-efficient than industry-standard compression systems. + +
+
+ comment: (8/24/2023) v1a: 16 pages, 9 figures, Word PDF +
+
+
+
+
+ + ☆ Interpretable Image Quality Assessment via CLIP with Multiple + Antonym-Prompt Pairs + + +
+ No reference image quality assessment (NR-IQA) is a task to estimate the +perceptual quality of an image without its corresponding original image. It is +even more difficult to perform this task in a zero-shot manner, i.e., without +task-specific training. In this paper, we propose a new zero-shot and +interpretable NRIQA method that exploits the ability of a pre-trained +visionlanguage model to estimate the correlation between an image and a textual +prompt. The proposed method employs a prompt pairing strategy and multiple +antonym-prompt pairs corresponding to carefully selected descriptive features +corresponding to the perceptual image quality. Thus, the proposed method is +able to identify not only the perceptual quality evaluation of the image, but +also the cause on which the quality evaluation is based. Experimental results +show that the proposed method outperforms existing zero-shot NR-IQA methods in +terms of accuracy and can evaluate the causes of perceptual quality +degradation. + +
+
+ comment: 2pages, 1 figure +
+
+
+
+
+ + ☆ EgoBlur: Responsible Innovation in Aria + + +
+ Project Aria pushes the frontiers of Egocentric AI with large-scale +real-world data collection using purposely designed glasses with privacy first +approach. To protect the privacy of bystanders being recorded by the glasses, +our research protocols are designed to ensure recorded video is processed by an +AI anonymization model that removes bystander faces and vehicle license plates. +Detected face and license plate regions are processed with a Gaussian blur such +that these personal identification information (PII) regions are obscured. This +process helps to ensure that anonymized versions of the video is retained for +research purposes. In Project Aria, we have developed a state-of-the-art +anonymization system EgoBlur. In this paper, we present extensive analysis of +EgoBlur on challenging datasets comparing its performance with other +state-of-the-art systems from industry and academia including extensive +Responsible AI analysis on recently released Casual Conversations V2 dataset. + +
+
+
+
+
+ + ☆ Benchmarking Data Efficiency and Computational Efficiency of Temporal + Action Localization Models ICCV 2023 + + +
+ In temporal action localization, given an input video, the goal is to predict +which actions it contains, where they begin, and where they end. Training and +testing current state-of-the-art deep learning models requires access to large +amounts of data and computational power. However, gathering such data is +challenging and computational resources might be limited. This work explores +and measures how current deep temporal action localization models perform in +settings constrained by the amount of data or computational power. We measure +data efficiency by training each model on a subset of the training set. We find +that TemporalMaxer outperforms other models in data-limited settings. +Furthermore, we recommend TriDet when training time is limited. To test the +efficiency of the models during inference, we pass videos of different lengths +through each model. We find that TemporalMaxer requires the least computational +resources, likely due to its simple architecture. + +
+
+ comment: Accepted to the CVEU workshop at ICCV 2023 +
+
+
+
+
+ + ☆ Preserving Modality Structure Improves Multi-Modal Learning ICCV 2023 + + +
+ Self-supervised learning on large-scale multi-modal datasets allows learning +semantically meaningful embeddings in a joint multi-modal representation space +without relying on human annotations. These joint embeddings enable zero-shot +cross-modal tasks like retrieval and classification. However, these methods +often struggle to generalize well on out-of-domain data as they ignore the +semantic structure present in modality-specific embeddings. In this context, we +propose a novel Semantic-Structure-Preserving Consistency approach to improve +generalizability by preserving the modality-specific relationships in the joint +embedding space. To capture modality-specific semantic relationships between +samples, we propose to learn multiple anchors and represent the multifaceted +relationship between samples with respect to their relationship with these +anchors. To assign multiple anchors to each sample, we propose a novel +Multi-Assignment Sinkhorn-Knopp algorithm. Our experimentation demonstrates +that our proposed approach learns semantically meaningful anchors in a +self-supervised manner. Furthermore, our evaluation on MSR-VTT and YouCook2 +datasets demonstrates that our proposed multi-anchor assignment based solution +achieves state-of-the-art performance and generalizes to both inand +out-of-domain datasets. Code: https://github.com/Swetha5/Multi_Sinkhorn_Knopp + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ SurGNN: Explainable visual scene understanding and assessment of + surgical skill using graph neural networks + + +
+ This paper explores how graph neural networks (GNNs) can be used to enhance +visual scene understanding and surgical skill assessment. By using GNNs to +analyze the complex visual data of surgical procedures represented as graph +structures, relevant features can be extracted and surgical skill can be +predicted. Additionally, GNNs provide interpretable results, revealing the +specific actions, instruments, or anatomical structures that contribute to the +predicted skill metrics. This can be highly beneficial for surgical educators +and trainees, as it provides valuable insights into the factors that contribute +to successful surgical performance and outcomes. SurGNN proposes two concurrent +approaches -- one supervised and the other self-supervised. The paper also +briefly discusses other automated surgical skill evaluation techniques and +highlights the limitations of hand-crafted features in capturing the +intricacies of surgical expertise. We use the proposed methods to achieve +state-of-the-art results on EndoVis19, and custom datasets. The working +implementation of the code can be found at https://github.com/. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Full-dose PET Synthesis from Low-dose PET Using High-efficiency + Diffusion Denoising Probabilistic Model + + +
+ To reduce the risks associated with ionizing radiation, a reduction of +radiation exposure in PET imaging is needed. However, this leads to a +detrimental effect on image contrast and quantification. High-quality PET +images synthesized from low-dose data offer a solution to reduce radiation +exposure. We introduce a diffusion-model-based approach for estimating +full-dose PET images from low-dose ones: the PET Consistency Model (PET-CM) +yielding synthetic quality comparable to state-of-the-art diffusion-based +synthesis models, but with greater efficiency. There are two steps: a forward +process that adds Gaussian noise to a full dose PET image at multiple +timesteps, and a reverse diffusion process that employs a PET Shifted-window +Vision Transformer (PET-VIT) network to learn the denoising procedure +conditioned on the corresponding low-dose PETs. In PET-CM, the reverse process +learns a consistency function for direct denoising of Gaussian noise to a clean +full-dose PET. We evaluated the PET-CM in generating full-dose images using +only 1/8 and 1/4 of the standard PET dose. Comparing 1/8 dose to full-dose +images, PET-CM demonstrated impressive performance with normalized mean +absolute error (NMAE) of 1.233+/-0.131%, peak signal-to-noise ratio (PSNR) of +33.915+/-0.933dB, structural similarity index (SSIM) of 0.964+/-0.009, and +normalized cross-correlation (NCC) of 0.968+/-0.011, with an average generation +time of 62 seconds per patient. This is a significant improvement compared to +the state-of-the-art diffusion-based model with PET-CM reaching this result 12x +faster. In the 1/4 dose to full-dose image experiments, PET-CM is also +competitive, achieving an NMAE 1.058+/-0.092%, PSNR of 35.548+/-0.805dB, SSIM +of 0.978+/-0.005, and NCC 0.981+/-0.007 The results indicate promising low-dose +PET image quality improvements for clinical applications. + +
+
+
+
+
+ + ☆ Data-Side Efficiencies for Lightweight Convolutional Neural Networks + + +
+ We examine how the choice of data-side attributes for two important visual +tasks of image classification and object detection can aid in the choice or +design of lightweight convolutional neural networks. We show by experimentation +how four data attributes - number of classes, object color, image resolution, +and object scale affect neural network model size and efficiency. Intra- and +inter-class similarity metrics, based on metric learning, are defined to guide +the evaluation of these attributes toward achieving lightweight models. +Evaluations made using these metrics are shown to require 30x less computation +than running full inference tests. We provide, as an example, applying the +metrics and methods to choose a lightweight model for a robot path planning +application and achieve computation reduction of 66% and accuracy gain of 3.5% +over the pre-method model. + +
+
+ comment: 10 pages, 5 figures, 6 tables +
+
+
+
+
+ + ☆ Enhancing Perception and Immersion in Pre-Captured Environments through + Learning-Based Eye Height Adaptation + + +
+ Pre-captured immersive environments using omnidirectional cameras provide a +wide range of virtual reality applications. Previous research has shown that +manipulating the eye height in egocentric virtual environments can +significantly affect distance perception and immersion. However, the influence +of eye height in pre-captured real environments has received less attention due +to the difficulty of altering the perspective after finishing the capture +process. To explore this influence, we first propose a pilot study that +captures real environments with multiple eye heights and asks participants to +judge the egocentric distances and immersion. If a significant influence is +confirmed, an effective image-based approach to adapt pre-captured real-world +environments to the user's eye height would be desirable. Motivated by the +study, we propose a learning-based approach for synthesizing novel views for +omnidirectional images with altered eye heights. This approach employs a +multitask architecture that learns depth and semantic segmentation in two +formats, and generates high-quality depth and semantic segmentation to +facilitate the inpainting stage. With the improved omnidirectional-aware +layered depth image, our approach synthesizes natural and realistic visuals for +eye height adaptation. Quantitative and qualitative evaluation shows favorable +results against state-of-the-art methods, and an extensive user study verifies +improved perception and immersion for pre-captured real-world environments. + +
+
+ comment: 10 pages, 13 figures, 3 tables, submitted to ISMAR 2023 +
+
+
+
+
+ + ☆ Spherical Vision Transformer for 360-degree Video Saliency Prediction BMVC 2023 + + +
+ The growing interest in omnidirectional videos (ODVs) that capture the full +field-of-view (FOV) has gained 360-degree saliency prediction importance in +computer vision. However, predicting where humans look in 360-degree scenes +presents unique challenges, including spherical distortion, high resolution, +and limited labelled data. We propose a novel vision-transformer-based model +for omnidirectional videos named SalViT360 that leverages tangent image +representations. We introduce a spherical geometry-aware spatiotemporal +self-attention mechanism that is capable of effective omnidirectional video +understanding. Furthermore, we present a consistency-based unsupervised +regularization term for projection-based 360-degree dense-prediction models to +reduce artefacts in the predictions that occur after inverse projection. Our +approach is the first to employ tangent images for omnidirectional saliency +prediction, and our experimental results on three ODV saliency datasets +demonstrate its effectiveness compared to the state-of-the-art. + +
+
+ comment: 12 pages, 4 figures, accepted to BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Differentiable Microscopy Designs an All Optical Phase Retrieval + Microscope + + +
+ Since the late 16th century, scientists have continuously innovated and +developed new microscope types for various applications. Creating a new +architecture from the ground up requires substantial scientific expertise and +creativity, often spanning years or even decades. In this study, we propose an +alternative approach called "Differentiable Microscopy," which introduces a +top-down design paradigm for optical microscopes. Using all-optical phase +retrieval as an illustrative example, we demonstrate the effectiveness of +data-driven microscopy design through $\partial\mu$. Furthermore, we conduct +comprehensive comparisons with competing methods, showcasing the consistent +superiority of our learned designs across multiple datasets, including +biological samples. To substantiate our ideas, we experimentally validate the +functionality of one of the learned designs, providing a proof of concept. The +proposed differentiable microscopy framework supplements the creative process +of designing new optical systems and would perhaps lead to unconventional but +better optical designs. + +
+
+
+
+
+ + ♻ ☆ LOPR: Latent Occupancy PRediction using Generative Models + + +
+ Environment prediction frameworks are integral for autonomous vehicles, +enabling safe navigation in dynamic environments. LiDAR generated occupancy +grid maps (L-OGMs) offer a robust bird's eye-view scene representation that +facilitates joint scene predictions without relying on manual labeling unlike +commonly used trajectory prediction frameworks. Prior approaches have optimized +deterministic L-OGM prediction architectures directly in grid cell space. While +these methods have achieved some degree of success in prediction, they +occasionally grapple with unrealistic and incorrect predictions. We claim that +the quality and realism of the forecasted occupancy grids can be enhanced with +the use of generative models. We propose a framework that decouples occupancy +prediction into: representation learning and stochastic prediction within the +learned latent space. Our approach allows for conditioning the model on other +available sensor modalities such as RGB-cameras and high definition maps. We +demonstrate that our approach achieves state-of-the-art performance and is +readily transferable between different robotic platforms on the real-world +NuScenes, Waymo Open, and a custom dataset we collected on an experimental +vehicle platform. + +
+
+
+
+
+ + ♻ ☆ FIESTA: Autoencoders for accurate fiber segmentation in tractography + + +
+ White matter bundle segmentation is a cornerstone of modern tractography to +study the brain's structural connectivity in domains such as neurological +disorders, neurosurgery, and aging. In this study, we present FIESTA (FIbEr +Segmentation in Tractography using Autoencoders), a reliable and robust, fully +automated, and easily semi-automatically calibrated pipeline based on deep +autoencoders that can dissect and fully populate white matter bundles. This +pipeline is built upon previous works that demonstrated how autoencoders can be +used successfully for streamline filtering, bundle segmentation, and streamline +generation in tractography. Our proposed method improves bundle segmentation +coverage by recovering hard-to-track bundles with generative sampling through +the latent space seeding of the subject bundle and the atlas bundle. A latent +space of streamlines is learned using autoencoder-based modeling combined with +contrastive learning. Using an atlas of bundles in standard space (MNI), our +proposed method segments new tractograms using the autoencoder latent distance +between each tractogram streamline and its closest neighbor bundle in the atlas +of bundles. Intra-subject bundle reliability is improved by recovering +hard-to-track streamlines, using the autoencoder to generate new streamlines +that increase the spatial coverage of each bundle while remaining anatomically +correct. Results show that our method is more reliable than state-of-the-art +automated virtual dissection methods such as RecoBundles, RecoBundlesX, +TractSeg, White Matter Analysis and XTRACT. Our framework allows for the +transition from one anatomical bundle definition to another with marginal +calibration efforts. Overall, these results show that our framework improves +the practicality and usability of current state-of-the-art bundle segmentation +framework. + +
+
+ comment: 36 pages, 13 figures, accepted in NeuroImage +
+
+
+
+
+ + ♻ ☆ CNOS: A Strong Baseline for CAD-based Novel Object Segmentation ICCV 2023 + + +
+ We propose a simple three-stage approach to segment unseen objects in RGB +images using their CAD models. Leveraging recent powerful foundation models, +DINOv2 and Segment Anything, we create descriptors and generate proposals, +including binary masks for a given input RGB image. By matching proposals with +reference descriptors created from CAD models, we achieve precise object ID +assignment along with modal masks. We experimentally demonstrate that our +method achieves state-of-the-art results in CAD-based novel object +segmentation, surpassing existing approaches on the seven core datasets of the +BOP challenge by 19.8% AP using the same BOP evaluation protocol. Our source +code is available at https://github.com/nv-nguyen/cnos. + +
+
+ comment: ICCV 2023, R6D Workshop +
+
+
+
+
+ + ♻ ☆ Self-regulating Prompts: Foundational Model Adaptation without + Forgetting ICCV-2023 + + +
+ Prompt learning has emerged as an efficient alternative for fine-tuning +foundational models, such as CLIP, for various downstream tasks. Conventionally +trained using the task-specific objective, i.e., cross-entropy loss, prompts +tend to overfit downstream data distributions and find it challenging to +capture task-agnostic general features from the frozen CLIP. This leads to the +loss of the model's original generalization capability. To address this issue, +our work introduces a self-regularization framework for prompting called +PromptSRC (Prompting with Self-regulating Constraints). PromptSRC guides the +prompts to optimize for both task-specific and task-agnostic general +representations using a three-pronged approach by: (a) regulating prompted +representations via mutual agreement maximization with the frozen model, (b) +regulating with self-ensemble of prompts over the training trajectory to encode +their complementary strengths, and (c) regulating with textual diversity to +mitigate sample diversity imbalance with the visual branch. To the best of our +knowledge, this is the first regularization framework for prompt learning that +avoids overfitting by jointly attending to pre-trained model features, the +training trajectory during prompting, and the textual diversity. PromptSRC +explicitly steers the prompts to learn a representation space that maximizes +performance on downstream tasks without compromising CLIP generalization. We +perform extensive experiments on 4 benchmarks where PromptSRC overall performs +favorably well compared to the existing methods. Our code and pre-trained +models are publicly available at: https://github.com/muzairkhattak/PromptSRC. + +
+
+ comment: Accepted to ICCV-2023. Camera-Ready version. Project page: + https://muzairkhattak.github.io/PromptSRC/ +
+
+
+
+
+ + ♻ ☆ Video-FocalNets: Spatio-Temporal Focal Modulation for Video Action + Recognition ICCV-2023 + + +
+ Recent video recognition models utilize Transformer models for long-range +spatio-temporal context modeling. Video transformer designs are based on +self-attention that can model global context at a high computational cost. In +comparison, convolutional designs for videos offer an efficient alternative but +lack long-range dependency modeling. Towards achieving the best of both +designs, this work proposes Video-FocalNet, an effective and efficient +architecture for video recognition that models both local and global contexts. +Video-FocalNet is based on a spatio-temporal focal modulation architecture that +reverses the interaction and aggregation steps of self-attention for better +efficiency. Further, the aggregation step and the interaction step are both +implemented using efficient convolution and element-wise multiplication +operations that are computationally less expensive than their self-attention +counterparts on video representations. We extensively explore the design space +of focal modulation-based spatio-temporal context modeling and demonstrate our +parallel spatial and temporal encoding design to be the optimal choice. +Video-FocalNets perform favorably well against the state-of-the-art +transformer-based models for video recognition on five large-scale datasets +(Kinetics-400, Kinetics-600, SS-v2, Diving-48, and ActivityNet-1.3) at a lower +computational cost. Our code/models are released at +https://github.com/TalalWasim/Video-FocalNets. + +
+
+ comment: Accepted to ICCV-2023. Camera-Ready version. Project page: + https://TalalWasim.github.io/Video-FocalNets/ +
+
+
+
+
+ + ♻ ☆ Efficient data transport over multimode light-pipes with Megapixel + images using differentiable ray tracing and Machine-learning + + +
+ Retrieving images transmitted through multi-mode fibers is of growing +interest, thanks to their ability to confine and transport light efficiently in +a compact system. Here, we demonstrate machine-learning-based decoding of +large-scale digital images (pages), maximizing page capacity for optical +storage applications. Using a millimeter-sized square cross-section waveguide, +we image an 8-bit spatial light modulator, presenting data as a matrix of +symbols. Normally, decoders will incur a prohibitive O(n^2) computational +scaling to decode n symbols in spatially scrambled data. However, by combining +a digital twin of the setup with a U-Net, we can retrieve up to 66 kB using +efficient convolutional operations only. We compare trainable ray-tracing-based +with eigenmode-based twins and show the former to be superior thanks to its +ability to overcome the simulation-to-experiment gap by adjusting to optical +imperfections. We train the pipeline end-to-end using a differentiable +mutual-information estimator based on the von-Mises distribution, generally +applicable to phase-coding channels. + +
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Improving Sample Quality of Diffusion Models Using Self-Attention + Guidance ICCV 2023 + + +
+ Denoising diffusion models (DDMs) have attracted attention for their +exceptional generation quality and diversity. This success is largely +attributed to the use of class- or text-conditional diffusion guidance methods, +such as classifier and classifier-free guidance. In this paper, we present a +more comprehensive perspective that goes beyond the traditional guidance +methods. From this generalized perspective, we introduce novel condition- and +training-free strategies to enhance the quality of generated images. As a +simple solution, blur guidance improves the suitability of intermediate samples +for their fine-scale information and structures, enabling diffusion models to +generate higher quality samples with a moderate guidance scale. Improving upon +this, Self-Attention Guidance (SAG) uses the intermediate self-attention maps +of diffusion models to enhance their stability and efficacy. Specifically, SAG +adversarially blurs only the regions that diffusion models attend to at each +iteration and guides them accordingly. Our experimental results show that our +SAG improves the performance of various diffusion models, including ADM, IDDPM, +Stable Diffusion, and DiT. Moreover, combining SAG with conventional guidance +methods leads to further improvement. + +
+
+ comment: Accepted to ICCV 2023. Project Page: + https://ku-cvlab.github.io/Self-Attention-Guidance +
+
+
+
+
+ + ♻ ☆ Zolly: Zoom Focal Length Correctly for Perspective-Distorted Human Mesh + Reconstruction + + +
+ As it is hard to calibrate single-view RGB images in the wild, existing 3D +human mesh reconstruction (3DHMR) methods either use a constant large focal +length or estimate one based on the background environment context, which can +not tackle the problem of the torso, limb, hand or face distortion caused by +perspective camera projection when the camera is close to the human body. The +naive focal length assumptions can harm this task with the incorrectly +formulated projection matrices. To solve this, we propose Zolly, the first +3DHMR method focusing on perspective-distorted images. Our approach begins with +analysing the reason for perspective distortion, which we find is mainly caused +by the relative location of the human body to the camera center. We propose a +new camera model and a novel 2D representation, termed distortion image, which +describes the 2D dense distortion scale of the human body. We then estimate the +distance from distortion scale features rather than environment context +features. Afterwards, we integrate the distortion feature with image features +to reconstruct the body mesh. To formulate the correct projection matrix and +locate the human body position, we simultaneously use perspective and +weak-perspective projection loss. Since existing datasets could not handle this +task, we propose the first synthetic dataset PDHuman and extend two real-world +datasets tailored for this task, all containing perspective-distorted human +images. Extensive experiments show that Zolly outperforms existing +state-of-the-art methods on both perspective-distorted datasets and the +standard benchmark (3DPW). + +
+
+
+
+
+ + ♻ ☆ Multimodal Image Synthesis and Editing: The Generative AI Era + + +
+ As information exists in various modalities in real world, effective +interaction and fusion among multimodal information plays a key role for the +creation and perception of multimodal data in computer vision and deep learning +research. With superb power in modeling the interaction among multimodal +information, multimodal image synthesis and editing has become a hot research +topic in recent years. Instead of providing explicit guidance for network +training, multimodal guidance offers intuitive and flexible means for image +synthesis and editing. On the other hand, this field is also facing several +challenges in alignment of multimodal features, synthesis of high-resolution +images, faithful evaluation metrics, etc. In this survey, we comprehensively +contextualize the advance of the recent multimodal image synthesis and editing +and formulate taxonomies according to data modalities and model types. We start +with an introduction to different guidance modalities in image synthesis and +editing, and then describe multimodal image synthesis and editing approaches +extensively according to their model types. After that, we describe benchmark +datasets and evaluation metrics as well as corresponding experimental results. +Finally, we provide insights about the current research challenges and possible +directions for future research. A project associated with this survey is +available at https://github.com/fnzhan/Generative-AI. + +
+
+ comment: TPAMI 2023 +
+
+
+
+
+ + ♻ ☆ MIPS-Fusion: Multi-Implicit-Submaps for Scalable and Robust Online + Neural RGB-D Reconstruction + + +
+ We introduce MIPS-Fusion, a robust and scalable online RGB-D reconstruction +method based on a novel neural implicit representation -- +multi-implicit-submap. Different from existing neural RGB-D reconstruction +methods lacking either flexibility with a single neural map or scalability due +to extra storage of feature grids, we propose a pure neural representation +tackling both difficulties with a divide-and-conquer design. In our method, +neural submaps are incrementally allocated alongside the scanning trajectory +and efficiently learned with local neural bundle adjustments. The submaps can +be refined individually in a back-end optimization and optimized jointly to +realize submap-level loop closure. Meanwhile, we propose a hybrid tracking +approach combining randomized and gradient-based pose optimizations. For the +first time, randomized optimization is made possible in neural tracking with +several key designs to the learning process, enabling efficient and robust +tracking even under fast camera motions. The extensive evaluation demonstrates +that our method attains higher reconstruction quality than the state of the +arts for large-scale scenes and under fast camera motions. + +
+
+
+
+
+ + ♻ ☆ Algorithmic progress in computer vision + + +
+ We investigate algorithmic progress in image classification on ImageNet, +perhaps the most well-known test bed for computer vision. We estimate a model, +informed by work on neural scaling laws, and infer a decomposition of progress +into the scaling of compute, data, and algorithms. Using Shapley values to +attribute performance improvements, we find that algorithmic improvements have +been roughly as important as the scaling of compute for progress computer +vision. Our estimates indicate that algorithmic innovations mostly take the +form of compute-augmenting algorithmic advances (which enable researchers to +get better performance from less compute), not data-augmenting algorithmic +advances. We find that compute-augmenting algorithmic advances are made at a +pace more than twice as fast as the rate usually associated with Moore's law. +In particular, we estimate that compute-augmenting innovations halve compute +requirements every nine months (95\% confidence interval: 4 to 25 months). + +
+
+
+
+
+ + ♻ ☆ Boosting Convolution with Efficient MLP-Permutation for Volumetric + Medical Image Segmentation + + +
+ Recently, the advent of vision Transformer (ViT) has brought substantial +advancements in 3D dataset benchmarks, particularly in 3D volumetric medical +image segmentation (Vol-MedSeg). Concurrently, multi-layer perceptron (MLP) +network has regained popularity among researchers due to their comparable +results to ViT, albeit with the exclusion of the resource-intensive +self-attention module. In this work, we propose a novel permutable hybrid +network for Vol-MedSeg, named PHNet, which capitalizes on the strengths of both +convolution neural networks (CNNs) and MLP. PHNet addresses the intrinsic +isotropy problem of 3D volumetric data by employing a combination of 2D and 3D +CNNs to extract local features. Besides, we propose an efficient multi-layer +permute perceptron (MLPP) module that captures long-range dependence while +preserving positional information. This is achieved through an axis +decomposition operation that permutes the input tensor along different axes, +thereby enabling the separate encoding of the positional information. +Furthermore, MLPP tackles the resolution sensitivity issue of MLP in Vol-MedSeg +with a token segmentation operation, which divides the feature into smaller +tokens and processes them individually. Extensive experimental results validate +that PHNet outperforms the state-of-the-art methods with lower computational +costs on the widely-used yet challenging COVID-19-20 and Synapse benchmarks. +The ablation study also demonstrates the effectiveness of PHNet in harnessing +the strengths of both CNNs and MLP. + +
+
+
+
+
+ + ♻ ☆ VeriCompress: A Tool to Streamline the Synthesis of Verified Robust + Compressed Neural Networks from Scratch + + +
+ AI's widespread integration has led to neural networks (NNs) deployment on +edge and similar limited-resource platforms for safety-critical scenarios. Yet, +NN's fragility raises concerns about reliable inference. Moreover, constrained +platforms demand compact networks. This study introduces VeriCompress, a tool +that automates the search and training of compressed models with robustness +guarantees. These models are well-suited for safety-critical applications and +adhere to predefined architecture and size limitations, making them deployable +on resource-restricted platforms. The method trains models 2-3 times faster +than the state-of-the-art approaches, surpassing relevant baseline approaches +by average accuracy and robustness gains of 15.1 and 9.8 percentage points, +respectively. When deployed on a resource-restricted generic platform, these +models require 5-8 times less memory and 2-4 times less inference time than +models used in verified robustness literature. Our comprehensive evaluation +across various model architectures and datasets, including MNIST, CIFAR, SVHN, +and a relevant pedestrian detection dataset, showcases VeriCompress's capacity +to identify compressed verified robust models with reduced computation overhead +compared to current standards. This underscores its potential as a valuable +tool for end users, such as developers of safety-critical applications on edge +or Internet of Things platforms, empowering them to create suitable models for +safety-critical, resource-constrained platforms in their respective domains. + +
+
+ comment: 9 pages, 5 tables, 1 figure +
+
+
+
+
+ + ♻ ☆ Reliable Multimodality Eye Disease Screening via Mixture of Student's t + Distributions MICCAI 2023 + + +
+ Multimodality eye disease screening is crucial in ophthalmology as it +integrates information from diverse sources to complement their respective +performances. However, the existing methods are weak in assessing the +reliability of each unimodality, and directly fusing an unreliable modality may +cause screening errors. To address this issue, we introduce a novel +multimodality evidential fusion pipeline for eye disease screening, EyeMoSt, +which provides a measure of confidence for unimodality and elegantly integrates +the multimodality information from a multi-distribution fusion perspective. +Specifically, our model estimates both local uncertainty for unimodality and +global uncertainty for the fusion modality to produce reliable classification +results. More importantly, the proposed mixture of Student's $t$ distributions +adaptively integrates different modalities to endow the model with heavy-tailed +properties, increasing robustness and reliability. Our experimental findings on +both public and in-house datasets show that our model is more reliable than +current methods. Additionally, EyeMost has the potential ability to serve as a +data quality discriminator, enabling reliable decision-making for multimodality +eye disease screening. + +
+
+ comment: MICCAI 2023 (Early accept):11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Edge-aware Hard Clustering Graph Pooling for Brain Imaging Data + + +
+ Graph Convolutional Networks (GCNs) can capture non-Euclidean spatial +dependence between different brain regions, and the graph pooling operator in +GCNs is key to enhancing the representation learning capability and acquiring +abnormal brain maps. However, the majority of existing research designs graph +pooling operators only from the perspective of nodes while disregarding the +original edge features, in a way that not only confines graph pooling +application scenarios, but also diminishes its ability to capture critical +substructures. In this study, a clustering graph pooling method that first +supports multidimensional edge features, called Edge-aware hard clustering +graph pooling (EHCPool), is developed. EHCPool proposes the first +'Edge-to-node' score evaluation criterion based on edge features to assess node +feature significance. To more effectively capture the critical subgraphs, a +novel Iteration n-top strategy is further designed to adaptively learn sparse +hard clustering assignments for graphs. Subsequently, an innovative N-E +Aggregation strategy is presented to aggregate node and edge feature +information in each independent subgraph. The proposed model was evaluated on +multi-site brain imaging public datasets and yielded state-of-the-art +performance. We believe this method is the first deep learning tool with the +potential to probe different types of abnormal functional brain networks from +data-driven perspective. + +
+
+
+
+
+ + ♻ ☆ Dealing with Small Datasets for Deep Learning in Medical Imaging: An + Evaluation of Self-Supervised Pre-Training on CT Scans Comparing Contrastive + and Masked Autoencoder Methods for Convolutional Models + + +
+ Deep learning in medical imaging has the potential to minimize the risk of +diagnostic errors, reduce radiologist workload, and accelerate diagnosis. +Training such deep learning models requires large and accurate datasets, with +annotations for all training samples. However, in the medical imaging domain, +annotated datasets for specific tasks are often small due to the high +complexity of annotations, limited access, or the rarity of diseases. To +address this challenge, deep learning models can be pre-trained on large image +datasets without annotations using methods from the field of self-supervised +learning. After pre-training, small annotated datasets are sufficient to +fine-tune the models for a specific task. The most popular self-supervised +pre-training approaches in medical imaging are based on contrastive learning. +However, recent studies in natural image processing indicate a strong potential +for masked autoencoder approaches. Our work compares state-of-the-art +contrastive learning methods with the recently introduced masked autoencoder +approach "SparK" for convolutional neural networks (CNNs) on medical images. +Therefore we pre-train on a large unannotated CT image dataset and fine-tune on +several CT classification tasks. Due to the challenge of obtaining sufficient +annotated training data in medical imaging, it is of particular interest to +evaluate how the self-supervised pre-training methods perform when fine-tuning +on small datasets. By experimenting with gradually reducing the training +dataset size for fine-tuning, we find that the reduction has different effects +depending on the type of pre-training chosen. The SparK pre-training method is +more robust to the training dataset size than the contrastive methods. Based on +our results, we propose the SparK pre-training for medical imaging tasks with +only small annotated datasets. + +
+
+ comment: This paper is under review. The code will be released if accepted +
+
+
+
+
+ + ♻ ☆ Self-Supervised Training with Autoencoders for Visual Anomaly Detection + + +
+ Deep autoencoders provide an effective tool for learning non-linear +dimensionality reduction in an unsupervised way. Recently, they have been used +for the task of anomaly detection in the visual domain. By optimizing for the +reconstruction error using anomaly-free examples, the common belief is that a +corresponding network should fail to accurately reconstruct anomalous regions +in the application phase. This goal is typically addressed by controlling the +capacity of the network, either by reducing the size of the bottleneck layer or +by enforcing sparsity constraints on the activations. However, neither of these +techniques does explicitly penalize reconstruction of anomalous signals often +resulting in poor detection. We tackle this problem by adapting a +self-supervised learning regime that allows the use of discriminative +information during training but focuses on the data manifold of normal +examples. We emphasize that inference with our approach is very efficient +during training and prediction requiring a single forward pass for each input +image. Our experiments on the MVTec AD dataset demonstrate high detection and +localization performance. On the texture-subset, in particular, our approach +consistently outperforms recent anomaly detection methods by a significant +margin. + +
+
+
+
+
+ + ♻ ☆ Learning A Coarse-to-Fine Diffusion Transformer for Image Restoration + + +
+ Recent years have witnessed the remarkable performance of diffusion models in +various vision tasks. However, for image restoration that aims to recover clear +images with sharper details from given degraded observations, diffusion-based +methods may fail to recover promising results due to inaccurate noise +estimation. Moreover, simple constraining noises cannot effectively learn +complex degradation information, which subsequently hinders the model capacity. +To solve the above problems, we propose a coarse-to-fine diffusion Transformer +(C2F-DFT) for image restoration. Specifically, our C2F-DFT contains diffusion +self-attention (DFSA) and diffusion feed-forward network (DFN) within a new +coarse-to-fine training scheme. The DFSA and DFN respectively capture the +long-range diffusion dependencies and learn hierarchy diffusion representation +to facilitate better restoration. In the coarse training stage, our C2F-DFT +estimates noises and then generates the final clean image by a sampling +algorithm. To further improve the restoration quality, we propose a simple yet +effective fine training scheme. It first exploits the coarse-trained diffusion +model with fixed steps to generate restoration results, which then would be +constrained with corresponding ground-truth ones to optimize the models to +remedy the unsatisfactory results affected by inaccurate noise estimation. +Extensive experiments show that C2F-DFT significantly outperforms +diffusion-based restoration method IR-SDE and achieves competitive performance +compared with Transformer-based state-of-the-art methods on $3$ tasks, +including deraining, deblurring, and real denoising. The code is available at +https://github.com/wlydlut/C2F-DFT. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking + Portrait Synthesis ICCV 2023 + + +
+ This paper presents ER-NeRF, a novel conditional Neural Radiance Fields +(NeRF) based architecture for talking portrait synthesis that can concurrently +achieve fast convergence, real-time rendering, and state-of-the-art performance +with small model size. Our idea is to explicitly exploit the unequal +contribution of spatial regions to guide talking portrait modeling. +Specifically, to improve the accuracy of dynamic head reconstruction, a compact +and expressive NeRF-based Tri-Plane Hash Representation is introduced by +pruning empty spatial regions with three planar hash encoders. For speech +audio, we propose a Region Attention Module to generate region-aware condition +feature via an attention mechanism. Different from existing methods that +utilize an MLP-based encoder to learn the cross-modal relation implicitly, the +attention mechanism builds an explicit connection between audio features and +spatial regions to capture the priors of local motions. Moreover, a direct and +fast Adaptive Pose Encoding is introduced to optimize the head-torso separation +problem by mapping the complex transformation of the head pose into spatial +coordinates. Extensive experiments demonstrate that our method renders better +high-fidelity and audio-lips synchronized talking portrait videos, with +realistic details and high efficiency compared to previous methods. + +
+
+ comment: Accepted by ICCV 2023. Project page: + https://fictionarry.github.io/ER-NeRF/ +
+
+
+
+
+ + ♻ ☆ Reconstructing Pruned Filters using Cheap Spatial Transformations ICCV 2023 + + +
+ We present an efficient alternative to the convolutional layer using cheap +spatial transformations. This construction exploits an inherent spatial +redundancy of the learned convolutional filters to enable a much greater +parameter efficiency, while maintaining the top-end accuracy of their dense +counter-parts. Training these networks is modelled as a generalised pruning +problem, whereby the pruned filters are replaced with cheap transformations +from the set of non-pruned filters. We provide an efficient implementation of +the proposed layer, followed by two natural extensions to avoid excessive +feature compression and to improve the expressivity of the transformed +features. We show that these networks can achieve comparable or improved +performance to state-of-the-art pruning models across both the CIFAR-10 and +ImageNet-1K datasets. + +
+
+ comment: ICCV 2023 Workshop on Resource Efficient Deep Learning for Computer + Vision +
+
+
+
+
+ + ♻ ☆ E2E-LOAD: End-to-End Long-form Online Action Detection + + +
+ Recently, there has been a growing trend toward feature-based approaches for +Online Action Detection (OAD). However, these approaches have limitations due +to their fixed backbone design, which ignores the potential capability of a +trainable backbone. In this paper, we propose the first end-to-end OAD model, +termed E2E-LOAD, designed to address the major challenge of OAD, namely, +long-term understanding and efficient online reasoning. Specifically, our +proposed approach adopts an initial spatial model that is shared by all frames +and maintains a long sequence cache for inference at a low computational cost. +We also advocate an asymmetric spatial-temporal model for long-form and +short-form modeling effectively. Furthermore, we propose a novel and efficient +inference mechanism that accelerates heavy spatial-temporal exploration. +Extensive ablation studies and experiments demonstrate the effectiveness and +efficiency of our proposed method. Notably, we achieve 17.3 (+12.6) FPS for +end-to-end OAD with 72.4%~(+1.2%), 90.3%~(+0.7%), and 48.1%~(+26.0%) mAP on +THMOUS14, TVSeries, and HDD, respectively, which is 3x faster than previous +approaches. The source code will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ What can a cook in Italy teach a mechanic in India? Action Recognition + Generalisation Over Scenarios and Locations ICCV 2023 + + +
+ We propose and address a new generalisation problem: can a model trained for +action recognition successfully classify actions when they are performed within +a previously unseen scenario and in a previously unseen location? To answer +this question, we introduce the Action Recognition Generalisation Over +scenarios and locations dataset (ARGO1M), which contains 1.1M video clips from +the large-scale Ego4D dataset, across 10 scenarios and 13 locations. We +demonstrate recognition models struggle to generalise over 10 proposed test +splits, each of an unseen scenario in an unseen location. We thus propose CIR, +a method to represent each video as a Cross-Instance Reconstruction of videos +from other domains. Reconstructions are paired with text narrations to guide +the learning of a domain generalisable representation. We provide extensive +analysis and ablations on ARGO1M that show CIR outperforms prior domain +generalisation works on all test splits. Code and data: +https://chiaraplizz.github.io/what-can-a-cook/. + +
+
+ comment: Accepted at ICCV 2023. Project page: + https://chiaraplizz.github.io/what-can-a-cook/ +
+
+
+
+
+ + ♻ ☆ Flow-Guided Controllable Line Drawing Generation + + +
+ In this paper, we investigate the problem of automatically controllable +artistic character line drawing generation from photographs by proposing a +Vector Flow Aware and Line Controllable Image-to-Image Translation +architecture, which can be viewed as an appealing intersection between +Artificial Intelligence and Arts. Specifically, we first present an +Image-to-Flow network (I2FNet) to efficiently and robustly create the vector +flow field in a learning-based manner, which can provide a direction guide for +drawing lines. Then, we introduce our well-designed Double Flow Generator (DFG) +framework to fuse features from learned vector flow and input image flow +guaranteeing the spatial coherence of lines. Meanwhile, in order to allow for +controllable character line drawing generation, we integrate a Line Control +Matrix (LCM) into DFG and train a Line Control Regressor (LCR) to synthesize +drawings with different styles by elaborately controlling the level of details, +such as thickness, smoothness, and continuity, of lines. Finally, we design a +Fourier Transformation Loss to further constrain the character line generation +from the frequency domain view of the point. Quantitative and qualitative +experiments demonstrate that our approach can obtain superior performance in +producing high-resolution character line-drawing images with perceptually +realistic characteristics. + +
+
+
+
+
+ + ♻ ☆ A Video-based Detector for Suspicious Activity in Examination with + OpenPose + + +
+ Examinations are a crucial part of the learning process, and academic +institutions invest significant resources into maintaining their integrity by +preventing cheating from students or facilitators. However, cheating has become +rampant in examination setups, compromising their integrity. The traditional +method of relying on invigilators to monitor every student is impractical and +ineffective. To address this issue, there is a need to continuously record exam +sessions to monitor students for suspicious activities. However, these +recordings are often too lengthy for invigilators to analyze effectively, and +fatigue may cause them to miss significant details. To widen the coverage, +invigilators could use fixed overhead or wearable cameras. This paper +introduces a framework that uses automation to analyze videos and detect +suspicious activities during examinations efficiently and effectively. We +utilized the OpenPose framework and Convolutional Neural Network (CNN) to +identify students exchanging objects during exams. This detection system is +vital in preventing cheating and promoting academic integrity, fairness, and +quality education for institutions. + +
+
+
+
+
+ + ♻ ☆ Face Encryption via Frequency-Restricted Identity-Agnostic Attacks + + +
+ Billions of people are sharing their daily live images on social media +everyday. However, malicious collectors use deep face recognition systems to +easily steal their biometric information (e.g., faces) from these images. Some +studies are being conducted to generate encrypted face photos using adversarial +attacks by introducing imperceptible perturbations to reduce face information +leakage. However, existing studies need stronger black-box scenario feasibility +and more natural visual appearances, which challenge the feasibility of privacy +protection. To address these problems, we propose a frequency-restricted +identity-agnostic (FRIA) framework to encrypt face images from unauthorized +face recognition without access to personal information. As for the weak +black-box scenario feasibility, we obverse that representations of the average +feature in multiple face recognition models are similar, thus we propose to +utilize the average feature via the crawled dataset from the Internet as the +target to guide the generation, which is also agnostic to identities of unknown +face recognition systems; in nature, the low-frequency perturbations are more +visually perceptible by the human vision system. Inspired by this, we restrict +the perturbation in the low-frequency facial regions by discrete cosine +transform to achieve the visual naturalness guarantee. Extensive experiments on +several face recognition models demonstrate that our FRIA outperforms other +state-of-the-art methods in generating more natural encrypted faces while +attaining high black-box attack success rates of 96%. In addition, we validate +the efficacy of FRIA using real-world black-box commercial API, which reveals +the potential of FRIA in practice. Our codes can be found in +https://github.com/XinDong10/FRIA. + +
+
+
+
+
+ + ♻ ☆ Tackling Face Verification Edge Cases: In-Depth Analysis and + Human-Machine Fusion Approach + + +
+ Nowadays, face recognition systems surpass human performance on several +datasets. However, there are still edge cases that the machine can't correctly +classify. This paper investigates the effect of a combination of machine and +human operators in the face verification task. First, we look closer at the +edge cases for several state-of-the-art models to discover common datasets' +challenging settings. Then, we conduct a study with 60 participants on these +selected tasks with humans and provide an extensive analysis. Finally, we +demonstrate that combining machine and human decisions can further improve the +performance of state-of-the-art face verification systems on various benchmark +datasets. Code and data are publicly available on GitHub. + +
+
+
+
+
+ + ♻ ☆ VAD: Vectorized Scene Representation for Efficient Autonomous Driving ICCV 2023 + + +
+ Autonomous driving requires a comprehensive understanding of the surrounding +environment for reliable trajectory planning. Previous works rely on dense +rasterized scene representation (e.g., agent occupancy and semantic map) to +perform planning, which is computationally intensive and misses the +instance-level structure information. In this paper, we propose VAD, an +end-to-end vectorized paradigm for autonomous driving, which models the driving +scene as a fully vectorized representation. The proposed vectorized paradigm +has two significant advantages. On one hand, VAD exploits the vectorized agent +motion and map elements as explicit instance-level planning constraints which +effectively improves planning safety. On the other hand, VAD runs much faster +than previous end-to-end planning methods by getting rid of +computation-intensive rasterized representation and hand-designed +post-processing steps. VAD achieves state-of-the-art end-to-end planning +performance on the nuScenes dataset, outperforming the previous best method by +a large margin. Our base model, VAD-Base, greatly reduces the average collision +rate by 29.0% and runs 2.5x faster. Besides, a lightweight variant, VAD-Tiny, +greatly improves the inference speed (up to 9.3x) while achieving comparable +planning performance. We believe the excellent performance and the high +efficiency of VAD are critical for the real-world deployment of an autonomous +driving system. Code and models are available at https://github.com/hustvl/VAD +for facilitating future research. + +
+
+ comment: Accepted to ICCV 2023. Code&Demos: https://github.com/hustvl/VAD +
+
+
+
+
+ + ♻ ☆ Multi-modal Pre-training for Medical Vision-language Understanding and + Generation: An Empirical Study with A New Benchmark + + +
+ With the availability of large-scale, comprehensive, and general-purpose +vision-language (VL) datasets such as MSCOCO, vision-language pre-training +(VLP) has become an active area of research and proven to be effective for +various VL tasks such as visual-question answering. However, studies on VLP in +the medical domain have so far been scanty. To provide a comprehensive +perspective on VLP for medical VL tasks, we conduct a thorough experimental +analysis to study key factors that may affect the performance of VLP with a +unified vision-language Transformer. To allow making sound and quick +pre-training decisions, we propose RadioGraphy Captions (RGC), a high-quality, +multi-modality radiographic dataset containing 18,434 image-caption pairs +collected from an open-access online database MedPix. RGC can be used as a +pre-training dataset or a new benchmark for medical report generation and +medical image-text retrieval. By utilizing RGC and other available datasets for +pre-training, we develop several key insights that can guide future medical VLP +research and new strong baselines for various medical VL tasks. + +
+
+ comment: Published as oral paper in CHIL 2023 +
+
+
+
+
+ + ♻ ☆ Efficient Joint Optimization of Layer-Adaptive Weight Pruning in Deep + Neural Networks + + +
+ In this paper, we propose a novel layer-adaptive weight-pruning approach for +Deep Neural Networks (DNNs) that addresses the challenge of optimizing the +output distortion minimization while adhering to a target pruning ratio +constraint. Our approach takes into account the collective influence of all +layers to design a layer-adaptive pruning scheme. We discover and utilize a +very important additivity property of output distortion caused by pruning +weights on multiple layers. This property enables us to formulate the pruning +as a combinatorial optimization problem and efficiently solve it through +dynamic programming. By decomposing the problem into sub-problems, we achieve +linear time complexity, making our optimization algorithm fast and feasible to +run on CPUs. Our extensive experiments demonstrate the superiority of our +approach over existing methods on the ImageNet and CIFAR-10 datasets. On +CIFAR-10, our method achieves remarkable improvements, outperforming others by +up to 1.0% for ResNet-32, 0.5% for VGG-16, and 0.7% for DenseNet-121 in terms +of top-1 accuracy. On ImageNet, we achieve up to 4.7% and 4.6% higher top-1 +accuracy compared to other methods for VGG-16 and ResNet-50, respectively. +These results highlight the effectiveness and practicality of our approach for +enhancing DNN performance through layer-adaptive weight pruning. Code will be +available on https://github.com/Akimoto-Cris/RD_VIT_PRUNE. + +
+
+
+
+
+ + ♻ ☆ HCDG: A Hierarchical Consistency Framework for Domain Generalization on + Medical Image Segmentation + + +
+ Modern deep neural networks struggle to transfer knowledge and generalize +across diverse domains when deployed to real-world applications. Currently, +domain generalization (DG) is introduced to learn a universal representation +from multiple domains to improve the network generalization ability on unseen +domains. However, previous DG methods only focus on the data-level consistency +scheme without considering the synergistic regularization among different +consistency schemes. In this paper, we present a novel Hierarchical Consistency +framework for Domain Generalization (HCDG) by integrating Extrinsic Consistency +and Intrinsic Consistency synergistically. Particularly, for the Extrinsic +Consistency, we leverage the knowledge across multiple source domains to +enforce data-level consistency. To better enhance such consistency, we design a +novel Amplitude Gaussian-mixing strategy into Fourier-based data augmentation +called DomainUp. For the Intrinsic Consistency, we perform task-level +consistency for the same instance under the dual-task scenario. We evaluate the +proposed HCDG framework on two medical image segmentation tasks, i.e., optic +cup/disc segmentation on fundus images and prostate MRI segmentation. Extensive +experimental results manifest the effectiveness and versatility of our HCDG +framework. + +
+
+ comment: this paper is currently not published +
+
+
+
+
+ + ♻ ☆ Visual Crowd Analysis: Open Research Problems + + +
+ Over the last decade, there has been a remarkable surge in interest in +automated crowd monitoring within the computer vision community. Modern +deep-learning approaches have made it possible to develop fully-automated +vision-based crowd-monitoring applications. However, despite the magnitude of +the issue at hand, the significant technological advancements, and the +consistent interest of the research community, there are still numerous +challenges that need to be overcome. In this article, we delve into six major +areas of visual crowd analysis, emphasizing the key developments in each of +these areas. We outline the crucial unresolved issues that must be tackled in +future works, in order to ensure that the field of automated crowd monitoring +continues to progress and thrive. Several surveys related to this topic have +been conducted in the past. Nonetheless, this article thoroughly examines and +presents a more intuitive categorization of works, while also depicting the +latest breakthroughs within the field, incorporating more recent studies +carried out within the last few years in a concise manner. By carefully +choosing prominent works with significant contributions in terms of novelty or +performance gains, this paper presents a more comprehensive exposition of +advancements in the current state-of-the-art. + +
+
+ comment: Accepted in AI Magazine published by Wiley Periodicals LLC on behalf + of the Association for the Advancement of Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Feature Unlearning for Pre-trained GANs and VAEs + + +
+ We tackle the problem of feature unlearning from a pre-trained image +generative model: GANs and VAEs. Unlike a common unlearning task where an +unlearning target is a subset of the training set, we aim to unlearn a specific +feature, such as hairstyle from facial images, from the pre-trained generative +models. As the target feature is only presented in a local region of an image, +unlearning the entire image from the pre-trained model may result in losing +other details in the remaining region of the image. To specify which features +to unlearn, we collect randomly generated images that contain the target +features. We then identify a latent representation corresponding to the target +feature and then use the representation to fine-tune the pre-trained model. +Through experiments on MNIST and CelebA datasets, we show that target features +are successfully removed while keeping the fidelity of the original models. +Further experiments with an adversarial attack show that the unlearned model is +more robust under the presence of malicious parties. + +
+
+
+
+
+ + ♻ ☆ CCDWT-GAN: Generative Adversarial Networks Based on Color Channel Using + Discrete Wavelet Transform for Document Image Binarization PRICAI 2023 + + +
+ To efficiently extract textual information from color degraded document +images is a significant research area. The prolonged imperfect preservation of +ancient documents has led to various types of degradation, such as page +staining, paper yellowing, and ink bleeding. These types of degradation badly +impact the image processing for features extraction. This paper introduces a +novelty method employing generative adversarial networks based on color channel +using discrete wavelet transform (CCDWT-GAN). The proposed method involves +three stages: image preprocessing, image enhancement, and image binarization. +In the initial step, we apply discrete wavelet transform (DWT) to retain the +low-low (LL) subband image, thereby enhancing image quality. Subsequently, we +divide the original input image into four single-channel colors (red, green, +blue, and gray) to separately train adversarial networks. For the extraction of +global and local features, we utilize the output image from the image +enhancement stage and the entire input image to train adversarial networks +independently, and then combine these two results as the final output. To +validate the positive impact of the image enhancement and binarization stages +on model performance, we conduct an ablation study. This work compares the +performance of the proposed method with other state-of-the-art (SOTA) methods +on DIBCO and H-DIBCO ((Handwritten) Document Image Binarization Competition) +datasets. The experimental results demonstrate that CCDWT-GAN achieves a top +two performance on multiple benchmark datasets. Notably, on DIBCO 2013 and 2016 +dataset, our method achieves F-measure (FM) values of 95.24 and 91.46, +respectively. + +
+
+ comment: accepted by PRICAI 2023 +
+
+
+
+
+ + ♻ ☆ ACLS: Adaptive and Conditional Label Smoothing for Network Calibration ICCV 2023 + + +
+ We address the problem of network calibration adjusting miscalibrated +confidences of deep neural networks. Many approaches to network calibration +adopt a regularization-based method that exploits a regularization term to +smooth the miscalibrated confidences. Although these approaches have shown the +effectiveness on calibrating the networks, there is still a lack of +understanding on the underlying principles of regularization in terms of +network calibration. We present in this paper an in-depth analysis of existing +regularization-based methods, providing a better understanding on how they +affect to network calibration. Specifically, we have observed that 1) the +regularization-based methods can be interpreted as variants of label smoothing, +and 2) they do not always behave desirably. Based on the analysis, we introduce +a novel loss function, dubbed ACLS, that unifies the merits of existing +regularization methods, while avoiding the limitations. We show extensive +experimental results for image classification and semantic segmentation on +standard benchmarks, including CIFAR10, Tiny-ImageNet, ImageNet, and PASCAL +VOC, demonstrating the effectiveness of our loss function. + +
+
+ comment: Accepted to ICCV 2023 (Oral presentation) +
+
+
+
+
+ + ♻ ☆ Towards an Accurate and Secure Detector against Adversarial + Perturbations + + +
+ The vulnerability of deep neural networks to adversarial perturbations has +been widely perceived in the computer vision community. From a security +perspective, it poses a critical risk for modern vision systems, e.g., the +popular Deep Learning as a Service (DLaaS) frameworks. For protecting +off-the-shelf deep models while not modifying them, current algorithms +typically detect adversarial patterns through discriminative decomposition of +natural-artificial data. However, these decompositions are biased towards +frequency or spatial discriminability, thus failing to capture adversarial +patterns comprehensively. More seriously, successful defense-aware (secondary) +adversarial attack (i.e., evading the detector as well as fooling the model) is +practical under the assumption that the adversary is fully aware of the +detector (i.e., the Kerckhoffs's principle). Motivated by such facts, we +propose an accurate and secure adversarial example detector, relying on a +spatial-frequency discriminative decomposition with secret keys. It expands the +above works on two aspects: 1) the introduced Krawtchouk basis provides better +spatial-frequency discriminability and thereby is more suitable for capturing +adversarial patterns than the common trigonometric or wavelet basis; 2) the +extensive parameters for decomposition are generated by a pseudo-random +function with secret keys, hence blocking the defense-aware adversarial attack. +Theoretical and numerical analysis demonstrates the increased accuracy and +security of our detector with respect to a number of state-of-the-art +algorithms. + +
+
+
+
+
+ + ♻ ☆ Strivec: Sparse Tri-Vector Radiance Fields + + +
+ We propose Strivec, a novel neural representation that models a 3D scene as a +radiance field with sparsely distributed and compactly factorized local tensor +feature grids. Our approach leverages tensor decomposition, following the +recent work TensoRF, to model the tensor grids. In contrast to TensoRF which +uses a global tensor and focuses on their vector-matrix decomposition, we +propose to utilize a cloud of local tensors and apply the classic +CANDECOMP/PARAFAC (CP) decomposition to factorize each tensor into triple +vectors that express local feature distributions along spatial axes and +compactly encode a local neural field. We also apply multi-scale tensor grids +to discover the geometry and appearance commonalities and exploit spatial +coherence with the tri-vector factorization at multiple local scales. The final +radiance field properties are regressed by aggregating neural features from +multiple local tensors across all scales. Our tri-vector tensors are sparsely +distributed around the actual scene surface, discovered by a fast coarse +reconstruction, leveraging the sparsity of a 3D scene. We demonstrate that our +model can achieve better rendering quality while using significantly fewer +parameters than previous methods, including TensoRF and Instant-NGP. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Manifold Linearizing and Clustering + + +
+ We consider the problem of simultaneously clustering and learning a linear +representation of data lying close to a union of low-dimensional manifolds, a +fundamental task in machine learning and computer vision. When the manifolds +are assumed to be linear subspaces, this reduces to the classical problem of +subspace clustering, which has been studied extensively over the past two +decades. Unfortunately, many real-world datasets such as natural images can not +be well approximated by linear subspaces. On the other hand, numerous works +have attempted to learn an appropriate transformation of the data, such that +data is mapped from a union of general non-linear manifolds to a union of +linear subspaces (with points from the same manifold being mapped to the same +subspace). However, many existing works have limitations such as assuming +knowledge of the membership of samples to clusters, requiring high sampling +density, or being shown theoretically to learn trivial representations. In this +paper, we propose to optimize the Maximal Coding Rate Reduction metric with +respect to both the data representation and a novel doubly stochastic cluster +membership, inspired by state-of-the-art subspace clustering results. We give a +parameterization of such a representation and membership, allowing efficient +mini-batching and one-shot initialization. Experiments on CIFAR-10, -20, -100, +and TinyImageNet-200 datasets show that the proposed method is much more +accurate and scalable than state-of-the-art deep clustering methods, and +further learns a latent linear representation of the data. + +
+
+
+
+
+ + ♻ ☆ BallGAN: 3D-aware Image Synthesis with a Spherical Background ICCV 2023 + + +
+ 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be +rendered in arbitrary perspectives to produce images. Although previous methods +produce realistic images, they suffer from unstable training or degenerate +solutions where the 3D geometry is unnatural. We hypothesize that the 3D +geometry is underdetermined due to the insufficient constraint, i.e., being +classified as real image to the discriminator is not enough. To solve this +problem, we propose to approximate the background as a spherical surface and +represent a scene as a union of the foreground placed in the sphere and the +thin spherical background. It reduces the degree of freedom in the background +field. Accordingly, we modify the volume rendering equation and incorporate +dedicated constraints to design a novel 3D-aware GAN framework named BallGAN. +BallGAN has multiple advantages as follows. 1) It produces more reasonable 3D +geometry; the images of a scene across different viewpoints have better +photometric consistency and fidelity than the state-of-the-art methods. 2) The +training becomes much more stable. 3) The foreground can be separately rendered +on top of different arbitrary backgrounds. + +
+
+ comment: ICCV 2023, Project Page: https://minjung-s.github.io/ballgan +
+
+
+
+
+ + ♻ ☆ R2Det: Redemption from Range-view for Accurate 3D Object Detection + + +
+ LiDAR-based 3D object detection is of paramount importance for autonomous +driving. Recent trends show a remarkable improvement for bird's-eye-view (BEV) +based and point-based methods as they demonstrate superior performance compared +to range-view counterparts. This paper presents an insight that leverages +range-view representation to enhance 3D points for accurate 3D object +detection. Specifically, we introduce a Redemption from Range-view Module +(R2M), a plug-and-play approach for 3D surface texture enhancement from the 2D +range view to the 3D point view. R2M comprises BasicBlock for 2D feature +extraction, Hierarchical-dilated (HD) Meta Kernel for expanding the 3D +receptive field, and Feature Points Redemption (FPR) for recovering 3D surface +texture information. R2M can be seamlessly integrated into state-of-the-art +LiDAR-based 3D object detectors as preprocessing and achieve appealing +improvement, e.g., 1.39%, 1.67%, and 1.97% mAP improvement on easy, moderate, +and hard difficulty level of KITTI val set, respectively. Based on R2M, we +further propose R2Detector (R2Det) with the Synchronous-Grid RoI Pooling for +accurate box refinement. R2Det outperforms existing range-view-based methods by +a significant margin on both the KITTI benchmark and the Waymo Open Dataset. +Codes will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ GridMM: Grid Memory Map for Vision-and-Language Navigation ICCV 2023 + + +
+ Vision-and-language navigation (VLN) enables the agent to navigate to a +remote location following the natural language instruction in 3D environments. +To represent the previously visited environment, most approaches for VLN +implement memory using recurrent states, topological maps, or top-down semantic +maps. In contrast to these approaches, we build the top-down egocentric and +dynamically growing Grid Memory Map (i.e., GridMM) to structure the visited +environment. From a global perspective, historical observations are projected +into a unified grid map in a top-down view, which can better represent the +spatial relations of the environment. From a local perspective, we further +propose an instruction relevance aggregation method to capture fine-grained +visual clues in each grid region. Extensive experiments are conducted on both +the REVERIE, R2R, SOON datasets in the discrete environments, and the R2R-CE +dataset in the continuous environments, showing the superiority of our proposed +method. + +
+
+ comment: Accepted by ICCV 2023. The code is available at + https://github.com/MrZihan/GridMM +
+
+
+
+
+ + ♻ ☆ Neural Fourier Filter Bank + + +
+ We present a novel method to provide efficient and highly detailed +reconstructions. Inspired by wavelets, we learn a neural field that decompose +the signal both spatially and frequency-wise. We follow the recent grid-based +paradigm for spatial decomposition, but unlike existing work, encourage +specific frequencies to be stored in each grid via Fourier features encodings. +We then apply a multi-layer perceptron with sine activations, taking these +Fourier encoded features in at appropriate layers so that higher-frequency +components are accumulated on top of lower-frequency components sequentially, +which we sum up to form the final output. We demonstrate that our method +outperforms the state of the art regarding model compactness and convergence +speed on multiple tasks: 2D image fitting, 3D shape reconstruction, and neural +radiance fields. Our code is available at https://github.com/ubc-vision/NFFB. + +
+
+
+
+
+ + ♻ ☆ PointCaM: Cut-and-Mix for Open-Set Point Cloud Learning + + +
+ Point cloud learning is receiving increasing attention, however, most +existing point cloud models lack the practical ability to deal with the +unavoidable presence of unknown objects. This paper mainly discusses point +cloud learning under open-set settings, where we train the model without data +from unknown classes and identify them in the inference stage. Basically, we +propose to solve open-set point cloud learning using a novel Point Cut-and-Mix +mechanism consisting of Unknown-Point Simulator and Unknown-Point Estimator +modules. Specifically, we use the Unknown-Point Simulator to simulate +out-of-distribution data in the training stage by manipulating the geometric +context of partial known data. Based on this, the Unknown-Point Estimator +module learns to exploit the point cloud's feature context for discriminating +the known and unknown data. Extensive experiments show the plausibility of +open-set point cloud learning and the effectiveness of our proposed solutions. +Our code is available at \url{https://github.com/ShiQiu0419/pointcam}. + +
+
+
+
+
+ + ♻ ☆ MonoDETR: Depth-guided Transformer for Monocular 3D Object Detection ICCV 2023 + + +
+ Monocular 3D object detection has long been a challenging task in autonomous +driving. Most existing methods follow conventional 2D detectors to first +localize object centers, and then predict 3D attributes by neighboring +features. However, only using local visual features is insufficient to +understand the scene-level 3D spatial structures and ignores the long-range +inter-object depth relations. In this paper, we introduce the first DETR +framework for Monocular DEtection with a depth-guided TRansformer, named +MonoDETR. We modify the vanilla transformer to be depth-aware and guide the +whole detection process by contextual depth cues. Specifically, concurrent to +the visual encoder that captures object appearances, we introduce to predict a +foreground depth map, and specialize a depth encoder to extract non-local depth +embeddings. Then, we formulate 3D object candidates as learnable queries and +propose a depth-guided decoder to conduct object-scene depth interactions. In +this way, each object query estimates its 3D attributes adaptively from the +depth-guided regions on the image and is no longer constrained to local visual +features. On KITTI benchmark with monocular images as input, MonoDETR achieves +state-of-the-art performance and requires no extra dense depth annotations. +Besides, our depth-guided modules can also be plug-and-play to enhance +multi-view 3D object detectors on nuScenes dataset, demonstrating our superior +generalization capacity. Code is available at +https://github.com/ZrrSkywalker/MonoDETR. + +
+
+ comment: Accepted by ICCV 2023. Code is available at + https://github.com/ZrrSkywalker/MonoDETR +
+
+
+
+
+ + ♻ ☆ Pluralistic Aging Diffusion Autoencoder ICCV 2023 + + +
+ Face aging is an ill-posed problem because multiple plausible aging patterns +may correspond to a given input. Most existing methods often produce one +deterministic estimation. This paper proposes a novel CLIP-driven Pluralistic +Aging Diffusion Autoencoder (PADA) to enhance the diversity of aging patterns. +First, we employ diffusion models to generate diverse low-level aging details +via a sequential denoising reverse process. Second, we present Probabilistic +Aging Embedding (PAE) to capture diverse high-level aging patterns, which +represents age information as probabilistic distributions in the common CLIP +latent space. A text-guided KL-divergence loss is designed to guide this +learning. Our method can achieve pluralistic face aging conditioned on +open-world aging texts and arbitrary unseen face images. Qualitative and +quantitative experiments demonstrate that our method can generate more diverse +and high-quality plausible aging results. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Integrated Image and Location Analysis for Wound Classification: A Deep + Learning Approach + + +
+ The global burden of acute and chronic wounds presents a compelling case for +enhancing wound classification methods, a vital step in diagnosing and +determining optimal treatments. Recognizing this need, we introduce an +innovative multi-modal network based on a deep convolutional neural network for +categorizing wounds into four categories: diabetic, pressure, surgical, and +venous ulcers. Our multi-modal network uses wound images and their +corresponding body locations for more precise classification. A unique aspect +of our methodology is incorporating a body map system that facilitates accurate +wound location tagging, improving upon traditional wound image classification +techniques. A distinctive feature of our approach is the integration of models +such as VGG16, ResNet152, and EfficientNet within a novel architecture. This +architecture includes elements like spatial and channel-wise +Squeeze-and-Excitation modules, Axial Attention, and an Adaptive Gated +Multi-Layer Perceptron, providing a robust foundation for classification. Our +multi-modal network was trained and evaluated on two distinct datasets +comprising relevant images and corresponding location information. Notably, our +proposed network outperformed traditional methods, reaching an accuracy range +of 74.79% to 100% for Region of Interest (ROI) without location +classifications, 73.98% to 100% for ROI with location classifications, and +78.10% to 100% for whole image classifications. This marks a significant +enhancement over previously reported performance metrics in the literature. Our +results indicate the potential of our multi-modal network as an effective +decision-support tool for wound image classification, paving the way for its +application in various clinical contexts. + +
+
+
+
+
+ + ♻ ☆ Trip-ROMA: Self-Supervised Learning with Triplets and Random Mappings + + +
+ Contrastive self-supervised learning (SSL) methods, such as MoCo and SimCLR, +have achieved great success in unsupervised visual representation learning. +They rely on a large number of negative pairs and thus require either large +memory banks or large batches. Some recent non-contrastive SSL methods, such as +BYOL and SimSiam, attempt to discard negative pairs and have also shown +remarkable performance. To avoid collapsed solutions caused by not using +negative pairs, these methods require non-trivial asymmetry designs. However, +in small data regimes, we can not obtain a sufficient number of negative pairs +or effectively avoid the over-fitting problem when negatives are not used at +all. To address this situation, we argue that negative pairs are still +important but one is generally sufficient for each positive pair. We show that +a simple Triplet-based loss (Trip) can achieve surprisingly good performance +without requiring large batches or asymmetry designs. Moreover, to alleviate +the over-fitting problem in small data regimes and further enhance the effect +of Trip, we propose a simple plug-and-play RandOm MApping (ROMA) strategy by +randomly mapping samples into other spaces and requiring these randomly +projected samples to satisfy the same relationship indicated by the triplets. +Integrating the triplet-based loss with random mapping, we obtain the proposed +method Trip-ROMA. Extensive experiments, including unsupervised representation +learning and unsupervised few-shot learning, have been conducted on ImageNet-1K +and seven small datasets. They successfully demonstrate the effectiveness of +Trip-ROMA and consistently show that ROMA can further effectively boost other +SSL methods. Code is available at https://github.com/WenbinLee/Trip-ROMA. + +
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR) 2023 +
+
+
+
+
+ + ♻ ☆ GeodesicPSIM: Predicting the Quality of Static Mesh with Texture Map via + Geodesic Patch Similarity + + +
+ Static meshes with texture maps have attracted considerable attention in both +industrial manufacturing and academic research, leading to an urgent +requirement for effective and robust objective quality evaluation. However, +current model-based static mesh quality metrics have obvious limitations: most +of them only consider geometry information, while color information is ignored, +and they have strict constraints for the meshes' geometrical topology. Other +metrics, such as image-based and point-based metrics, are easily influenced by +the prepossessing algorithms, e.g., projection and sampling, hampering their +ability to perform at their best. In this paper, we propose Geodesic Patch +Similarity (GeodesicPSIM), a novel model-based metric to accurately predict +human perception quality for static meshes. After selecting a group keypoints, +1-hop geodesic patches are constructed based on both the reference and +distorted meshes cleaned by an effective mesh cleaning algorithm. A two-step +patch cropping algorithm and a patch texture mapping module refine the size of +1-hop geodesic patches and build the relationship between the mesh geometry and +color information, resulting in the generation of 1-hop textured geodesic +patches. Three types of features are extracted to quantify the distortion: +patch color smoothness, patch discrete mean curvature, and patch pixel color +average and variance. To the best of our knowledge, GeodesicPSIM is the first +model-based metric especially designed for static meshes with texture maps. +GeodesicPSIM provides state-of-the-art performance in comparison with +image-based, point-based, and video-based metrics on a newly created and +challenging database. We also prove the robustness of GeodesicPSIM by +introducing different settings of hyperparameters. Ablation studies also +exhibit the effectiveness of three proposed features and the patch cropping +algorithm. + +
+
+
+
+
+ + ♻ ☆ Efficient Adaptive Activation Rounding for Post-Training Quantization + + +
+ Post-training quantization attracts increasing attention due to its +convenience in deploying quantized neural networks. Although +rounding-to-nearest remains the prevailing method for DNN quantization, prior +research has demonstrated its suboptimal nature when applied to weight +quantization. They propose optimizing weight rounding schemes by leveraging +output error rather than the traditional weight quantization error. Our study +reveals that similar rounding challenges also extend to activation +quantization. Despite the easy generalization, the challenges lie in the +dynamic nature of activation. Adaptive rounding is expected for varying +activations and the method is subjected to runtime overhead. To tackle this, we +propose the AQuant quantization framework with a novel perspective to reduce +output error by adjusting rounding schemes of activations. Instead of using the +constant rounding border 0.5 of the rounding-to-nearest operation, we make the +border become a function w.r.t. the activation value to change the activation +rounding by the adaptive border. To deal with the runtime overhead, we use a +coarse-grained version of the border function. Finally, we introduce our +framework to optimize the border function. Extensive experiments show that +AQuant achieves notable improvements compared to state-of-the-art works and +pushes the accuracy of ResNet-18 up to 60.31% under the 2-bit weight and +activation quantization. + +
+
+
+
+
+ + ♻ ☆ Heterogeneous Forgetting Compensation for Class-Incremental Learning ICCV2023 + + +
+ Class-incremental learning (CIL) has achieved remarkable successes in +learning new classes consecutively while overcoming catastrophic forgetting on +old categories. However, most existing CIL methods unreasonably assume that all +old categories have the same forgetting pace, and neglect negative influence of +forgetting heterogeneity among different old classes on forgetting +compensation. To surmount the above challenges, we develop a novel +Heterogeneous Forgetting Compensation (HFC) model, which can resolve +heterogeneous forgetting of easy-to-forget and hard-to-forget old categories +from both representation and gradient aspects. Specifically, we design a +task-semantic aggregation block to alleviate heterogeneous forgetting from +representation aspect. It aggregates local category information within each +task to learn task-shared global representations. Moreover, we develop two +novel plug-and-play losses: a gradient-balanced forgetting compensation loss +and a gradient-balanced relation distillation loss to alleviate forgetting from +gradient aspect. They consider gradient-balanced compensation to rectify +forgetting heterogeneity of old categories and heterogeneous relation +consistency. Experiments on several representative datasets illustrate +effectiveness of our HFC model. The code is available at +https://github.com/JiahuaDong/HFC. + +
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ♻ ☆ No One Left Behind: Real-World Federated Class-Incremental Learning + + +
+ Federated learning (FL) is a hot collaborative training framework via +aggregating model parameters of decentralized local clients. However, most FL +methods unreasonably assume data categories of FL framework are known and fixed +in advance. Moreover, some new local clients that collect novel categories +unseen by other clients may be introduced to FL training irregularly. These +issues render global model to undergo catastrophic forgetting on old +categories, when local clients receive new categories consecutively under +limited memory of storing old categories. To tackle the above issues, we +propose a novel Local-Global Anti-forgetting (LGA) model. It ensures no local +clients are left behind as they learn new classes continually, by addressing +local and global catastrophic forgetting. Specifically, considering tackling +class imbalance of local client to surmount local forgetting, we develop a +category-balanced gradient-adaptive compensation loss and a category +gradient-induced semantic distillation loss. They can balance heterogeneous +forgetting speeds of hard-to-forget and easy-to-forget old categories, while +ensure consistent class-relations within different tasks. Moreover, a proxy +server is designed to tackle global forgetting caused by Non-IID class +imbalance between different clients. It augments perturbed prototype images of +new categories collected from local clients via self-supervised prototype +augmentation, thus improving robustness to choose the best old global model for +local-side semantic distillation loss. Experiments on representative datasets +verify superior performance of our model against comparison methods. The code +is available at https://github.com/JiahuaDong/LGA. + +
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No ICCV 2023 + + +
+ Out-of-distribution (OOD) detection refers to training the model on an +in-distribution (ID) dataset to classify whether the input images come from +unknown classes. Considerable effort has been invested in designing various OOD +detection methods based on either convolutional neural networks or +transformers. However, zero-shot OOD detection methods driven by CLIP, which +only require class names for ID, have received less attention. This paper +presents a novel method, namely CLIP saying no (CLIPN), which empowers the +logic of saying no within CLIP. Our key motivation is to equip CLIP with the +capability of distinguishing OOD and ID samples using positive-semantic prompts +and negation-semantic prompts. Specifically, we design a novel learnable no +prompt and a no text encoder to capture negation semantics within images. +Subsequently, we introduce two loss functions: the image-text binary-opposite +loss and the text semantic-opposite loss, which we use to teach CLIPN to +associate images with no prompts, thereby enabling it to identify unknown +samples. Furthermore, we propose two threshold-free inference algorithms to +perform OOD detection by utilizing negation semantics from no prompts and the +text encoder. Experimental results on 9 benchmark datasets (3 ID datasets and 6 +OOD datasets) for the OOD detection task demonstrate that CLIPN, based on +ViT-B-16, outperforms 7 well-used algorithms by at least 2.34% and 11.64% in +terms of AUROC and FPR95 for zero-shot OOD detection on ImageNet-1K. Our CLIPN +can serve as a solid foundation for effectively leveraging CLIP in downstream +OOD tasks. The code is available on https://github.com/xmed-lab/CLIPN. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DALNet: A Rail Detection Network Based on Dynamic Anchor Line + + +
+ Rail detection is one of the key factors for intelligent train. In the paper, +motivated by the anchor line-based lane detection methods, we propose a rail +detection network called DALNet based on dynamic anchor line. Aiming to solve +the problem that the predefined anchor line is image agnostic, we design a +novel dynamic anchor line mechanism. It utilizes a dynamic anchor line +generator to dynamically generate an appropriate anchor line for each rail +instance based on the position and shape of the rails in the input image. These +dynamically generated anchor lines can be considered as better position +references to accurately localize the rails than the predefined anchor lines. +In addition, we present a challenging urban rail detection dataset DL-Rail with +high-quality annotations and scenario diversity. DL-Rail contains 7000 pairs of +images and annotations along with scene tags, and it is expected to encourage +the development of rail detection. We extensively compare DALNet with many +competitive lane methods. The results show that our DALNet achieves +state-of-the-art performance on our DL-Rail rail detection dataset and the +popular Tusimple and LLAMAS lane detection benchmarks. The code will be +released at https://github.com/Yzichen/mmLaneDet. + +
+
+
+
+
+ + ♻ ☆ Dynamic Open Vocabulary Enhanced Safe-landing with Intelligence + (DOVESEI) IROS 2023 + + +
+ This work targets what we consider to be the foundational step for urban +airborne robots, a safe landing. Our attention is directed toward what we deem +the most crucial aspect of the safe landing perception stack: segmentation. We +present a streamlined reactive UAV system that employs visual servoing by +harnessing the capabilities of open vocabulary image segmentation. This +approach can adapt to various scenarios with minimal adjustments, bypassing the +necessity for extensive data accumulation for refining internal models, thanks +to its open vocabulary methodology. Given the limitations imposed by local +authorities, our primary focus centers on operations originating from altitudes +of 100 meters. This choice is deliberate, as numerous preceding works have +dealt with altitudes up to 30 meters, aligning with the capabilities of small +stereo cameras. Consequently, we leave the remaining 20m to be navigated using +conventional 3D path planning methods. Utilizing monocular cameras and image +segmentation, our findings demonstrate the system's capability to successfully +execute landing maneuvers at altitudes as low as 20 meters. However, this +approach is vulnerable to intermittent and occasionally abrupt fluctuations in +the segmentation between frames in a video stream. To address this challenge, +we enhance the image segmentation output by introducing what we call a dynamic +focus: a masking mechanism that self adjusts according to the current landing +stage. This dynamic focus guides the control system to avoid regions beyond the +drone's safety radius projected onto the ground, thus mitigating the problems +with fluctuations. Through the implementation of this supplementary layer, our +experiments have reached improvements in the landing success rate of almost +tenfold when compared to global segmentation. All the source code is open +source and available online (github.com/MISTLab/DOVESEI). + +
+
+ comment: Submitted to IROS 2023 The Last-Mile Robotics Workshop +
+
+
+
+
+ + ♻ ☆ Region-Aware Pretraining for Open-Vocabulary Object Detection with + Vision Transformers CVPR 2023 + + +
+ We present Region-aware Open-vocabulary Vision Transformers (RO-ViT) - a +contrastive image-text pretraining recipe to bridge the gap between image-level +pretraining and open-vocabulary object detection. At the pretraining phase, we +propose to randomly crop and resize regions of positional embeddings instead of +using the whole image positional embeddings. This better matches the use of +positional embeddings at region-level in the detection finetuning phase. In +addition, we replace the common softmax cross entropy loss in contrastive +learning with focal loss to better learn the informative yet difficult +examples. Finally, we leverage recent advances in novel object proposals to +improve open-vocabulary detection finetuning. We evaluate our full model on the +LVIS and COCO open-vocabulary detection benchmarks and zero-shot transfer. +RO-ViT achieves a state-of-the-art 34.1 $AP_r$ on LVIS, surpassing the best +existing approach by +7.8 points in addition to competitive zero-shot transfer +detection. Surprisingly, RO-ViT improves the image-level representation as well +and achieves the state of the art on 9 out of 12 metrics on COCO and Flickr +image-text retrieval benchmarks, outperforming competitive approaches with +larger models. + +
+
+ comment: CVPR 2023 Highlight (https://github.com/mcahny/rovit); adds LAION-2B + result +
+
+
+
+
+ + ♻ ☆ SceneRF: Self-Supervised Monocular 3D Scene Reconstruction with Radiance + Fields ICCV 2023 + + +
+ 3D reconstruction from a single 2D image was extensively covered in the +literature but relies on depth supervision at training time, which limits its +applicability. To relax the dependence to depth we propose SceneRF, a +self-supervised monocular scene reconstruction method using only posed image +sequences for training. Fueled by the recent progress in neural radiance fields +(NeRF) we optimize a radiance field though with explicit depth optimization and +a novel probabilistic sampling strategy to efficiently handle large scenes. At +inference, a single input image suffices to hallucinate novel depth views which +are fused together to obtain 3D scene reconstruction. Thorough experiments +demonstrate that we outperform all baselines for novel depth views synthesis +and scene reconstruction, on indoor BundleFusion and outdoor SemanticKITTI. +Code is available at https://astra-vision.github.io/SceneRF . + +
+
+ comment: ICCV 2023. Project page: https://astra-vision.github.io/SceneRF +
+
+
+
+
+ + ♻ ☆ On the in vivo recognition of kidney stones using machine learning + + +
+ Determining the type of kidney stones allows urologists to prescribe a +treatment to avoid recurrence of renal lithiasis. An automated in-vivo +image-based classification method would be an important step towards an +immediate identification of the kidney stone type required as a first phase of +the diagnosis. In the literature it was shown on ex-vivo data (i.e., in very +controlled scene and image acquisition conditions) that an automated kidney +stone classification is indeed feasible. This pilot study compares the kidney +stone recognition performances of six shallow machine learning methods and +three deep-learning architectures which were tested with in-vivo images of the +four most frequent urinary calculi types acquired with an endoscope during +standard ureteroscopies. This contribution details the database construction +and the design of the tested kidney stones classifiers. Even if the best +results were obtained by the Inception v3 architecture (weighted precision, +recall and F1-score of 0.97, 0.98 and 0.97, respectively), it is also shown +that choosing an appropriate colour space and texture features allows a shallow +machine learning method to approach closely the performances of the most +promising deep-learning methods (the XGBoost classifier led to weighted +precision, recall and F1-score values of 0.96). This paper is the first one +that explores the most discriminant features to be extracted from images +acquired during ureteroscopies. + +
+
+ comment: Paper submitted to IEEE Access +
+
+
+
+
+ + ♻ ☆ Rapid building damage assessment workflow: An implementation for the + 2023 Rolling Fork, Mississippi tornado event ICCV + + +
+ Rapid and accurate building damage assessments from high-resolution satellite +imagery following a natural disaster is essential to inform and optimize first +responder efforts. However, performing such building damage assessments in an +automated manner is non-trivial due to the challenges posed by variations in +disaster-specific damage, diversity in satellite imagery, and the dearth of +extensive, labeled datasets. To circumvent these issues, this paper introduces +a human-in-the-loop workflow for rapidly training building damage assessment +models after a natural disaster. This article details a case study using this +workflow, executed in partnership with the American Red Cross during a tornado +event in Rolling Fork, Mississippi in March, 2023. The output from our +human-in-the-loop modeling process achieved a precision of 0.86 and recall of +0.80 for damaged buildings when compared to ground truth data collected +post-disaster. This workflow was implemented end-to-end in under 2 hours per +satellite imagery scene, highlighting its potential for real-time deployment. + +
+
+ comment: Accepted at the 2023 ICCV Humanitarian Assistance and Disaster + Response workshop +
+
+
+
+
+ + ♻ ☆ VEIL: Vetting Extracted Image Labels from In-the-Wild Captions for + Weakly-Supervised Object Detection + + +
+ The use of large-scale vision-language datasets is limited for object +detection due to the negative impact of label noise on localization. Prior +methods have shown how such large-scale datasets can be used for pretraining, +which can provide initial signal for localization, but is insufficient without +clean bounding-box data for at least some categories. We propose a technique to +"vet" labels extracted from noisy captions, and use them for weakly-supervised +object detection (WSOD). We conduct analysis of the types of label noise in +captions, and train a classifier that predicts if an extracted label is +actually present in the image or not. Our classifier generalizes across dataset +boundaries and across categories. We compare the classifier to eleven baselines +on five datasets, and demonstrate that it can improve WSOD without label +vetting by 30% (31.2 to 40.5 mAP when evaluated on PASCAL VOC) + +
+
+
+
+
+ + ♻ ☆ Training-based Model Refinement and Representation Disagreement for + Semi-Supervised Object Detection + + +
+ Semi-supervised object detection (SSOD) aims to improve the performance and +generalization of existing object detectors by utilizing limited labeled data +and extensive unlabeled data. Despite many advances, recent SSOD methods are +still challenged by inadequate model refinement using the classical exponential +moving average (EMA) strategy, the consensus of Teacher-Student models in the +latter stages of training (i.e., losing their distinctiveness), and +noisy/misleading pseudo-labels. This paper proposes a novel training-based +model refinement (TMR) stage and a simple yet effective representation +disagreement (RD) strategy to address the limitations of classical EMA and the +consensus problem. The TMR stage of Teacher-Student models optimizes the +lightweight scaling operation to refine the model's weights and prevent +overfitting or forgetting learned patterns from unlabeled data. Meanwhile, the +RD strategy helps keep these models diverged to encourage the student model to +explore complementary representations. Our approach can be integrated into +established SSOD methods and is empirically validated using two baseline +methods, with and without cascade regression, to generate more reliable +pseudo-labels. Extensive experiments demonstrate the superior performance of +our approach over state-of-the-art SSOD methods. Specifically, the proposed +approach outperforms the baseline Unbiased-Teacher-v2 (& Unbiased-Teacher-v1) +method by an average mAP margin of 2.23, 2.1, and 3.36 (& 2.07, 1.9, and 3.27) +on COCO-standard, COCO-additional, and Pascal VOC datasets, respectively. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ UnShadowNet: Illumination Critic Guided Contrastive Learning For Shadow + Removal + + +
+ Shadows are frequently encountered natural phenomena that significantly +hinder the performance of computer vision perception systems in practical +settings, e.g., autonomous driving. A solution to this would be to eliminate +shadow regions from the images before the processing of the perception system. +Yet, training such a solution requires pairs of aligned shadowed and +non-shadowed images which are difficult to obtain. We introduce a novel weakly +supervised shadow removal framework UnShadowNet trained using contrastive +learning. It is composed of a DeShadower network responsible for the removal of +the extracted shadow under the guidance of an Illumination network which is +trained adversarially by the illumination critic and a Refinement network to +further remove artefacts. We show that UnShadowNet can be easily extended to a +fully-supervised set-up to exploit the ground-truth when available. UnShadowNet +outperforms existing state-of-the-art approaches on three publicly available +shadow datasets (ISTD, adjusted ISTD, SRD) in both the weakly and fully +supervised setups. + +
+
+ comment: Accepted for publication at IEEE Access, vol. 11, pp. 87760-87774, + 2023 +
+
+
+
+
+ + ♻ ☆ Efficient Video Prediction via Sparsely Conditioned Flow Matching ICCV 2023 + + +
+ We introduce a novel generative model for video prediction based on latent +flow matching, an efficient alternative to diffusion-based models. In contrast +to prior work, we keep the high costs of modeling the past during training and +inference at bay by conditioning only on a small random set of past frames at +each integration step of the image generation process. Moreover, to enable the +generation of high-resolution videos and to speed up the training, we work in +the latent space of a pretrained VQGAN. Finally, we propose to approximate the +initial condition of the flow ODE with the previous noisy frame. This allows to +reduce the number of integration steps and hence, speed up the sampling at +inference time. We call our model Random frame conditioned flow Integration for +VidEo pRediction, or, in short, RIVER. We show that RIVER achieves superior or +on par performance compared to prior work on common video prediction +benchmarks, while requiring an order of magnitude fewer computational +resources. + +
+
+ comment: Accepted to ICCV 2023. Project page: https://araachie.github.io/river +
+
+
+
+
+ + ♻ ☆ WSSL: Weighted Self-supervised Learning Framework For Image-inpainting + + +
+ Image inpainting is the process of regenerating lost parts of the image. +Supervised algorithm-based methods have shown excellent results but have two +significant drawbacks. They do not perform well when tested with unseen data. +They fail to capture the global context of the image, resulting in a visually +unappealing result. We propose a novel self-supervised learning framework for +image-inpainting: Weighted Self-Supervised Learning (WSSL) to tackle these +problems. We designed WSSL to learn features from multiple weighted pretext +tasks. These features are then utilized for the downstream task, +image-inpainting. To improve the performance of our framework and produce more +visually appealing images, we also present a novel loss function for image +inpainting. The loss function takes advantage of both reconstruction loss and +perceptual loss functions to regenerate the image. Our experimentation shows +WSSL outperforms previous methods, and our loss function helps produce better +results. + +
+
+ comment: 9 Pages, document submitted for publication at CGVCVIP 2022 - ISBN + 978-989-8704-42-9 +
+
+
+
+
+ + ♻ ☆ ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with + GPT and Prototype Guidance ICCV 2023 + + +
+ Understanding 3D scenes from multi-view inputs has been proven to alleviate +the view discrepancy issue in 3D visual grounding. However, existing methods +normally neglect the view cues embedded in the text modality and fail to weigh +the relative importance of different views. In this paper, we propose +ViewRefer, a multi-view framework for 3D visual grounding exploring how to +grasp the view knowledge from both text and 3D modalities. For the text branch, +ViewRefer leverages the diverse linguistic knowledge of large-scale language +models, e.g., GPT, to expand a single grounding text to multiple +geometry-consistent descriptions. Meanwhile, in the 3D modality, a transformer +fusion module with inter-view attention is introduced to boost the interaction +of objects across views. On top of that, we further present a set of learnable +multi-view prototypes, which memorize scene-agnostic knowledge for different +views, and enhance the framework from two perspectives: a view-guided attention +module for more robust text features, and a view-guided scoring strategy during +the final prediction. With our designed paradigm, ViewRefer achieves superior +performance on three benchmarks and surpasses the second-best by +2.8%, +1.5%, +and +1.35% on Sr3D, Nr3D, and ScanRefer. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Grammar-Based Grounded Lexicon Learning NeurIPS 2021 + + +
+ We present Grammar-Based Grounded Lexicon Learning (G2L2), a lexicalist +approach toward learning a compositional and grounded meaning representation of +language from grounded data, such as paired images and texts. At the core of +G2L2 is a collection of lexicon entries, which map each word to a tuple of a +syntactic type and a neuro-symbolic semantic program. For example, the word +shiny has a syntactic type of adjective; its neuro-symbolic semantic program +has the symbolic form {\lambda}x. filter(x, SHINY), where the concept SHINY is +associated with a neural network embedding, which will be used to classify +shiny objects. Given an input sentence, G2L2 first looks up the lexicon entries +associated with each token. It then derives the meaning of the sentence as an +executable neuro-symbolic program by composing lexical meanings based on +syntax. The recovered meaning programs can be executed on grounded inputs. To +facilitate learning in an exponentially-growing compositional space, we +introduce a joint parsing and expected execution algorithm, which does local +marginalization over derivations to reduce the training time. We evaluate G2L2 +on two domains: visual reasoning and language-driven navigation. Results show +that G2L2 can generalize from small amounts of data to novel compositions of +words. + +
+
+ comment: Minor typo fixes. NeurIPS 2021. Project page: + https://g2l2.csail.mit.edu/ +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ On Popularity Bias of Multimodal-aware Recommender Systems: a + Modalities-driven Analysis + + +
+ Multimodal-aware recommender systems (MRSs) exploit multimodal content (e.g., +product images or descriptions) as items' side information to improve +recommendation accuracy. While most of such methods rely on factorization +models (e.g., MFBPR) as base architecture, it has been shown that MFBPR may be +affected by popularity bias, meaning that it inherently tends to boost the +recommendation of popular (i.e., short-head) items at the detriment of niche +(i.e., long-tail) items from the catalog. Motivated by this assumption, in this +work, we provide one of the first analyses on how multimodality in +recommendation could further amplify popularity bias. Concretely, we evaluate +the performance of four state-of-the-art MRSs algorithms (i.e., VBPR, MMGCN, +GRCN, LATTICE) on three datasets from Amazon by assessing, along with +recommendation accuracy metrics, performance measures accounting for the +diversity of recommended items and the portion of retrieved niche items. To +better investigate this aspect, we decide to study the separate influence of +each modality (i.e., visual and textual) on popularity bias in different +evaluation dimensions. Results, which demonstrate how the single modality may +augment the negative effect of popularity bias, shed light on the importance to +provide a more rigorous analysis of the performance of such models. + +
+
+
+
+
+ + ☆ Towards Communication-Efficient Model Updating for On-Device + Session-Based Recommendation + + +
+ On-device recommender systems recently have garnered increasing attention due +to their advantages of providing prompt response and securing privacy. To stay +current with evolving user interests, cloud-based recommender systems are +periodically updated with new interaction data. However, on-device models +struggle to retrain themselves because of limited onboard computing resources. +As a solution, we consider the scenario where the model retraining occurs on +the server side and then the updated parameters are transferred to edge devices +via network communication. While this eliminates the need for local retraining, +it incurs a regular transfer of parameters that significantly taxes network +bandwidth. To mitigate this issue, we develop an efficient approach based on +compositional codes to compress the model update. This approach ensures the +on-device model is updated flexibly with minimal additional parameters whilst +utilizing previous knowledge. The extensive experiments conducted on multiple +session-based recommendation models with distinctive architectures demonstrate +that the on-device model can achieve comparable accuracy to the retrained +server-side counterpart through transferring an update 60x smaller in size. The +codes are available at \url{https://github.com/xiaxin1998/ODUpdate}. + +
+
+ comment: cikm2023 +
+
+
+
+
+ + ☆ On the Consistency of Average Embeddings for Item Recommendation RecSys 2023 + + +
+ A prevalent practice in recommender systems consists of averaging item +embeddings to represent users or higher-level concepts in the same embedding +space. This paper investigates the relevance of such a practice. For this +purpose, we propose an expected precision score, designed to measure the +consistency of an average embedding relative to the items used for its +construction. We subsequently analyze the mathematical expression of this score +in a theoretical setting with specific assumptions, as well as its empirical +behavior on real-world data from music streaming services. Our results +emphasize that real-world averages are less consistent for recommendation, +which paves the way for future research to better align real-world embeddings +with assumptions from our theoretical setting. + +
+
+ comment: 17th ACM Conference on Recommender Systems (RecSys 2023) +
+
+
+
+
+ + ☆ Video Recommendation Using Social Network Analysis and User Viewing + Patterns + + +
+ With the meteoric rise of video-on-demand (VOD) platforms, users face the +challenge of sifting through an expansive sea of content to uncover shows that +closely match their preferences. To address this information overload dilemma, +VOD services have increasingly incorporated recommender systems powered by +algorithms that analyze user behavior and suggest personalized content. +However, a majority of existing recommender systems depend on explicit user +feedback in the form of ratings and reviews, which can be difficult and +time-consuming to collect at scale. This presents a key research gap, as +leveraging users' implicit feedback patterns could provide an alternative +avenue for building effective video recommendation models, circumventing the +need for explicit ratings. However, prior literature lacks sufficient +exploration into implicit feedback-based recommender systems, especially in the +context of modeling video viewing behavior. Therefore, this paper aims to +bridge this research gap by proposing a novel video recommendation technique +that relies solely on users' implicit feedback in the form of their content +viewing percentages. + +
+
+
+
+
+ + ☆ Out of the Box Thinking: Improving Customer Lifetime Value Modelling via + Expert Routing and Game Whale Detection + + +
+ Customer lifetime value (LTV) prediction is essential for mobile game +publishers trying to optimize the advertising investment for each user +acquisition based on the estimated worth. In mobile games, deploying +microtransactions is a simple yet effective monetization strategy, which +attracts a tiny group of game whales who splurge on in-game purchases. The +presence of such game whales may impede the practicality of existing LTV +prediction models, since game whales' purchase behaviours always exhibit varied +distribution from general users. Consequently, identifying game whales can open +up new opportunities to improve the accuracy of LTV prediction models. However, +little attention has been paid to applying game whale detection in LTV +prediction, and existing works are mainly specialized for the long-term LTV +prediction with the assumption that the high-quality user features are +available, which is not applicable in the UA stage. In this paper, we propose +ExpLTV, a novel multi-task framework to perform LTV prediction and game whale +detection in a unified way. In ExpLTV, we first innovatively design a deep +neural network-based game whale detector that can not only infer the intrinsic +order in accordance with monetary value, but also precisely identify high +spenders (i.e., game whales) and low spenders. Then, by treating the game whale +detector as a gating network to decide the different mixture patterns of LTV +experts assembling, we can thoroughly leverage the shared information and +scenario-specific information (i.e., game whales modelling and low spenders +modelling). Finally, instead of separately designing a purchase rate estimator +for two tasks, we design a shared estimator that can preserve the inner task +relationships. The superiority of ExpLTV is further validated via extensive +experiments on three industrial datasets. + +
+
+
+
+
+ + ☆ Laying foundations to quantify the "Effort of Reproducibility" + + +
+ Why are some research studies easy to reproduce while others are difficult? +Casting doubt on the accuracy of scientific work is not fruitful, especially +when an individual researcher cannot reproduce the claims made in the paper. +There could be many subjective reasons behind the inability to reproduce a +scientific paper. The field of Machine Learning (ML) faces a reproducibility +crisis, and surveying a portion of published articles has resulted in a group +realization that although sharing code repositories would be appreciable, code +bases are not the end all be all for determining the reproducibility of an +article. Various parties involved in the publication process have come forward +to address the reproducibility crisis and solutions such as badging articles as +reproducible, reproducibility checklists at conferences (\textit{NeurIPS, ICML, +ICLR, etc.}), and sharing artifacts on \textit{OpenReview} come across as +promising solutions to the core problem. The breadth of literature on +reproducibility focuses on measures required to avoid ir-reproducibility, and +there is not much research into the effort behind reproducing these articles. +In this paper, we investigate the factors that contribute to the easiness and +difficulty of reproducing previously published studies and report on the +foundational framework to quantify effort of reproducibility. + +
+
+ comment: Accepted at ACM/IEEE conference JCDL' 2023. Refer + https://2023.jcdl.org/program/schedule-printable/ for confirmation +
+
+
+
+
+ + ☆ Exploring the Integration Strategies of Retriever and Large Language + Models + + +
+ The integration of retrieved passages and large language models (LLMs), such +as ChatGPTs, has significantly contributed to improving open-domain question +answering. However, there is still a lack of exploration regarding the optimal +approach for incorporating retrieved passages into the answer generation +process. This paper aims to fill this gap by investigating different methods of +combining retrieved passages with LLMs to enhance answer generation. We begin +by examining the limitations of a commonly-used concatenation approach. +Surprisingly, this approach often results in generating "unknown" outputs, even +when the correct document is among the top-k retrieved passages. To address +this issue, we explore four alternative strategies for integrating the +retrieved passages with the LLMs. These strategies include two single-round +methods that utilize chain-of-thought reasoning and two multi-round strategies +that incorporate feedback loops. Through comprehensive analyses and +experiments, we provide insightful observations on how to effectively leverage +retrieved passages to enhance the answer generation capability of LLMs. + +
+
+
+
+
+ + ☆ Multi-BERT for Embeddings for Recommendation System + + +
+ In this paper, we propose a novel approach for generating document embeddings +using a combination of Sentence-BERT (SBERT) and RoBERTa, two state-of-the-art +natural language processing models. Our approach treats sentences as tokens and +generates embeddings for them, allowing the model to capture both +intra-sentence and inter-sentence relations within a document. We evaluate our +model on a book recommendation task and demonstrate its effectiveness in +generating more semantically rich and accurate document embeddings. To assess +the performance of our approach, we conducted experiments on a book +recommendation task using the Goodreads dataset. We compared the document +embeddings generated using our MULTI-BERT model to those generated using SBERT +alone. We used precision as our evaluation metric to compare the quality of the +generated embeddings. Our results showed that our model consistently +outperformed SBERT in terms of the quality of the generated embeddings. +Furthermore, we found that our model was able to capture more nuanced semantic +relations within documents, leading to more accurate recommendations. Overall, +our results demonstrate the effectiveness of our approach and suggest that it +is a promising direction for improving the performance of recommendation +systems + +
+
+ comment: 5 pages, 1 figure, 1 table +
+
+
+
+
+ + ☆ Financial News Analytics Using Fine-Tuned Llama 2 GPT Model + + +
+ The paper considers the possibility to fine-tune Llama 2 Large Language Model +(LLM) for the multitask analysis of financial news. For fine-tuning, the +PEFT/LoRA based approach was used. In the study, the model was fine-tuned for +the following tasks: analysing a text from financial market perspectives, +highlighting main points of a text, summarizing a text and extracting named +entities with appropriate sentiments. The obtained results show that the +fine-tuned Llama 2 model can perform a multitask financial news analysis with a +specified structure of response, part of response can be a structured text and +another part of data can have JSON format for further processing. Extracted +sentiments for named entities can be considered as predictive features in +supervised machine learning models with quantitative target variables. + +
+
+
+
+
+ + ♻ ☆ Uncovering ChatGPT's Capabilities in Recommender Systems RecSys 2023 + + +
+ The debut of ChatGPT has recently attracted the attention of the natural +language processing (NLP) community and beyond. Existing studies have +demonstrated that ChatGPT shows significant improvement in a range of +downstream NLP tasks, but the capabilities and limitations of ChatGPT in terms +of recommendations remain unclear. In this study, we aim to conduct an +empirical analysis of ChatGPT's recommendation ability from an Information +Retrieval (IR) perspective, including point-wise, pair-wise, and list-wise +ranking. To achieve this goal, we re-formulate the above three recommendation +policies into a domain-specific prompt format. Through extensive experiments on +four datasets from different domains, we demonstrate that ChatGPT outperforms +other large language models across all three ranking policies. Based on the +analysis of unit cost improvements, we identify that ChatGPT with list-wise +ranking achieves the best trade-off between cost and performance compared to +point-wise and pair-wise ranking. Moreover, ChatGPT shows the potential for +mitigating the cold start problem and explainable recommendation. To facilitate +further explorations in this area, the full code and detailed original results +are open-sourced at https://github.com/rainym00d/LLM4RS. + +
+
+ comment: Accepted by RecSys 2023 +
+
+
+
+
+ + ♻ ☆ Committed Private Information Retrieval ESORICS 2023 + + +
+ A private information retrieval (PIR) scheme allows a client to retrieve a +data item $x_i$ among $n$ items $x_1,x_2,\ldots,x_n$ from $k$ servers, without +revealing what $i$ is even when $t < k$ servers collude and try to learn $i$. +Such a PIR scheme is said to be $t$-private. A PIR scheme is $v$-verifiable if +the client can verify the correctness of the retrieved $x_i$ even when $v \leq +k$ servers collude and try to fool the client by sending manipulated data. Most +of the previous works in the literature on PIR assumed that $v < k$, leaving +the case of all-colluding servers open. We propose a generic construction that +combines a linear map commitment (LMC) and an arbitrary linear PIR scheme to +produce a $k$-verifiable PIR scheme, termed a committed PIR scheme. Such a +scheme guarantees that even in the worst scenario, when all servers are under +the control of an attacker, although the privacy is unavoidably lost, the +client won't be fooled into accepting an incorrect $x_i$. We demonstrate the +practicality of our proposal by implementing the committed PIR schemes based on +the Lai-Malavolta LMC and three well-known PIR schemes using the GMP library +and blst, the current fastest C library for elliptic curve pairings. + +
+
+ comment: Accepted at ESORICS 2023 +
+
+
+
+
+ + ♻ ☆ Natural Language is All a Graph Needs + + +
+ The emergence of large-scale pre-trained language models, such as ChatGPT, +has revolutionized various research fields in artificial intelligence. +Transformers-based large language models (LLMs) have gradually replaced CNNs +and RNNs to unify fields of computer vision and natural language processing. +Compared with the data that exists relatively independently such as images, +videos or texts, graph is a type of data that contains rich structural and +relational information. Meanwhile, natural language, as one of the most +expressive mediums, excels in describing complex structures. However, existing +work on incorporating graph learning problems into the generative language +modeling framework remains very limited. As the importance of large language +models continues to grow, it becomes essential to explore whether LLMs can also +replace GNNs as the foundation model for graphs. In this paper, we propose +InstructGLM (Instruction-finetuned Graph Language Model), systematically design +highly scalable prompts based on natural language instructions, and use natural +language to describe the geometric structure and node features of the graph for +instruction tuning an LLM to perform learning and inference on graphs in a +generative manner. Our method exceeds all competitive GNN baselines on +ogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of +our method and sheds light on generative large language models as the +foundation model for graph machine learning. + +
+
+ comment: 21 pages, 2 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ An In-depth Investigation of User Response Simulation for Conversational + Search + + +
+ Conversational search has seen increased recent attention in both the IR and +NLP communities. It seeks to clarify and solve a user's search need through +multi-turn natural language interactions. However, most existing systems are +trained and demonstrated with recorded or artificial conversation logs. +Eventually, conversational search systems should be trained, evaluated, and +deployed in an open-ended setting with unseen conversation trajectories. A key +challenge is that training and evaluating such systems both require a +human-in-the-loop, which is expensive and does not scale. One strategy for this +is to simulate users, thereby reducing the scaling costs. However, current user +simulators are either limited to only respond to yes-no questions from the +conversational search system, or unable to produce high quality responses in +general. + In this paper, we show that current state-of-the-art user simulation system +could be significantly improved by replacing it with a smaller but advanced +natural language generation model. But rather than merely reporting this new +state-of-the-art, we present an in-depth investigation of the task of +simulating user response for conversational search. Our goal is to supplement +existing works with an insightful hand-analysis of what challenges are still +unsolved by the advanced model, as well as to propose our solutions for them. +The challenges we identified include (1) dataset noise, (2) a blind spot that +is difficult for existing models to learn, and (3) a specific type of +misevaluation in the standard empirical setup. Except for the dataset noise +issue, we propose solutions to cover the training blind spot and to avoid the +misevaluation. Our proposed solutions lead to further improvements. Our best +system improves the previous state-of-the-art significantly. + +
+
+ comment: 9 pages +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ NeuralClothSim: Neural Deformation Fields Meet the Kirchhoff-Love Thin + Shell Theory + + +
+ Cloth simulation is an extensively studied problem, with a plethora of +solutions available in computer graphics literature. Existing cloth simulators +produce realistic cloth deformations that obey different types of boundary +conditions. Nevertheless, their operational principle remains limited in +several ways: They operate on explicit surface representations with a fixed +spatial resolution, perform a series of discretised updates (which bounds their +temporal resolution), and require comparably large amounts of storage. +Moreover, back-propagating gradients through the existing solvers is often not +straightforward, which poses additional challenges when integrating them into +modern neural architectures. In response to the limitations mentioned above, +this paper takes a fundamentally different perspective on physically-plausible +cloth simulation and re-thinks this long-standing problem: We propose +NeuralClothSim, i.e., a new cloth simulation approach using thin shells, in +which surface evolution is encoded in neural network weights. Our +memory-efficient and differentiable solver operates on a new continuous +coordinate-based representation of dynamic surfaces, i.e., neural deformation +fields (NDFs); it supervises NDF evolution with the rules of the non-linear +Kirchhoff-Love shell theory. NDFs are adaptive in the sense that they 1) +allocate their capacity to the deformation details as the latter arise during +the cloth evolution and 2) allow surface state queries at arbitrary spatial and +temporal resolutions without retraining. We show how to train our +NeuralClothSim solver while imposing hard boundary conditions and demonstrate +multiple applications, such as material interpolation and simulation editing. +The experimental results highlight the effectiveness of our formulation and its +potential impact. + +
+
+ comment: 27 pages, 22 figures and 3 tables; project page: + https://4dqv.mpi-inf.mpg.de/NeuralClothSim/ +
+
+
+
+
+ + ☆ NeO 360: Neural Fields for Sparse View Synthesis of Outdoor Scenes ICCV + + +
+ Recent implicit neural representations have shown great results for novel +view synthesis. However, existing methods require expensive per-scene +optimization from many views hence limiting their application to real-world +unbounded urban settings where the objects of interest or backgrounds are +observed from very few views. To mitigate this challenge, we introduce a new +approach called NeO 360, Neural fields for sparse view synthesis of outdoor +scenes. NeO 360 is a generalizable method that reconstructs 360{\deg} scenes +from a single or a few posed RGB images. The essence of our approach is in +capturing the distribution of complex real-world outdoor 3D scenes and using a +hybrid image-conditional triplanar representation that can be queried from any +world point. Our representation combines the best of both voxel-based and +bird's-eye-view (BEV) representations and is more effective and expressive than +each. NeO 360's representation allows us to learn from a large collection of +unbounded 3D scenes while offering generalizability to new views and novel +scenes from as few as a single image during inference. We demonstrate our +approach on the proposed challenging 360{\deg} unbounded dataset, called NeRDS +360, and show that NeO 360 outperforms state-of-the-art generalizable methods +for novel view synthesis while also offering editing and composition +capabilities. Project page: +https://zubair-irshad.github.io/projects/neo360.html + +
+
+ comment: Accepted to International Conference on Computer Vision (ICCV), 2023. + Project page: https://zubair-irshad.github.io/projects/neo360.html +
+
+
+
+
+ + ☆ Scenimefy: Learning to Craft Anime Scene via Semi-Supervised + Image-to-Image Translation ICCV 2023 + + +
+ Automatic high-quality rendering of anime scenes from complex real-world +images is of significant practical value. The challenges of this task lie in +the complexity of the scenes, the unique features of anime style, and the lack +of high-quality datasets to bridge the domain gap. Despite promising attempts, +previous efforts are still incompetent in achieving satisfactory results with +consistent semantic preservation, evident stylization, and fine details. In +this study, we propose Scenimefy, a novel semi-supervised image-to-image +translation framework that addresses these challenges. Our approach guides the +learning with structure-consistent pseudo paired data, simplifying the pure +unsupervised setting. The pseudo data are derived uniquely from a +semantic-constrained StyleGAN leveraging rich model priors like CLIP. We +further apply segmentation-guided data selection to obtain high-quality pseudo +supervision. A patch-wise contrastive style loss is introduced to improve +stylization and fine details. Besides, we contribute a high-resolution anime +scene dataset to facilitate future research. Our extensive experiments +demonstrate the superiority of our method over state-of-the-art baselines in +terms of both perceptual quality and quantitative performance. + +
+
+ comment: ICCV 2023. The first two authors contributed equally. Code: + https://github.com/Yuxinn-J/Scenimefy Project page: + https://yuxinn-j.github.io/projects/Scenimefy.html +
+
+
+
+
+ + ☆ Dense Text-to-Image Generation with Attention Modulation ICCV2023 + + +
+ Existing text-to-image diffusion models struggle to synthesize realistic +images given dense captions, where each text prompt provides a detailed +description for a specific image region. To address this, we propose +DenseDiffusion, a training-free method that adapts a pre-trained text-to-image +model to handle such dense captions while offering control over the scene +layout. We first analyze the relationship between generated images' layouts and +the pre-trained model's intermediate attention maps. Next, we develop an +attention modulation method that guides objects to appear in specific regions +according to layout guidance. Without requiring additional fine-tuning or +datasets, we improve image generation performance given dense captions +regarding both automatic and human evaluation scores. In addition, we achieve +similar-quality visual results with models specifically trained with layout +conditions. + +
+
+ comment: Accepted by ICCV2023. Code and data are available at + https://github.com/naver-ai/DenseDiffusion +
+
+
+
+
+ + ☆ DLIP: Distilling Language-Image Pre-training + + +
+ Vision-Language Pre-training (VLP) shows remarkable progress with the +assistance of extremely heavy parameters, which challenges deployment in real +applications. Knowledge distillation is well recognized as the essential +procedure in model compression. However, existing knowledge distillation +techniques lack an in-depth investigation and analysis of VLP, and practical +guidelines for VLP-oriented distillation are still not yet explored. In this +paper, we present DLIP, a simple yet efficient Distilling Language-Image +Pre-training framework, through which we investigate how to distill a light VLP +model. Specifically, we dissect the model distillation from multiple +dimensions, such as the architecture characteristics of different modules and +the information transfer of different modalities. We conduct comprehensive +experiments and provide insights on distilling a light but performant VLP +model. Experimental results reveal that DLIP can achieve a state-of-the-art +accuracy/efficiency trade-off across diverse cross-modal tasks, e.g., +image-text retrieval, image captioning and visual question answering. For +example, DLIP compresses BLIP by 1.9x, from 213M to 108M parameters, while +achieving comparable or better performance. Furthermore, DLIP succeeds in +retaining more than 95% of the performance with 22.4% parameters and 24.8% +FLOPs compared to the teacher model and accelerates inference speed by 2.7x. + +
+
+
+
+
+ + ☆ BridgeData V2: A Dataset for Robot Learning at Scale + + +
+ We introduce BridgeData V2, a large and diverse dataset of robotic +manipulation behaviors designed to facilitate research on scalable robot +learning. BridgeData V2 contains 60,096 trajectories collected across 24 +environments on a publicly available low-cost robot. BridgeData V2 provides +extensive task and environment variability, leading to skills that can +generalize across environments, domains, and institutions, making the dataset a +useful resource for a broad range of researchers. Additionally, the dataset is +compatible with a wide variety of open-vocabulary, multi-task learning methods +conditioned on goal images or natural language instructions. In our +experiments, we train 6 state-of-the-art imitation learning and offline +reinforcement learning methods on our dataset, and find that they succeed on a +suite of tasks requiring varying amounts of generalization. We also demonstrate +that the performance of these methods improves with more data and higher +capacity models, and that training on a greater variety of skills leads to +improved generalization. By publicly sharing BridgeData V2 and our pre-trained +models, we aim to accelerate research in scalable robot learning methods. +Project page at https://rail-berkeley.github.io/bridgedata + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Label Budget Allocation in Multi-Task Learning + + +
+ The cost of labeling data often limits the performance of machine learning +systems. In multi-task learning, related tasks provide information to each +other and improve overall performance, but the label cost can vary among tasks. +How should the label budget (i.e. the amount of money spent on labeling) be +allocated among different tasks to achieve optimal multi-task performance? We +are the first to propose and formally define the label budget allocation +problem in multi-task learning and to empirically show that different budget +allocation strategies make a big difference to its performance. We propose a +Task-Adaptive Budget Allocation algorithm to robustly generate the optimal +budget allocation adaptive to different multi-task learning settings. +Specifically, we estimate and then maximize the extent of new information +obtained from the allocated budget as a proxy for multi-task learning +performance. Experiments on PASCAL VOC and Taskonomy demonstrate the efficacy +of our approach over other widely used heuristic labeling strategies. + +
+
+
+
+
+ + ☆ Learning Only On Boundaries: a Physics-Informed Neural operator for + Solving Parametric Partial Differential Equations in Complex Geometries + + +
+ Recently deep learning surrogates and neural operators have shown promise in +solving partial differential equations (PDEs). However, they often require a +large amount of training data and are limited to bounded domains. In this work, +we present a novel physics-informed neural operator method to solve +parametrized boundary value problems without labeled data. By reformulating the +PDEs into boundary integral equations (BIEs), we can train the operator network +solely on the boundary of the domain. This approach reduces the number of +required sample points from $O(N^d)$ to $O(N^{d-1})$, where $d$ is the domain's +dimension, leading to a significant acceleration of the training process. +Additionally, our method can handle unbounded problems, which are unattainable +for existing physics-informed neural networks (PINNs) and neural operators. Our +numerical experiments show the effectiveness of parametrized complex geometries +and unbounded problems. + +
+
+
+
+
+ + ☆ Low-count Time Series Anomaly Detection SP + + +
+ Low-count time series describe sparse or intermittent events, which are +prevalent in large-scale online platforms that capture and monitor diverse data +types. Several distinct challenges surface when modelling low-count time +series, particularly low signal-to-noise ratios (when anomaly signatures are +provably undetectable), and non-uniform performance (when average metrics are +not representative of local behaviour). The time series anomaly detection +community currently lacks explicit tooling and processes to model and reliably +detect anomalies in these settings. We address this gap by introducing a novel +generative procedure for creating benchmark datasets comprising of low-count +time series with anomalous segments. Via a mixture of theoretical and empirical +analysis, our work explains how widely-used algorithms struggle with the +distribution overlap between normal and anomalous segments. In order to +mitigate this shortcoming, we then leverage our findings to demonstrate how +anomaly score smoothing consistently improves performance. The practical +utility of our analysis and recommendation is validated on a real-world dataset +containing sales data for retail stores. + +
+
+ comment: 6 pages, 7 figures, to be published in IEEE 2023 Workshop on Machine + Learning for Signal Processing (MLSP) +
+
+
+
+
+ + ☆ An Efficient Distributed Multi-Agent Reinforcement Learning for EV + Charging Network Control + + +
+ The increasing trend in adopting electric vehicles (EVs) will significantly +impact the residential electricity demand, which results in an increased risk +of transformer overload in the distribution grid. To mitigate such risks, there +are urgent needs to develop effective EV charging controllers. Currently, the +majority of the EV charge controllers are based on a centralized approach for +managing individual EVs or a group of EVs. In this paper, we introduce a +decentralized Multi-agent Reinforcement Learning (MARL) charging framework that +prioritizes the preservation of privacy for EV owners. We employ the +Centralized Training Decentralized Execution-Deep Deterministic Policy Gradient +(CTDE-DDPG) scheme, which provides valuable information to users during +training while maintaining privacy during execution. Our results demonstrate +that the CTDE framework improves the performance of the charging network by +reducing the network costs. Moreover, we show that the Peak-to-Average Ratio +(PAR) of the total demand is reduced, which, in turn, reduces the risk of +transformer overload during the peak hours. + +
+
+ comment: 8 pages, 4 figures, accepted at Allerton 2023 +
+
+
+
+
+ + ☆ Towards Realistic Unsupervised Fine-tuning with CLIP + + +
+ The emergence of vision-language models (VLMs), such as CLIP, has spurred a +significant research effort towards their application for downstream supervised +learning tasks. Although some previous studies have explored the unsupervised +fine-tuning of CLIP, they often rely on prior knowledge in the form of class +names associated with ground truth labels. In this paper, we delve into a +realistic unsupervised fine-tuning scenario by assuming that the unlabeled data +might contain out-of-distribution samples from unknown classes. Furthermore, we +emphasize the importance of simultaneously enhancing out-of-distribution +detection capabilities alongside the recognition of instances associated with +predefined class labels. + To tackle this problem, we present a simple, efficient, and effective +fine-tuning approach called Universal Entropy Optimization (UEO). UEO leverages +sample-level confidence to approximately minimize the conditional entropy of +confident instances and maximize the marginal entropy of less confident +instances. Apart from optimizing the textual prompts, UEO also incorporates +optimization of channel-wise affine transformations within the visual branch of +CLIP. Through extensive experiments conducted across 15 domains and 4 different +types of prior knowledge, we demonstrate that UEO surpasses baseline methods in +terms of both generalization and out-of-distribution detection. + +
+
+
+
+
+ + ☆ Evaluating the Vulnerabilities in ML systems in terms of adversarial + attacks + + +
+ There have been recent adversarial attacks that are difficult to find. These +new adversarial attacks methods may pose challenges to current deep learning +cyber defense systems and could influence the future defense of cyberattacks. +The authors focus on this domain in this research paper. They explore the +consequences of vulnerabilities in AI systems. This includes discussing how +they might arise, differences between randomized and adversarial examples and +also potential ethical implications of vulnerabilities. Moreover, it is +important to train the AI systems appropriately when they are in testing phase +and getting them ready for broader use. + +
+
+
+
+
+ + ☆ POLCA: Power Oversubscription in LLM Cloud Providers + + +
+ Recent innovation in large language models (LLMs), and their myriad use-cases +have rapidly driven up the compute capacity demand for datacenter GPUs. Several +cloud providers and other enterprises have made substantial plans of growth in +their datacenters to support these new workloads. One of the key bottleneck +resources in datacenters is power, and given the increasing model sizes of +LLMs, they are becoming increasingly power intensive. In this paper, we show +that there is a significant opportunity to oversubscribe power in LLM clusters. +Power oversubscription improves the power efficiency of these datacenters, +allowing more deployable servers per datacenter, and reduces the deployment +time, since building new datacenters is slow. + We extensively characterize the power consumption patterns of a variety of +LLMs and their configurations. We identify the differences between the +inference and training power consumption patterns. Based on our analysis of +these LLMs, we claim that the average and peak power utilization in LLM +clusters for inference should not be very high. Our deductions align with the +data from production LLM clusters, revealing that inference workloads offer +substantial headroom for power oversubscription. However, the stringent set of +telemetry and controls that GPUs offer in a virtualized environment, makes it +challenging to have a reliable and robust power oversubscription mechanism. + We propose POLCA, our framework for power oversubscription that is robust, +reliable, and readily deployable for GPU clusters. Using open-source models to +replicate the power patterns observed in production, we simulate POLCA and +demonstrate that we can deploy 30% more servers in the same GPU cluster for +inference, with minimal performance loss + +
+
+
+
+
+ + ☆ CDAN: Convolutional Dense Attention-guided Network for Low-light Image + Enhancement + + +
+ Low-light images, characterized by inadequate illumination, pose challenges +of diminished clarity, muted colors, and reduced details. Low-light image +enhancement, an essential task in computer vision, aims to rectify these issues +by improving brightness, contrast, and overall perceptual quality, thereby +facilitating accurate analysis and interpretation. This paper introduces the +Convolutional Dense Attention-guided Network (CDAN), a novel solution for +enhancing low-light images. CDAN integrates an autoencoder-based architecture +with convolutional and dense blocks, complemented by an attention mechanism and +skip connections. This architecture ensures efficient information propagation +and feature learning. Furthermore, a dedicated post-processing phase refines +color balance and contrast. Our approach demonstrates notable progress compared +to state-of-the-art results in low-light image enhancement, showcasing its +robustness across a wide range of challenging scenarios. Our model performs +remarkably on benchmark datasets, effectively mitigating under-exposure and +proficiently restoring textures and colors in diverse low-light scenarios. This +achievement underscores CDAN's potential for diverse computer vision tasks, +notably enabling robust object detection and recognition in challenging +low-light conditions. + +
+
+ comment: 18 pages, 13 figures +
+
+
+
+
+ + ☆ Unified Data Management and Comprehensive Performance Evaluation for + Urban Spatial-Temporal Prediction [Experiment, Analysis & Benchmark] + + +
+ The field of urban spatial-temporal prediction is advancing rapidly with the +development of deep learning techniques and the availability of large-scale +datasets. However, challenges persist in accessing and utilizing diverse urban +spatial-temporal datasets from different sources and stored in different +formats, as well as determining effective model structures and components with +the proliferation of deep learning models. This work addresses these challenges +and provides three significant contributions. Firstly, we introduce "atomic +files", a unified storage format designed for urban spatial-temporal big data, +and validate its effectiveness on 40 diverse datasets, simplifying data +management. Secondly, we present a comprehensive overview of technological +advances in urban spatial-temporal prediction models, guiding the development +of robust models. Thirdly, we conduct extensive experiments using diverse +models and datasets, establishing a performance leaderboard and identifying +promising research directions. Overall, this work effectively manages urban +spatial-temporal data, guides future efforts, and facilitates the development +of accurate and efficient urban spatial-temporal prediction models. It can +potentially make long-term contributions to urban spatial-temporal data +management and prediction, ultimately leading to improved urban living +standards. + +
+
+ comment: 14 pages, 3 figures. arXiv admin note: text overlap with + arXiv:2304.14343 +
+
+
+
+
+ + ☆ Beyond Document Page Classification: Design, Datasets, and Challenges + + +
+ This paper highlights the need to bring document classification benchmarking +closer to real-world applications, both in the nature of data tested ($X$: +multi-channel, multi-paged, multi-industry; $Y$: class distributions and label +set variety) and in classification tasks considered ($f$: multi-page document, +page stream, and document bundle classification, ...). We identify the lack of +public multi-page document classification datasets, formalize different +classification tasks arising in application scenarios, and motivate the value +of targeting efficient multi-page document representations. An experimental +study on proposed multi-page document classification datasets demonstrates that +current benchmarks have become irrelevant and need to be updated to evaluate +complete documents, as they naturally occur in practice. This reality check +also calls for more mature evaluation methodologies, covering calibration +evaluation, inference complexity (time-memory), and a range of realistic +distribution shifts (e.g., born-digital vs. scanning noise, shifting page +order). Our study ends on a hopeful note by recommending concrete avenues for +future improvements.} + +
+
+ comment: 8 pages, under review +
+
+
+
+
+ + ☆ Easy attention: A simple self-attention mechanism for Transformers + + +
+ To improve the robustness of transformer neural networks used for +temporal-dynamics prediction of chaotic systems, we propose a novel attention +mechanism called easy attention. Due to the fact that self attention only makes +usage of the inner product of queries and keys, it is demonstrated that the +keys, queries and softmax are not necessary for obtaining the attention score +required to capture long-term dependencies in temporal sequences. Through +implementing singular-value decomposition (SVD) on the softmax attention score, +we further observe that the self attention compresses contribution from both +queries and keys in the spanned space of the attention score. Therefore, our +proposed easy-attention method directly treats the attention scores as +learnable parameters. This approach produces excellent results when +reconstructing and predicting the temporal dynamics of chaotic systems +exhibiting more robustness and less complexity than the self attention or the +widely-used long short-term memory (LSTM) network. Our results show great +potential for applications in more complex high-dimensional dynamical systems. + +
+
+ comment: 12 pages and 8 figures +
+
+
+
+
+ + ☆ IPA: Inference Pipeline Adaptation to Achieve High Accuracy and + Cost-Efficiency + + +
+ Efficiently optimizing multi-model inference pipelines for fast, accurate, +and cost-effective inference is a crucial challenge in ML production systems, +given their tight end-to-end latency requirements. To simplify the exploration +of the vast and intricate trade-off space of accuracy and cost in inference +pipelines, providers frequently opt to consider one of them. However, the +challenge lies in reconciling accuracy and cost trade-offs. To address this +challenge and propose a solution to efficiently manage model variants in +inference pipelines, we present IPA, an online deep-learning Inference Pipeline +Adaptation system that efficiently leverages model variants for each deep +learning task. Model variants are different versions of pre-trained models for +the same deep learning task with variations in resource requirements, latency, +and accuracy. IPA dynamically configures batch size, replication, and model +variants to optimize accuracy, minimize costs, and meet user-defined latency +SLAs using Integer Programming. It supports multi-objective settings for +achieving different trade-offs between accuracy and cost objectives while +remaining adaptable to varying workloads and dynamic traffic patterns. +Extensive experiments on a Kubernetes implementation with five real-world +inference pipelines demonstrate that IPA improves normalized accuracy by up to +35% with a minimal cost increase of less than 5%. + +
+
+
+
+
+ + ☆ Auto-weighted Bayesian Physics-Informed Neural Networks and robust + estimations for multitask inverse problems in pore-scale imaging of + dissolution + + +
+ In this article, we present a novel data assimilation strategy in pore-scale +imaging and demonstrate that this makes it possible to robustly address +reactive inverse problems incorporating Uncertainty Quantification (UQ). +Pore-scale modeling of reactive flow offers a valuable opportunity to +investigate the evolution of macro-scale properties subject to dynamic +processes. Yet, they suffer from imaging limitations arising from the +associated X-ray microtomography (X-ray microCT) process, which induces +discrepancies in the properties estimates. Assessment of the kinetic parameters +also raises challenges, as reactive coefficients are critical parameters that +can cover a wide range of values. We account for these two issues and ensure +reliable calibration of pore-scale modeling, based on dynamical microCT images, +by integrating uncertainty quantification in the workflow. + The present method is based on a multitasking formulation of reactive inverse +problems combining data-driven and physics-informed techniques in calcite +dissolution. This allows quantifying morphological uncertainties on the +porosity field and estimating reactive parameter ranges through prescribed PDE +models with a latent concentration field and dynamical microCT. The data +assimilation strategy relies on sequential reinforcement incorporating +successively additional PDE constraints. We guarantee robust and unbiased +uncertainty quantification by straightforward adaptive weighting of Bayesian +Physics-Informed Neural Networks (BPINNs), ensuring reliable micro-porosity +changes during geochemical transformations. We demonstrate successful Bayesian +Inference in 1D+Time and 2D+Time calcite dissolution based on synthetic microCT +images with meaningful posterior distribution on the reactive parameters and +dimensionless numbers. + +
+
+
+
+
+ + ☆ Towards Automated Animal Density Estimation with Acoustic Spatial + Capture-Recapture + + +
+ Passive acoustic monitoring can be an effective way of monitoring wildlife +populations that are acoustically active but difficult to survey visually. +Digital recorders allow surveyors to gather large volumes of data at low cost, +but identifying target species vocalisations in these data is non-trivial. +Machine learning (ML) methods are often used to do the identification. They can +process large volumes of data quickly, but they do not detect all vocalisations +and they do generate some false positives (vocalisations that are not from the +target species). Existing wildlife abundance survey methods have been designed +specifically to deal with the first of these mistakes, but current methods of +dealing with false positives are not well-developed. They do not take account +of features of individual vocalisations, some of which are more likely to be +false positives than others. We propose three methods for acoustic spatial +capture-recapture inference that integrate individual-level measures of +confidence from ML vocalisation identification into the likelihood and hence +integrate ML uncertainty into inference. The methods include a mixture model in +which species identity is a latent variable. We test the methods by simulation +and find that in a scenario based on acoustic data from Hainan gibbons, in +which ignoring false positives results in 17% positive bias, our methods give +negligible bias and coverage probabilities that are close to the nominal 95% +level. + +
+
+ comment: 35 pages, 5 figures +
+
+
+
+
+ + ☆ Fast Adversarial Training with Smooth Convergence + + +
+ Fast adversarial training (FAT) is beneficial for improving the adversarial +robustness of neural networks. However, previous FAT work has encountered a +significant issue known as catastrophic overfitting when dealing with large +perturbation budgets, \ie the adversarial robustness of models declines to near +zero during training. + To address this, we analyze the training process of prior FAT work and +observe that catastrophic overfitting is accompanied by the appearance of loss +convergence outliers. + Therefore, we argue a moderately smooth loss convergence process will be a +stable FAT process that solves catastrophic overfitting. + To obtain a smooth loss convergence process, we propose a novel oscillatory +constraint (dubbed ConvergeSmooth) to limit the loss difference between +adjacent epochs. The convergence stride of ConvergeSmooth is introduced to +balance convergence and smoothing. Likewise, we design weight centralization +without introducing additional hyperparameters other than the loss balance +coefficient. + Our proposed methods are attack-agnostic and thus can improve the training +stability of various FAT techniques. + Extensive experiments on popular datasets show that the proposed methods +efficiently avoid catastrophic overfitting and outperform all previous FAT +methods. Code is available at \url{https://github.com/FAT-CS/ConvergeSmooth}. + +
+
+
+
+
+ + ☆ Probabilistic load forecasting with Reservoir Computing + + +
+ Some applications of deep learning require not only to provide accurate +results but also to quantify the amount of confidence in their prediction. The +management of an electric power grid is one of these cases: to avoid risky +scenarios, decision-makers need both precise and reliable forecasts of, for +example, power loads. For this reason, point forecasts are not enough hence it +is necessary to adopt methods that provide an uncertainty quantification. + This work focuses on reservoir computing as the core time series forecasting +method, due to its computational efficiency and effectiveness in predicting +time series. While the RC literature mostly focused on point forecasting, this +work explores the compatibility of some popular uncertainty quantification +methods with the reservoir setting. Both Bayesian and deterministic approaches +to uncertainty assessment are evaluated and compared in terms of their +prediction accuracy, computational resource efficiency and reliability of the +estimated uncertainty, based on a set of carefully chosen performance metrics. + +
+
+
+
+
+ + ☆ Actuator Trajectory Planning for UAVs with Overhead Manipulator using + Reinforcement Learning + + +
+ In this paper, we investigate the operation of an aerial manipulator system, +namely an Unmanned Aerial Vehicle (UAV) equipped with a controllable arm with +two degrees of freedom to carry out actuation tasks on the fly. Our solution is +based on employing a Q-learning method to control the trajectory of the tip of +the arm, also called \textit{end-effector}. More specifically, we develop a +motion planning model based on Time To Collision (TTC), which enables a +quadrotor UAV to navigate around obstacles while ensuring the manipulator's +reachability. Additionally, we utilize a model-based Q-learning model to +independently track and control the desired trajectory of the manipulator's +end-effector, given an arbitrary baseline trajectory for the UAV platform. Such +a combination enables a variety of actuation tasks such as high-altitude +welding, structural monitoring and repair, battery replacement, gutter +cleaning, sky scrapper cleaning, and power line maintenance in hard-to-reach +and risky environments while retaining compatibility with flight control +firmware. Our RL-based control mechanism results in a robust control strategy +that can handle uncertainties in the motion of the UAV, offering promising +performance. Specifically, our method achieves 92\% accuracy in terms of +average displacement error (i.e. the mean distance between the target and +obtained trajectory points) using Q-learning with 15,000 episodes + +
+
+
+
+
+ + ☆ Short Run Transit Route Planning Decision Support System Using a Deep + Learning-Based Weighted Graph + + +
+ Public transport routing plays a crucial role in transit network design, +ensuring a satisfactory level of service for passengers. However, current +routing solutions rely on traditional operational research heuristics, which +can be time-consuming to implement and lack the ability to provide quick +solutions. Here, we propose a novel deep learning-based methodology for a +decision support system that enables public transport (PT) planners to identify +short-term route improvements rapidly. By seamlessly adjusting specific +sections of routes between two stops during specific times of the day, our +method effectively reduces times and enhances PT services. Leveraging diverse +data sources such as GTFS and smart card data, we extract features and model +the transportation network as a directed graph. Using self-supervision, we +train a deep learning model for predicting lateness values for road segments. + These lateness values are then utilized as edge weights in the transportation +graph, enabling efficient path searching. Through evaluating the method on Tel +Aviv, we are able to reduce times on more than 9\% of the routes. The improved +routes included both intraurban and suburban routes showcasing a fact +highlighting the model's versatility. The findings emphasize the potential of +our data-driven decision support system to enhance public transport and city +logistics, promoting greater efficiency and reliability in PT services. + +
+
+
+
+
+ + ☆ Prediction without Preclusion: Recourse Verification with Reachable Sets + + +
+ Machine learning models are often used to decide who will receive a loan, a +job interview, or a public benefit. Standard techniques to build these models +use features about people but overlook their actionability. In turn, models can +assign predictions that are fixed, meaning that consumers who are denied loans, +interviews, or benefits may be permanently locked out from access to credit, +employment, or assistance. In this work, we introduce a formal testing +procedure to flag models that assign fixed predictions that we call recourse +verification. We develop machinery to reliably determine if a given model can +provide recourse to its decision subjects from a set of user-specified +actionability constraints. We demonstrate how our tools can ensure recourse and +adversarial robustness in real-world datasets and use them to study the +infeasibility of recourse in real-world lending datasets. Our results highlight +how models can inadvertently assign fixed predictions that permanently bar +access, and we provide tools to design algorithms that account for +actionability when developing models. + +
+
+
+
+
+ + ☆ Job Shop Scheduling Benchmark: Environments and Instances for Learning + and Non-learning Methods + + +
+ We introduce an open-source GitHub repository containing comprehensive +benchmarks for a wide range of machine scheduling problems, including Job Shop +Scheduling (JSP), Flow Shop Scheduling (FSP), Flexible Job Shop Scheduling +(FJSP), FJSP with Assembly constraints (FAJSP), FJSP with Sequence-Dependent +Setup Times (FJSP-SDST), and the online FJSP (with online job arrivals). Our +primary goal is to provide a centralized hub for researchers, practitioners, +and enthusiasts interested in tackling machine scheduling challenges. + +
+
+
+
+
+ + ☆ Single-shot Bayesian approximation for neural networks + + +
+ Deep neural networks (NNs) are known for their high-prediction performances. +However, NNs are prone to yield unreliable predictions when encountering +completely new situations without indicating their uncertainty. Bayesian +variants of NNs (BNNs), such as Monte Carlo (MC) dropout BNNs, do provide +uncertainty measures and simultaneously increase the prediction performance. +The only disadvantage of BNNs is their higher computation time during test time +because they rely on a sampling approach. Here we present a single-shot MC +dropout approximation that preserves the advantages of BNNs while being as fast +as NNs. Our approach is based on moment propagation (MP) and allows to +analytically approximate the expected value and the variance of the MC dropout +signal for commonly used layers in NNs, i.e. convolution, max pooling, dense, +softmax, and dropout layers. The MP approach can convert an NN into a BNN +without re-training given the NN has been trained with standard dropout. We +evaluate our approach on different benchmark datasets and a simulated toy +example in a classification and regression setting. We demonstrate that our +single-shot MC dropout approximation resembles the point estimate and the +uncertainty estimate of the predictive distribution that is achieved with an MC +approach, while being fast enough for real-time deployments of BNNs. We show +that using part of the saved time to combine our MP approach with deep ensemble +techniques does further improve the uncertainty measures. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2007.03293 +
+
+
+
+
+ + ☆ Intentionally-underestimated Value Function at Terminal State for + Temporal-difference Learning with Mis-designed Reward + + +
+ Robot control using reinforcement learning has become popular, but its +learning process generally terminates halfway through an episode for safety and +time-saving reasons. This study addresses the problem of the most popular +exception handling that temporal-difference (TD) learning performs at such +termination. That is, by forcibly assuming zero value after termination, +unintentionally implicit underestimation or overestimation occurs, depending on +the reward design in the normal states. When the episode is terminated due to +task failure, the failure may be highly valued with the unintentional +overestimation, and the wrong policy may be acquired. Although this problem can +be avoided by paying attention to the reward design, it is essential in +practical use of TD learning to review the exception handling at termination. +This paper therefore proposes a method to intentionally underestimate the value +after termination to avoid learning failures due to the unintentional +overestimation. In addition, the degree of underestimation is adjusted +according to the degree of stationarity at termination, thereby preventing +excessive exploration due to the intentional underestimation. Simulations and +real robot experiments showed that the proposed method can stably obtain the +optimal policies for various tasks and reward designs. +https://youtu.be/AxXr8uFOe7M + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ On the Consistency of Average Embeddings for Item Recommendation RecSys 2023 + + +
+ A prevalent practice in recommender systems consists of averaging item +embeddings to represent users or higher-level concepts in the same embedding +space. This paper investigates the relevance of such a practice. For this +purpose, we propose an expected precision score, designed to measure the +consistency of an average embedding relative to the items used for its +construction. We subsequently analyze the mathematical expression of this score +in a theoretical setting with specific assumptions, as well as its empirical +behavior on real-world data from music streaming services. Our results +emphasize that real-world averages are less consistent for recommendation, +which paves the way for future research to better align real-world embeddings +with assumptions from our theoretical setting. + +
+
+ comment: 17th ACM Conference on Recommender Systems (RecSys 2023) +
+
+
+
+
+ + ☆ IP-UNet: Intensity Projection UNet Architecture for 3D Medical Volume + Segmentation + + +
+ CNNs have been widely applied for medical image analysis. However, limited +memory capacity is one of the most common drawbacks of processing +high-resolution 3D volumetric data. 3D volumes are usually cropped or downsized +first before processing, which can result in a loss of resolution, increase +class imbalance, and affect the performance of the segmentation algorithms. In +this paper, we propose an end-to-end deep learning approach called IP-UNet. +IP-UNet is a UNet-based model that performs multi-class segmentation on +Intensity Projection (IP) of 3D volumetric data instead of the memory-consuming +3D volumes. IP-UNet uses limited memory capability for training without losing +the original 3D image resolution. We compare the performance of three models in +terms of segmentation accuracy and computational cost: 1) Slice-by-slice 2D +segmentation of the CT scan images using a conventional 2D UNet model. 2) +IP-UNet that operates on data obtained by merging the extracted Maximum +Intensity Projection (MIP), Closest Vessel Projection (CVP), and Average +Intensity Projection (AvgIP) representations of the source 3D volumes, then +applying the UNet model on the output IP images. 3) 3D-UNet model directly +reads the 3D volumes constructed from a series of CT scan images and outputs +the 3D volume of the predicted segmentation. We test the performance of these +methods on 3D volumetric images for automatic breast calcification detection. +Experimental results show that IP-Unet can achieve similar segmentation +accuracy with 3D-Unet but with much better performance. It reduces the training +time by 70\% and memory consumption by 92\%. + +
+
+
+
+
+ + ☆ Motion In-Betweening with Phase Manifolds + + +
+ This paper introduces a novel data-driven motion in-betweening system to +reach target poses of characters by making use of phases variables learned by a +Periodic Autoencoder. Our approach utilizes a mixture-of-experts neural network +model, in which the phases cluster movements in both space and time with +different expert weights. Each generated set of weights then produces a +sequence of poses in an autoregressive manner between the current and target +state of the character. In addition, to satisfy poses which are manually +modified by the animators or where certain end effectors serve as constraints +to be reached by the animation, a learned bi-directional control scheme is +implemented to satisfy such constraints. The results demonstrate that using +phases for motion in-betweening tasks sharpen the interpolated movements, and +furthermore stabilizes the learning process. Moreover, using phases for motion +in-betweening tasks can also synthesize more challenging movements beyond +locomotion behaviors. Additionally, style control is enabled between given +target keyframes. Our proposed framework can compete with popular +state-of-the-art methods for motion in-betweening in terms of motion quality +and generalization, especially in the existence of long transition durations. +Our framework contributes to faster prototyping workflows for creating animated +character sequences, which is of enormous interest for the game and film +industry. + +
+
+ comment: 17 pages, 11 figures, conference +
+
+
+
+
+ + ☆ Human Comprehensible Active Learning of Genome-Scale Metabolic Networks AAAI + + +
+ An important application of Synthetic Biology is the engineering of the host +cell system to yield useful products. However, an increase in the scale of the +host system leads to huge design space and requires a large number of +validation trials with high experimental costs. A comprehensible machine +learning approach that efficiently explores the hypothesis space and guides +experimental design is urgently needed for the Design-Build-Test-Learn (DBTL) +cycle of the host cell system. We introduce a novel machine learning framework +ILP-iML1515 based on Inductive Logic Programming (ILP) that performs abductive +logical reasoning and actively learns from training examples. In contrast to +numerical models, ILP-iML1515 is built on comprehensible logical +representations of a genome-scale metabolic model and can update the model by +learning new logical structures from auxotrophic mutant trials. The ILP-iML1515 +framework 1) allows high-throughput simulations and 2) actively selects +experiments that reduce the experimental cost of learning gene functions in +comparison to randomly selected experiments. + +
+
+ comment: Invited presentation for AAAI Spring Symposium Series 2023 on + Computational Scientific Discovery +
+
+
+
+
+ + ☆ Real-time Detection of AI-Generated Speech for DeepFake Voice Conversion + + +
+ There are growing implications surrounding generative AI in the speech domain +that enable voice cloning and real-time voice conversion from one individual to +another. This technology poses a significant ethical threat and could lead to +breaches of privacy and misrepresentation, thus there is an urgent need for +real-time detection of AI-generated speech for DeepFake Voice Conversion. To +address the above emerging issues, the DEEP-VOICE dataset is generated in this +study, comprised of real human speech from eight well-known figures and their +speech converted to one another using Retrieval-based Voice Conversion. +Presenting as a binary classification problem of whether the speech is real or +AI-generated, statistical analysis of temporal audio features through t-testing +reveals that there are significantly different distributions. Hyperparameter +optimisation is implemented for machine learning models to identify the source +of speech. Following the training of 208 individual machine learning models +over 10-fold cross validation, it is found that the Extreme Gradient Boosting +model can achieve an average classification accuracy of 99.3% and can classify +speech in real-time, at around 0.004 milliseconds given one second of speech. +All data generated for this study is released publicly for future research on +AI speech detection. + +
+
+
+
+
+ + ☆ Out of the Box Thinking: Improving Customer Lifetime Value Modelling via + Expert Routing and Game Whale Detection + + +
+ Customer lifetime value (LTV) prediction is essential for mobile game +publishers trying to optimize the advertising investment for each user +acquisition based on the estimated worth. In mobile games, deploying +microtransactions is a simple yet effective monetization strategy, which +attracts a tiny group of game whales who splurge on in-game purchases. The +presence of such game whales may impede the practicality of existing LTV +prediction models, since game whales' purchase behaviours always exhibit varied +distribution from general users. Consequently, identifying game whales can open +up new opportunities to improve the accuracy of LTV prediction models. However, +little attention has been paid to applying game whale detection in LTV +prediction, and existing works are mainly specialized for the long-term LTV +prediction with the assumption that the high-quality user features are +available, which is not applicable in the UA stage. In this paper, we propose +ExpLTV, a novel multi-task framework to perform LTV prediction and game whale +detection in a unified way. In ExpLTV, we first innovatively design a deep +neural network-based game whale detector that can not only infer the intrinsic +order in accordance with monetary value, but also precisely identify high +spenders (i.e., game whales) and low spenders. Then, by treating the game whale +detector as a gating network to decide the different mixture patterns of LTV +experts assembling, we can thoroughly leverage the shared information and +scenario-specific information (i.e., game whales modelling and low spenders +modelling). Finally, instead of separately designing a purchase rate estimator +for two tasks, we design a shared estimator that can preserve the inner task +relationships. The superiority of ExpLTV is further validated via extensive +experiments on three industrial datasets. + +
+
+
+
+
+ + ☆ Continuous Reinforcement Learning-based Dynamic Difficulty Adjustment in + a Visual Working Memory Game + + +
+ Dynamic Difficulty Adjustment (DDA) is a viable approach to enhance a +player's experience in video games. Recently, Reinforcement Learning (RL) +methods have been employed for DDA in non-competitive games; nevertheless, they +rely solely on discrete state-action space with a small search space. In this +paper, we propose a continuous RL-based DDA methodology for a visual working +memory (VWM) game to handle the complex search space for the difficulty of +memorization. The proposed RL-based DDA tailors game difficulty based on the +player's score and game difficulty in the last trial. We defined a continuous +metric for the difficulty of memorization. Then, we consider the task +difficulty and the vector of difficulty-score as the RL's action and state, +respectively. We evaluated the proposed method through a within-subject +experiment involving 52 subjects. The proposed approach was compared with two +rule-based difficulty adjustment methods in terms of player's score and game +experience measured by a questionnaire. The proposed RL-based approach resulted +in a significantly better game experience in terms of competence, tension, and +negative and positive affect. Players also achieved higher scores and win +rates. Furthermore, the proposed RL-based DDA led to a significantly less +decline in the score in a 20-trial session. + +
+
+
+
+
+ + ☆ Solving Forward and Inverse Problems of Contact Mechanics using + Physics-Informed Neural Networks + + +
+ This paper explores the ability of physics-informed neural networks (PINNs) +to solve forward and inverse problems of contact mechanics for small +deformation elasticity. We deploy PINNs in a mixed-variable formulation +enhanced by output transformation to enforce Dirichlet and Neumann boundary +conditions as hard constraints. Inequality constraints of contact problems, +namely Karush-Kuhn-Tucker (KKT) type conditions, are enforced as soft +constraints by incorporating them into the loss function during network +training. To formulate the loss function contribution of KKT constraints, +existing approaches applied to elastoplasticity problems are investigated and +we explore a nonlinear complementarity problem (NCP) function, namely +Fischer-Burmeister, which possesses advantageous characteristics in terms of +optimization. Based on the Hertzian contact problem, we show that PINNs can +serve as pure partial differential equation (PDE) solver, as data-enhanced +forward model, as inverse solver for parameter identification, and as +fast-to-evaluate surrogate model. Furthermore, we demonstrate the importance of +choosing proper hyperparameters, e.g. loss weights, and a combination of Adam +and L-BFGS-B optimizers aiming for better results in terms of accuracy and +training time. + +
+
+
+
+
+ + ☆ Disentanglement Learning via Topology + + +
+ We propose TopDis (Topological Disentanglement), a method for learning +disentangled representations via adding multi-scale topological loss term. +Disentanglement is a crucial property of data representations substantial for +the explainability and robustness of deep learning models and a step towards +high-level cognition. The state-of-the-art method based on VAE minimizes the +total correlation of the joint distribution of latent variables. We take a +different perspective on disentanglement by analyzing topological properties of +data manifolds. In particular, we optimize the topological similarity for data +manifolds traversals. To the best of our knowledge, our paper is the first one +to propose a differentiable topological loss for disentanglement. Our +experiments have shown that the proposed topological loss improves +disentanglement scores such as MIG, FactorVAE score, SAP score and DCI +disentanglement score with respect to state-of-the-art results. Our method +works in an unsupervised manner, permitting to apply it for problems without +labeled factors of variation. Additionally, we show how to use the proposed +topological loss to find disentangled directions in a trained GAN. + +
+
+
+
+
+ + ☆ An Efficient Data Analysis Method for Big Data using Multiple-Model + Linear Regression + + +
+ This paper introduces a new data analysis method for big data using a newly +defined regression model named multiple model linear regression(MMLR), which +separates input datasets into subsets and construct local linear regression +models of them. The proposed data analysis method is shown to be more efficient +and flexible than other regression based methods. This paper also proposes an +approximate algorithm to construct MMLR models based on +$(\epsilon,\delta)$-estimator, and gives mathematical proofs of the correctness +and efficiency of MMLR algorithm, of which the time complexity is linear with +respect to the size of input datasets. This paper also empirically implements +the method on both synthetic and real-world datasets, the algorithm shows to +have comparable performance to existing regression methods in many cases, while +it takes almost the shortest time to provide a high prediction accuracy. + +
+
+
+
+
+ + ☆ Match-And-Deform: Time Series Domain Adaptation through Optimal + Transport and Temporal Alignment + + +
+ While large volumes of unlabeled data are usually available, associated +labels are often scarce. The unsupervised domain adaptation problem aims at +exploiting labels from a source domain to classify data from a related, yet +different, target domain. When time series are at stake, new difficulties arise +as temporal shifts may appear in addition to the standard feature distribution +shift. In this paper, we introduce the Match-And-Deform (MAD) approach that +aims at finding correspondences between the source and target time series while +allowing temporal distortions. The associated optimization problem +simultaneously aligns the series thanks to an optimal transport loss and the +time stamps through dynamic time warping. When embedded into a deep neural +network, MAD helps learning new representations of time series that both align +the domains and maximize the discriminative power of the network. Empirical +studies on benchmark datasets and remote sensing data demonstrate that MAD +makes meaningful sample-to-sample pairing and time shift estimation, reaching +similar or better classification performance than state-of-the-art deep time +series domain adaptation strategies. + +
+
+
+
+
+ + ☆ LR-XFL: Logical Reasoning-based Explainable Federated Learning + + +
+ Federated learning (FL) is an emerging approach for training machine learning +models collaboratively while preserving data privacy. The need for privacy +protection makes it difficult for FL models to achieve global transparency and +explainability. To address this limitation, we incorporate logic-based +explanations into FL by proposing the Logical Reasoning-based eXplainable +Federated Learning (LR-XFL) approach. Under LR-XFL, FL clients create local +logic rules based on their local data and send them, along with model updates, +to the FL server. The FL server connects the local logic rules through a proper +logical connector that is derived based on properties of client data, without +requiring access to the raw data. In addition, the server also aggregates the +local model updates with weight values determined by the quality of the +clients' local data as reflected by their uploaded logic rules. The results +show that LR-XFL outperforms the most relevant baseline by 1.19%, 5.81% and +5.41% in terms of classification accuracy, rule accuracy and rule fidelity, +respectively. The explicit rule evaluation and expression under LR-XFL enable +human experts to validate and correct the rules on the server side, hence +improving the global FL model's robustness to errors. It has the potential to +enhance the transparency of FL models for areas like healthcare and finance +where both data privacy and explainability are important. + +
+
+
+
+
+ + ☆ Master-slave Deep Architecture for Top-K Multi-armed Bandits with + Non-linear Bandit Feedback and Diversity Constraints + + +
+ We propose a novel master-slave architecture to solve the top-$K$ +combinatorial multi-armed bandits problem with non-linear bandit feedback and +diversity constraints, which, to the best of our knowledge, is the first +combinatorial bandits setting considering diversity constraints under bandit +feedback. Specifically, to efficiently explore the combinatorial and +constrained action space, we introduce six slave models with distinguished +merits to generate diversified samples well balancing rewards and constraints +as well as efficiency. Moreover, we propose teacher learning based optimization +and the policy co-training technique to boost the performance of the multiple +slave models. The master model then collects the elite samples provided by the +slave models and selects the best sample estimated by a neural contextual +UCB-based network to make a decision with a trade-off between exploration and +exploitation. Thanks to the elaborate design of slave models, the co-training +mechanism among slave models, and the novel interactions between the master and +slave models, our approach significantly surpasses existing state-of-the-art +algorithms in both synthetic and real datasets for recommendation tasks. The +code is available at: +\url{https://github.com/huanghanchi/Master-slave-Algorithm-for-Top-K-Bandits}. + +
+
+ comment: IEEE Transactions on Neural Networks and Learning Systems +
+
+
+
+
+ + ☆ A Continual Learning Approach for Cross-Domain White Blood Cell + Classification MICCAI 2023 + + +
+ Accurate classification of white blood cells in peripheral blood is essential +for diagnosing hematological diseases. Due to constantly evolving clinical +settings, data sources, and disease classifications, it is necessary to update +machine learning classification models regularly for practical real-world use. +Such models significantly benefit from sequentially learning from incoming data +streams without forgetting previously acquired knowledge. However, models can +suffer from catastrophic forgetting, causing a drop in performance on previous +tasks when fine-tuned on new data. Here, we propose a rehearsal-based continual +learning approach for class incremental and domain incremental scenarios in +white blood cell classification. To choose representative samples from previous +tasks, we employ exemplar set selection based on the model's predictions. This +involves selecting the most confident samples and the most challenging samples +identified through uncertainty estimation of the model. We thoroughly evaluated +our proposed approach on three white blood cell classification datasets that +differ in color, resolution, and class composition, including scenarios where +new domains or new classes are introduced to the model with every task. We also +test a long class incremental experiment with both new domains and new classes. +Our results demonstrate that our approach outperforms established baselines in +continual learning, including existing iCaRL and EWC methods for classifying +white blood cells in cross-domain environments. + +
+
+ comment: Accepted for publication at workshop on Domain Adaptation and + Representation Transfer (DART) in International Conference on Medical Image + Computing and Computer Assisted Intervention (MICCAI 2023) +
+
+
+
+
+ + ☆ Masked Feature Modelling: Feature Masking for the Unsupervised + Pre-training of a Graph Attention Network Block for Bottom-up Video Event + Recognition + + +
+ In this paper, we introduce Masked Feature Modelling (MFM), a novel approach +for the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM +utilizes a pretrained Visual Tokenizer to reconstruct masked features of +objects within a video, leveraging the MiniKinetics dataset. We then +incorporate the pre-trained GAT block into a state-of-the-art bottom-up +supervised video-event recognition architecture, ViGAT, to improve the model's +starting point and overall accuracy. Experimental evaluations on the YLI-MED +dataset demonstrate the effectiveness of MFM in improving event recognition +performance. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Optimal data pooling for shared learning in maintenance operations + + +
+ This paper addresses the benefits of pooling data for shared learning in +maintenance operations. We consider a set of systems subject to Poisson +degradation that are coupled through an a-priori unknown rate. Decision +problems involving these systems are high-dimensional Markov decision processes +(MDPs). We present a decomposition result that reduces such an MDP to +two-dimensional MDPs, enabling structural analyses and computations. We +leverage this decomposition to demonstrate that pooling data can lead to +significant cost reductions compared to not pooling. + +
+
+
+
+
+ + ☆ Geodesic Mode Connectivity ICLR 2023 + + +
+ Mode connectivity is a phenomenon where trained models are connected by a +path of low loss. We reframe this in the context of Information Geometry, where +neural networks are studied as spaces of parameterized distributions with +curved geometry. We hypothesize that shortest paths in these spaces, known as +geodesics, correspond to mode-connecting paths in the loss landscape. We +propose an algorithm to approximate geodesics and demonstrate that they achieve +mode connectivity. + +
+
+ comment: Published as a TinyPaper at ICLR 2023 +
+
+
+
+
+ + ☆ Don't Look into the Sun: Adversarial Solarization Attacks on Image + Classifiers + + +
+ Assessing the robustness of deep neural networks against out-of-distribution +inputs is crucial, especially in safety-critical domains like autonomous +driving, but also in safety systems where malicious actors can digitally alter +inputs to circumvent safety guards. However, designing effective +out-of-distribution tests that encompass all possible scenarios while +preserving accurate label information is a challenging task. Existing +methodologies often entail a compromise between variety and constraint levels +for attacks and sometimes even both. In a first step towards a more holistic +robustness evaluation of image classification models, we introduce an attack +method based on image solarization that is conceptually straightforward yet +avoids jeopardizing the global structure of natural images independent of the +intensity. Through comprehensive evaluations of multiple ImageNet models, we +demonstrate the attack's capacity to degrade accuracy significantly, provided +it is not integrated into the training augmentations. Interestingly, even then, +no full immunity to accuracy deterioration is achieved. In other settings, the +attack can often be simplified into a black-box attack with model-independent +parameters. Defenses against other corruptions do not consistently extend to be +effective against our specific attack. + Project website: https://github.com/paulgavrikov/adversarial_solarization + +
+
+
+
+
+ + ☆ APART: Diverse Skill Discovery using All Pairs with Ascending Reward and + DropouT + + +
+ We study diverse skill discovery in reward-free environments, aiming to +discover all possible skills in simple grid-world environments where prior +methods have struggled to succeed. This problem is formulated as mutual +training of skills using an intrinsic reward and a discriminator trained to +predict a skill given its trajectory. Our initial solution replaces the +standard one-vs-all (softmax) discriminator with a one-vs-one (all pairs) +discriminator and combines it with a novel intrinsic reward function and a +dropout regularization technique. The combined approach is named APART: Diverse +Skill Discovery using All Pairs with Ascending Reward and Dropout. We +demonstrate that APART discovers all the possible skills in grid worlds with +remarkably fewer samples than previous works. Motivated by the empirical +success of APART, we further investigate an even simpler algorithm that +achieves maximum skills by altering VIC, rescaling its intrinsic reward, and +tuning the temperature of its softmax discriminator. We believe our findings +shed light on the crucial factors underlying success of skill discovery +algorithms in reinforcement learning. + +
+
+
+
+
+ + ☆ The GENEA Challenge 2023: A large scale evaluation of gesture generation + models in monadic and dyadic settings + + +
+ This paper reports on the GENEA Challenge 2023, in which participating teams +built speech-driven gesture-generation systems using the same speech and motion +dataset, followed by a joint evaluation. This year's challenge provided data on +both sides of a dyadic interaction, allowing teams to generate full-body motion +for an agent given its speech (text and audio) and the speech and motion of the +interlocutor. We evaluated 12 submissions and 2 baselines together with +held-out motion-capture data in several large-scale user studies. The studies +focused on three aspects: 1) the human-likeness of the motion, 2) the +appropriateness of the motion for the agent's own speech whilst controlling for +the human-likeness of the motion, and 3) the appropriateness of the motion for +the behaviour of the interlocutor in the interaction, using a setup that +controls for both the human-likeness of the motion and the agent's own speech. +We found a large span in human-likeness between challenge submissions, with a +few systems rated close to human mocap. Appropriateness seems far from being +solved, with most submissions performing in a narrow range slightly above +chance, far behind natural motion. The effect of the interlocutor is even more +subtle, with submitted systems at best performing barely above chance. +Interestingly, a dyadic system being highly appropriate for agent speech does +not necessarily imply high appropriateness for the interlocutor. Additional +material is available via the project website at +https://svito-zar.github.io/GENEAchallenge2023/ . + +
+
+ comment: The first three authors made equal contributions. Accepted for + publication at the ACM International Conference on Multimodal Interaction + (ICMI) +
+
+
+
+
+ + ☆ Towards Hierarchical Regional Transformer-based Multiple Instance + Learning ICCV 2023 + + +
+ The classification of gigapixel histopathology images with deep multiple +instance learning models has become a critical task in digital pathology and +precision medicine. In this work, we propose a Transformer-based multiple +instance learning approach that replaces the traditional learned attention +mechanism with a regional, Vision Transformer inspired self-attention +mechanism. We present a method that fuses regional patch information to derive +slide-level predictions and show how this regional aggregation can be stacked +to hierarchically process features on different distance levels. To increase +predictive accuracy, especially for datasets with small, local morphological +features, we introduce a method to focus the image processing on high attention +regions during inference. Our approach is able to significantly improve +performance over the baseline on two histopathology datasets and points towards +promising directions for further research. + +
+
+ comment: To be published as ICCV 2023 workshop paper +
+
+
+
+
+ + ☆ Uncertainty and Explainable Analysis of Machine Learning Model for + Reconstruction of Sonic Slowness Logs + + +
+ Logs are valuable information for oil and gas fields as they help to +determine the lithology of the formations surrounding the borehole and the +location and reserves of subsurface oil and gas reservoirs. However, important +logs are often missing in horizontal or old wells, which poses a challenge in +field applications. In this paper, we utilize data from the 2020 machine +learning competition of the SPWLA, which aims to predict the missing +compressional wave slowness and shear wave slowness logs using other logs in +the same borehole. We employ the NGBoost algorithm to construct an Ensemble +Learning model that can predicate the results as well as their uncertainty. +Furthermore, we combine the SHAP method to investigate the interpretability of +the machine learning model. We compare the performance of the NGBosst model +with four other commonly used Ensemble Learning methods, including Random +Forest, GBDT, XGBoost, LightGBM. The results show that the NGBoost model +performs well in the testing set and can provide a probability distribution for +the prediction results. In addition, the variance of the probability +distribution of the predicted log can be used to justify the quality of the +constructed log. Using the SHAP explainable machine learning model, we +calculate the importance of each input log to the predicted results as well as +the coupling relationship among input logs. Our findings reveal that the +NGBoost model tends to provide greater slowness prediction results when the +neutron porosity and gamma ray are large, which is consistent with the +cognition of petrophysical models. Furthermore, the machine learning model can +capture the influence of the changing borehole caliper on slowness, where the +influence of borehole caliper on slowness is complex and not easy to establish +a direct relationship. These findings are in line with the physical principle +of borehole acoustics. + +
+
+
+
+
+ + ☆ Try with Simpler -- An Evaluation of Improved Principal Component + Analysis in Log-based Anomaly Detection + + +
+ The rapid growth of deep learning (DL) has spurred interest in enhancing +log-based anomaly detection. This approach aims to extract meaning from log +events (log message templates) and develop advanced DL models for anomaly +detection. However, these DL methods face challenges like heavy reliance on +training data, labels, and computational resources due to model complexity. In +contrast, traditional machine learning and data mining techniques are less +data-dependent and more efficient but less effective than DL. To make log-based +anomaly detection more practical, the goal is to enhance traditional techniques +to match DL's effectiveness. Previous research in a different domain (linking +questions on Stack Overflow) suggests that optimized traditional techniques can +rival state-of-the-art DL methods. Drawing inspiration from this concept, we +conducted an empirical study. We optimized the unsupervised PCA (Principal +Component Analysis), a traditional technique, by incorporating lightweight +semantic-based log representation. This addresses the issue of unseen log +events in training data, enhancing log representation. Our study compared seven +log-based anomaly detection methods, including four DL-based, two traditional, +and the optimized PCA technique, using public and industrial datasets. Results +indicate that the optimized unsupervised PCA technique achieves similar +effectiveness to advanced supervised/semi-supervised DL methods while being +more stable with limited training data and resource-efficient. This +demonstrates the adaptability and strength of traditional techniques through +small yet impactful adaptations. + +
+
+
+
+
+ + ☆ A Greedy Approach for Offering to Telecom Subscribers + + +
+ Customer retention or churn prevention is a challenging task of a telecom +operator. One of the effective approaches is to offer some attractive incentive +or additional services or money to the subscribers for keeping them engaged and +make sure they stay in the operator's network for longer time. Often, operators +allocate certain amount of monetary budget to carry out the offer campaign. The +difficult part of this campaign is the selection of a set of customers from a +large subscriber-base and deciding the amount that should be offered to an +individual so that operator's objective is achieved. There may be multiple +objectives (e.g., maximizing revenue, minimizing number of churns) for +selection of subscriber and selection of an offer to the selected subscriber. +Apart from monetary benefit, offers may include additional data, SMS, hots-spot +tethering, and many more. This problem is known as offer optimization. In this +paper, we propose a novel combinatorial algorithm for solving offer +optimization under heterogeneous offers by maximizing expected revenue under +the scenario of subscriber churn, which is, in general, seen in telecom domain. +The proposed algorithm is efficient and accurate even for a very large +subscriber-base. + +
+
+
+
+
+ + ☆ Exploiting Time-Frequency Conformers for Music Audio Enhancement + + +
+ With the proliferation of video platforms on the internet, recording musical +performances by mobile devices has become commonplace. However, these +recordings often suffer from degradation such as noise and reverberation, which +negatively impact the listening experience. Consequently, the necessity for +music audio enhancement (referred to as music enhancement from this point +onward), involving the transformation of degraded audio recordings into +pristine high-quality music, has surged to augment the auditory experience. To +address this issue, we propose a music enhancement system based on the +Conformer architecture that has demonstrated outstanding performance in speech +enhancement tasks. Our approach explores the attention mechanisms of the +Conformer and examines their performance to discover the best approach for the +music enhancement task. Our experimental results show that our proposed model +achieves state-of-the-art performance on single-stem music enhancement. +Furthermore, our system can perform general music enhancement with multi-track +mixtures, which has not been examined in previous work. + +
+
+ comment: Accepted by ACM Multimedia 2023 +
+
+
+
+
+ + ☆ LORD: Leveraging Open-Set Recognition with Unknown Data ICCV 2023 + + +
+ Handling entirely unknown data is a challenge for any deployed classifier. +Classification models are typically trained on a static pre-defined dataset and +are kept in the dark for the open unassigned feature space. As a result, they +struggle to deal with out-of-distribution data during inference. Addressing +this task on the class-level is termed open-set recognition (OSR). However, +most OSR methods are inherently limited, as they train closed-set classifiers +and only adapt the downstream predictions to OSR. This work presents LORD, a +framework to Leverage Open-set Recognition by exploiting unknown Data. LORD +explicitly models open space during classifier training and provides a +systematic evaluation for such approaches. We identify three model-agnostic +training strategies that exploit background data and applied them to +well-established classifiers. Due to LORD's extensive evaluation protocol, we +consistently demonstrate improved recognition of unknown data. The benchmarks +facilitate in-depth analysis across various requirement levels. To mitigate +dependency on extensive and costly background datasets, we explore mixup as an +off-the-shelf data generation technique. Our experiments highlight mixup's +effectiveness as a substitute for background datasets. Lightweight constraints +on mixup synthesis further improve OSR performance. + +
+
+ comment: Accepted at ICCV 2023 Workshop (Out-Of-Distribution Generalization in + Computer Vision) +
+
+
+
+
+ + ☆ Persistent learning signals and working memory without continuous + attractors + + +
+ Neural dynamical systems with stable attractor structures, such as point +attractors and continuous attractors, are hypothesized to underlie meaningful +temporal behavior that requires working memory. However, working memory may not +support useful learning signals necessary to adapt to changes in the temporal +structure of the environment. We show that in addition to the continuous +attractors that are widely implicated, periodic and quasi-periodic attractors +can also support learning arbitrarily long temporal relationships. Unlike the +continuous attractors that suffer from the fine-tuning problem, the less +explored quasi-periodic attractors are uniquely qualified for learning to +produce temporally structured behavior. Our theory has broad implications for +the design of artificial learning systems and makes predictions about +observable signatures of biological neural dynamics that can support temporal +dependence learning and working memory. Based on our theory, we developed a new +initialization scheme for artificial recurrent neural networks that outperforms +standard methods for tasks that require learning temporal dynamics. Moreover, +we propose a robust recurrent memory mechanism for integrating and maintaining +head direction without a ring attractor. + +
+
+
+
+
+ + ☆ A Huber Loss Minimization Approach to Byzantine Robust Federated + Learning + + +
+ Federated learning systems are susceptible to adversarial attacks. To combat +this, we introduce a novel aggregator based on Huber loss minimization, and +provide a comprehensive theoretical analysis. Under independent and identically +distributed (i.i.d) assumption, our approach has several advantages compared to +existing methods. Firstly, it has optimal dependence on $\epsilon$, which +stands for the ratio of attacked clients. Secondly, our approach does not need +precise knowledge of $\epsilon$. Thirdly, it allows different clients to have +unequal data sizes. We then broaden our analysis to include non-i.i.d data, +such that clients have slightly different distributions. + +
+
+
+
+
+ + ☆ Hypergraph Convolutional Networks for Fine-grained ICU Patient + Similarity Analysis and Risk Prediction + + +
+ The Intensive Care Unit (ICU) is one of the most important parts of a +hospital, which admits critically ill patients and provides continuous +monitoring and treatment. Various patient outcome prediction methods have been +attempted to assist healthcare professionals in clinical decision-making. +Existing methods focus on measuring the similarity between patients using deep +neural networks to capture the hidden feature structures. However, the +higher-order relationships are ignored, such as patient characteristics (e.g., +diagnosis codes) and their causal effects on downstream clinical predictions. + In this paper, we propose a novel Hypergraph Convolutional Network that +allows the representation of non-pairwise relationships among diagnosis codes +in a hypergraph to capture the hidden feature structures so that fine-grained +patient similarity can be calculated for personalized mortality risk +prediction. Evaluation using a publicly available eICU Collaborative Research +Database indicates that our method achieves superior performance over the +state-of-the-art models on mortality risk prediction. Moreover, the results of +several case studies demonstrated the effectiveness of constructing graph +networks in providing good transparency and robustness in decision-making. + +
+
+ comment: 7 pages, 2 figures, submitted to IEEE BIBM 2023 +
+
+
+
+
+ + ☆ Conditional Kernel Imitation Learning for Continuous State Environments + + +
+ Imitation Learning (IL) is an important paradigm within the broader +reinforcement learning (RL) methodology. Unlike most of RL, it does not assume +availability of reward-feedback. Reward inference and shaping are known to be +difficult and error-prone methods particularly when the demonstration data +comes from human experts. Classical methods such as behavioral cloning and +inverse reinforcement learning are highly sensitive to estimation errors, a +problem that is particularly acute in continuous state space problems. +Meanwhile, state-of-the-art IL algorithms convert behavioral policy learning +problems into distribution-matching problems which often require additional +online interaction data to be effective. In this paper, we consider the problem +of imitation learning in continuous state space environments based solely on +observed behavior, without access to transition dynamics information, reward +structure, or, most importantly, any additional interactions with the +environment. Our approach is based on the Markov balance equation and +introduces a novel conditional kernel density estimation-based imitation +learning framework. It involves estimating the environment's transition +dynamics using conditional kernel density estimators and seeks to satisfy the +probabilistic balance equations for the environment. We establish that our +estimators satisfy basic asymptotic consistency requirements. Through a series +of numerical experiments on continuous state benchmark environments, we show +consistently superior empirical performance over many state-of-the-art IL +algorithms. + +
+
+
+
+
+ + ☆ Multivariate Time-Series Anomaly Detection with Contaminated Data: + Application to Physiological Signals + + +
+ Mainstream unsupervised anomaly detection algorithms often excel in academic +datasets, yet their real-world performance is restricted due to the controlled +experimental conditions involving clean training data. Addressing the challenge +of training with noise, a prevalent issue in practical anomaly detection, is +frequently overlooked. In a pioneering endeavor, this study delves into the +realm of label-level noise within sensory time-series anomaly detection (TSAD). +This paper presents a novel and practical end-to-end unsupervised TSAD when the +training data are contaminated with anomalies. The introduced approach, called +TSAD-C, is devoid of access to abnormality labels during the training phase. +TSAD-C encompasses three modules: a Decontaminator to rectify the abnormalities +(aka noise) present in the training data, a Variable Dependency Modeling module +to capture both long-term intra- and inter-variable dependencies within the +decontaminated data that can be considered as a surrogate of the pure normal +data, and an Anomaly Scoring module to detect anomalies. Our extensive +experiments conducted on three widely used physiological datasets conclusively +demonstrate that our approach surpasses existing methodologies, thus +establishing a new state-of-the-art performance in the field. + +
+
+ comment: 9 pages, 2 tables, 3 figures +
+
+
+
+
+ + ☆ Variational Information Pursuit with Large Language and Multimodal + Models for Interpretable Predictions + + +
+ Variational Information Pursuit (V-IP) is a framework for making +interpretable predictions by design by sequentially selecting a short chain of +task-relevant, user-defined and interpretable queries about the data that are +most informative for the task. While this allows for built-in interpretability +in predictive models, applying V-IP to any task requires data samples with +dense concept-labeling by domain experts, limiting the application of V-IP to +small-scale tasks where manual data annotation is feasible. In this work, we +extend the V-IP framework with Foundational Models (FMs) to address this +limitation. More specifically, we use a two-step process, by first leveraging +Large Language Models (LLMs) to generate a sufficiently large candidate set of +task-relevant interpretable concepts, then using Large Multimodal Models to +annotate each data sample by semantic similarity with each concept in the +generated concept set. While other interpretable-by-design frameworks such as +Concept Bottleneck Models (CBMs) require an additional step of removing +repetitive and non-discriminative concepts to have good interpretability and +test performance, we mathematically and empirically justify that, with a +sufficiently informative and task-relevant query (concept) set, the proposed +FM+V-IP method does not require any type of concept filtering. In addition, we +show that FM+V-IP with LLM generated concepts can achieve better test +performance than V-IP with human annotated concepts, demonstrating the +effectiveness of LLMs at generating efficient query sets. Finally, when +compared to other interpretable-by-design frameworks such as CBMs, FM+V-IP can +achieve competitive test performance using fewer number of concepts/queries in +both cases with filtered or unfiltered concept sets. + +
+
+
+
+
+ + ☆ Deep Reinforcement Learning-driven Cross-Community Energy Interaction + Optimal Scheduling + + +
+ In order to coordinate energy interactions among various communities and +energy conversions among multi-energy subsystems within the multi-community +integrated energy system under uncertain conditions, and achieve overall +optimization and scheduling of the comprehensive energy system, this paper +proposes a comprehensive scheduling model that utilizes a multi-agent deep +reinforcement learning algorithm to learn load characteristics of different +communities and make decisions based on this knowledge. In this model, the +scheduling problem of the integrated energy system is transformed into a Markov +decision process and solved using a data-driven deep reinforcement learning +algorithm, which avoids the need for modeling complex energy coupling +relationships between multi-communities and multi-energy subsystems. The +simulation results show that the proposed method effectively captures the load +characteristics of different communities and utilizes their complementary +features to coordinate reasonable energy interactions among them. This leads to +a reduction in wind curtailment rate from 16.3% to 0% and lowers the overall +operating cost by 5445.6 Yuan, demonstrating significant economic and +environmental benefits. + +
+
+ comment: in Chinese language, Accepted by Electric Power Construction +
+
+
+
+
+ + ☆ Don't blame Dataset Shift! Shortcut Learning due to Gradients and Cross + Entropy + + +
+ Common explanations for shortcut learning assume that the shortcut improves +prediction under the training distribution but not in the test distribution. +Thus, models trained via the typical gradient-based optimization of +cross-entropy, which we call default-ERM, utilize the shortcut. However, even +when the stable feature determines the label in the training distribution and +the shortcut does not provide any additional information, like in perception +tasks, default-ERM still exhibits shortcut learning. Why are such solutions +preferred when the loss for default-ERM can be driven to zero using the stable +feature alone? By studying a linear perception task, we show that default-ERM's +preference for maximizing the margin leads to models that depend more on the +shortcut than the stable feature, even without overparameterization. This +insight suggests that default-ERM's implicit inductive bias towards max-margin +is unsuitable for perception tasks. Instead, we develop an inductive bias +toward uniform margins and show that this bias guarantees dependence only on +the perfect stable feature in the linear perception task. We develop loss +functions that encourage uniform-margin solutions, called margin control +(MARG-CTRL). MARG-CTRL mitigates shortcut learning on a variety of vision and +language tasks, showing that better inductive biases can remove the need for +expensive two-stage shortcut-mitigating methods in perception tasks. + +
+
+
+
+
+ + ☆ A Co-training Approach for Noisy Time Series Learning CIKM2023 + + +
+ In this work, we focus on robust time series representation learning. Our +assumption is that real-world time series is noisy and complementary +information from different views of the same time series plays an important +role while analyzing noisy input. Based on this, we create two views for the +input time series through two different encoders. We conduct co-training based +contrastive learning iteratively to learn the encoders. Our experiments +demonstrate that this co-training approach leads to a significant improvement +in performance. Especially, by leveraging the complementary information from +different views, our proposed TS-CoT method can mitigate the impact of data +noise and corruption. Empirical evaluations on four time series benchmarks in +unsupervised and semi-supervised settings reveal that TS-CoT outperforms +existing methods. Furthermore, the representations learned by TS-CoT can +transfer well to downstream tasks through fine-tuning. + +
+
+ comment: Accepted by CIKM2023 +
+
+
+
+
+ + ☆ CALM : A Multi-task Benchmark for Comprehensive Assessment of Language + Model Bias + + +
+ As language models (LMs) become increasingly powerful, it is important to +quantify and compare them for sociodemographic bias with potential for harm. +Prior bias measurement datasets are sensitive to perturbations in their +manually designed templates, therefore unreliable. To achieve reliability, we +introduce the Comprehensive Assessment of Language Model bias (CALM), a +benchmark dataset to quantify bias in LMs across three tasks. We integrate 16 +existing datasets across different domains, such as Wikipedia and news +articles, to filter 224 templates from which we construct a dataset of 78,400 +examples. We compare the diversity of CALM with prior datasets on metrics such +as average semantic similarity, and variation in template length, and test the +sensitivity to small perturbations. We show that our dataset is more diverse +and reliable than previous datasets, thus better capture the breadth of +linguistic variation required to reliably evaluate model bias. We evaluate 20 +large language models including six prominent families of LMs such as Llama-2. +In two LM series, OPT and Bloom, we found that larger parameter models are more +biased than lower parameter models. We found the T0 series of models to be the +least biased. Furthermore, we noticed a tradeoff between gender and racial bias +with increasing model size in some model series. The code is available at +https://github.com/vipulgupta1011/CALM. + +
+
+
+
+
+ + ☆ FedSoL: Bridging Global Alignment and Local Generality in Federated + Learning + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +client data distributions are heterogeneous. Many previous FL algorithms have +addressed this issue by introducing various proximal restrictions. These +restrictions aim to encourage global alignment by constraining the deviation of +local learning from the global objective. However, they inherently limit local +learning by interfering with the original local objectives. Recently, an +alternative approach has emerged to improve local learning generality. By +obtaining local models within a smooth loss landscape, this approach mitigates +conflicts among different local objectives of the clients. Yet, it does not +ensure stable global alignment, as local learning does not take the global +objective into account. In this study, we propose Federated Stability on +Learning (FedSoL), which combines both the concepts of global alignment and +local generality. In FedSoL, the local learning seeks a parameter region robust +against proximal perturbations. This strategy introduces an implicit proximal +restriction effect in local learning while maintaining the original local +objective for parameter update. Our experiments show that FedSoL consistently +achieves state-of-the-art performance on various setups. + +
+
+
+
+
+ + ☆ SieveNet: Selecting Point-Based Features for Mesh Networks + + +
+ Meshes are widely used in 3D computer vision and graphics, but their +irregular topology poses challenges in applying them to existing neural network +architectures. Recent advances in mesh neural networks turn to remeshing and +push the boundary of pioneer methods that solely take the raw meshes as input. +Although the remeshing offers a regular topology that significantly facilitates +the design of mesh network architectures, features extracted from such remeshed +proxies may struggle to retain the underlying geometry faithfully, limiting the +subsequent neural network's capacity. To address this issue, we propose +SieveNet, a novel paradigm that takes into account both the regular topology +and the exact geometry. Specifically, this method utilizes structured mesh +topology from remeshing and accurate geometric information from +distortion-aware point sampling on the surface of the original mesh. +Furthermore, our method eliminates the need for hand-crafted feature +engineering and can leverage off-the-shelf network architectures such as the +vision transformer. Comprehensive experimental results on classification and +segmentation tasks well demonstrate the effectiveness and superiority of our +method. + +
+
+ comment: The project homepage is https://sievenet.github.io/ +
+
+
+
+
+ + ☆ UNISOUND System for VoxCeleb Speaker Recognition Challenge 2023 + + +
+ This report describes the UNISOUND submission for Track1 and Track2 of +VoxCeleb Speaker Recognition Challenge 2023 (VoxSRC 2023). We submit the same +system on Track 1 and Track 2, which is trained with only VoxCeleb2-dev. +Large-scale ResNet and RepVGG architectures are developed for the challenge. We +propose a consistency-aware score calibration method, which leverages the +stability of audio voiceprints in similarity score by a Consistency Measure +Factor (CMF). CMF brings a huge performance boost in this challenge. Our final +system is a fusion of six models and achieves the first place in Track 1 and +second place in Track 2 of VoxSRC 2023. The minDCF of our submission is 0.0855 +and the EER is 1.5880%. + +
+
+
+
+
+ + ☆ Not Only Rewards But Also Constraints: Applications on Legged Robot + Locomotion + + +
+ Several earlier studies have shown impressive control performance in complex +robotic systems by designing the controller using a neural network and training +it with model-free reinforcement learning. However, these outstanding +controllers with natural motion style and high task performance are developed +through extensive reward engineering, which is a highly laborious and +time-consuming process of designing numerous reward terms and determining +suitable reward coefficients. In this work, we propose a novel reinforcement +learning framework for training neural network controllers for complex robotic +systems consisting of both rewards and constraints. To let the engineers +appropriately reflect their intent to constraints and handle them with minimal +computation overhead, two constraint types and an efficient policy optimization +algorithm are suggested. The learning framework is applied to train locomotion +controllers for several legged robots with different morphology and physical +attributes to traverse challenging terrains. Extensive simulation and +real-world experiments demonstrate that performant controllers can be trained +with significantly less reward engineering, by tuning only a single reward +coefficient. Furthermore, a more straightforward and intuitive engineering +process can be utilized, thanks to the interpretability and generalizability of +constraints. The summary video is available at https://youtu.be/KAlm3yskhvM. + +
+
+ comment: Submitted to Transactions on Robotics (T-RO) +
+
+
+
+
+ + ☆ Masked Autoencoders are Efficient Class Incremental Learners ICCV 2023 + + +
+ Class Incremental Learning (CIL) aims to sequentially learn new classes while +avoiding catastrophic forgetting of previous knowledge. We propose to use +Masked Autoencoders (MAEs) as efficient learners for CIL. MAEs were originally +designed to learn useful representations through reconstructive unsupervised +learning, and they can be easily integrated with a supervised loss for +classification. Moreover, MAEs can reliably reconstruct original input images +from randomly selected patches, which we use to store exemplars from past tasks +more efficiently for CIL. We also propose a bilateral MAE framework to learn +from image-level and embedding-level fusion, which produces better-quality +reconstructed images and more stable representations. Our experiments confirm +that our approach performs better than the state-of-the-art on CIFAR-100, +ImageNet-Subset, and ImageNet-Full. The code is available at +https://github.com/scok30/MAE-CIL . + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ False Information, Bots and Malicious Campaigns: Demystifying Elements + of Social Media Manipulations + + +
+ The rapid spread of false information and persistent manipulation attacks on +online social networks (OSNs), often for political, ideological, or financial +gain, has affected the openness of OSNs. While researchers from various +disciplines have investigated different manipulation-triggering elements of +OSNs (such as understanding information diffusion on OSNs or detecting +automated behavior of accounts), these works have not been consolidated to +present a comprehensive overview of the interconnections among these elements. +Notably, user psychology, the prevalence of bots, and their tactics in relation +to false information detection have been overlooked in previous research. To +address this research gap, this paper synthesizes insights from various +disciplines to provide a comprehensive analysis of the manipulation landscape. +By integrating the primary elements of social media manipulation (SMM), +including false information, bots, and malicious campaigns, we extensively +examine each SMM element. Through a systematic investigation of prior research, +we identify commonalities, highlight existing gaps, and extract valuable +insights in the field. Our findings underscore the urgent need for +interdisciplinary research to effectively combat social media manipulations, +and our systematization can guide future research efforts and assist OSN +providers in ensuring the safety and integrity of their platforms. + +
+
+
+
+
+ + ☆ Optimizing Neural Network Scale for ECG Classification + + +
+ We study scaling convolutional neural networks (CNNs), specifically targeting +Residual neural networks (ResNet), for analyzing electrocardiograms (ECGs). +Although ECG signals are time-series data, CNN-based models have been shown to +outperform other neural networks with different architectures in ECG analysis. +However, most previous studies in ECG analysis have overlooked the importance +of network scaling optimization, which significantly improves performance. We +explored and demonstrated an efficient approach to scale ResNet by examining +the effects of crucial parameters, including layer depth, the number of +channels, and the convolution kernel size. Through extensive experiments, we +found that a shallower network, a larger number of channels, and smaller kernel +sizes result in better performance for ECG classifications. The optimal network +scale might differ depending on the target task, but our findings provide +insight into obtaining more efficient and accurate models with fewer computing +resources or less time. In practice, we demonstrate that a narrower search +space based on our findings leads to higher performance. + +
+
+ comment: 30pages +
+
+
+
+
+ + ☆ Fall Detection using Knowledge Distillation Based Long short-term memory + for Offline Embedded and Low Power Devices + + +
+ This paper presents a cost-effective, low-power approach to unintentional +fall detection using knowledge distillation-based LSTM (Long Short-Term Memory) +models to significantly improve accuracy. With a primary focus on analyzing +time-series data collected from various sensors, the solution offers real-time +detection capabilities, ensuring prompt and reliable identification of falls. +The authors investigate fall detection models that are based on different +sensors, comparing their accuracy rates and performance. Furthermore, they +employ the technique of knowledge distillation to enhance the models' +precision, resulting in refined accurate configurations that consume lower +power. As a result, this proposed solution presents a compelling avenue for the +development of energy-efficient fall detection systems for future advancements +in this critical domain. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Business Metric-Aware Forecasting for Inventory Management + + +
+ Time-series forecasts play a critical role in business planning. However, +forecasters typically optimize objectives that are agnostic to downstream +business goals and thus can produce forecasts misaligned with business +preferences. In this work, we demonstrate that optimization of conventional +forecasting metrics can often lead to sub-optimal downstream business +performance. Focusing on the inventory management setting, we derive an +efficient procedure for computing and optimizing proxies of common downstream +business metrics in an end-to-end differentiable manner. We explore a wide +range of plausible cost trade-off scenarios, and empirically demonstrate that +end-to-end optimization often outperforms optimization of standard +business-agnostic forecasting metrics (by up to 45.7% for a simple scaling +model, and up to 54.0% for an LSTM encoder-decoder model). Finally, we discuss +how our findings could benefit other business contexts. + +
+
+
+
+
+ + ☆ Bayesian low-rank adaptation for large language models + + +
+ Parameter-efficient fine-tuning (PEFT) has emerged as a new paradigm for +cost-efficient fine-tuning of large language models (LLMs), with low-rank +adaptation (LoRA) being a widely adopted choice. However, fine-tuned LLMs often +become overconfident especially on when fine-tuned on smaller datasets. +Bayesian methods, with their inherent ability to estimate uncertainty, serve as +potent tools to mitigate overconfidence and enhance calibration. In this work, +we introduce Laplace-LoRA, a straightforward yet effective Bayesian method, +which applies the Laplace approximation to the LoRA parameters and, +considerably boosts the calibration of fine-tuned LLMs. + +
+
+
+
+
+ + ☆ Contrastive Learning of Temporal Distinctiveness for Survival Analysis + in Electronic Health Records CIKM 2023 + + +
+ Survival analysis plays a crucial role in many healthcare decisions, where +the risk prediction for the events of interest can support an informative +outlook for a patient's medical journey. Given the existence of data censoring, +an effective way of survival analysis is to enforce the pairwise temporal +concordance between censored and observed data, aiming to utilize the time +interval before censoring as partially observed time-to-event labels for +supervised learning. Although existing studies mostly employed ranking methods +to pursue an ordering objective, contrastive methods which learn a +discriminative embedding by having data contrast against each other, have not +been explored thoroughly for survival analysis. Therefore, in this paper, we +propose a novel Ontology-aware Temporality-based Contrastive Survival (OTCSurv) +analysis framework that utilizes survival durations from both censored and +observed data to define temporal distinctiveness and construct negative sample +pairs with adjustable hardness for contrastive learning. Specifically, we first +use an ontological encoder and a sequential self-attention encoder to represent +the longitudinal EHR data with rich contexts. Second, we design a temporal +contrastive loss to capture varying survival durations in a supervised setting +through a hardness-aware negative sampling mechanism. Last, we incorporate the +contrastive task into the time-to-event predictive task with multiple loss +components. We conduct extensive experiments using a large EHR dataset to +forecast the risk of hospitalized patients who are in danger of developing +acute kidney injury (AKI), a critical and urgent medical condition. The +effectiveness and explainability of the proposed model are validated through +comprehensive quantitative and qualitative studies. + +
+
+ comment: This paper has been accepted for publication at the CIKM 2023 + conference +
+
+
+
+
+ + ☆ Racing Towards Reinforcement Learning based control of an Autonomous + Formula SAE Car + + +
+ With the rising popularity of autonomous navigation research, Formula Student +(FS) events are introducing a Driverless Vehicle (DV) category to their event +list. This paper presents the initial investigation into utilising Deep +Reinforcement Learning (RL) for end-to-end control of an autonomous FS race car +for these competitions. We train two state-of-the-art RL algorithms in +simulation on tracks analogous to the full-scale design on a Turtlebot2 +platform. The results demonstrate that our approach can successfully learn to +race in simulation and then transfer to a real-world racetrack on the physical +platform. Finally, we provide insights into the limitations of the presented +approach and guidance into the future directions for applying RL toward +full-scale autonomous FS racing. + +
+
+ comment: Accepted at the Australasian Conference on Robotics and Automation + (ACRA 2022) +
+
+
+
+
+ + ☆ SHIELD: Sustainable Hybrid Evolutionary Learning Framework for Carbon, + Wastewater, and Energy-Aware Data Center Management + + +
+ Today's cloud data centers are often distributed geographically to provide +robust data services. But these geo-distributed data centers (GDDCs) have a +significant associated environmental impact due to their increasing carbon +emissions and water usage, which needs to be curtailed. Moreover, the energy +costs of operating these data centers continue to rise. This paper proposes a +novel framework to co-optimize carbon emissions, water footprint, and energy +costs of GDDCs, using a hybrid workload management framework called SHIELD that +integrates machine learning guided local search with a decomposition-based +evolutionary algorithm. Our framework considers geographical factors and +time-based differences in power generation/use, costs, and environmental +impacts to intelligently manage workload distribution across GDDCs and data +center operation. Experimental results show that SHIELD can realize 34.4x +speedup and 2.1x improvement in Pareto Hypervolume while reducing the carbon +footprint by up to 3.7x, water footprint by up to 1.8x, energy costs by up to +1.3x, and a cumulative improvement across all objectives (carbon, water, cost) +of up to 4.8x compared to the state-of-the-art. + +
+
+
+
+
+ + ☆ Multivariate Time Series Anomaly Detection: Fancy Algorithms and Flawed + Evaluation Methodology + + +
+ Multivariate Time Series (MVTS) anomaly detection is a long-standing and +challenging research topic that has attracted tremendous research effort from +both industry and academia recently. However, a careful study of the literature +makes us realize that 1) the community is active but not as organized as other +sibling machine learning communities such as Computer Vision (CV) and Natural +Language Processing (NLP), and 2) most proposed solutions are evaluated using +either inappropriate or highly flawed protocols, with an apparent lack of +scientific foundation. So flawed is one very popular protocol, the so-called +\pa protocol, that a random guess can be shown to systematically outperform +\emph{all} algorithms developed so far. In this paper, we review and evaluate +many recent algorithms using more robust protocols and discuss how a normally +good protocol may have weaknesses in the context of MVTS anomaly detection and +how to mitigate them. We also share our concerns about benchmark datasets, +experiment design and evaluation methodology we observe in many works. +Furthermore, we propose a simple, yet challenging, baseline algorithm based on +Principal Components Analysis (PCA) that surprisingly outperforms many recent +Deep Learning (DL) based approaches on popular benchmark datasets. The main +objective of this work is to stimulate more effort towards important aspects of +the research such as data, experiment design, evaluation methodology and result +interpretability, instead of putting the highest weight on the design of +increasingly more complex and "fancier" algorithms. + +
+
+
+
+
+ + ☆ Objective-Agnostic Enhancement of Molecule Properties via Multi-Stage + VAE + + +
+ Variational autoencoder (VAE) is a popular method for drug discovery and +various architectures and pipelines have been proposed to improve its +performance. However, VAE approaches are known to suffer from poor manifold +recovery when the data lie on a low-dimensional manifold embedded in a higher +dimensional ambient space [Dai and Wipf, 2019]. The consequences of it in drug +discovery are somewhat under-explored. In this paper, we explore applying a +multi-stage VAE approach, that can improve manifold recovery on a synthetic +dataset, to the field of drug discovery. We experimentally evaluate our +multi-stage VAE approach using the ChEMBL dataset and demonstrate its ability +to improve the property statistics of generated molecules substantially from +pre-existing methods without incorporating property predictors into the +training pipeline. We further fine-tune our models on two curated and much +smaller molecule datasets that target different proteins. Our experiments show +an increase in the number of active molecules generated by the multi-stage VAE +in comparison to their one-stage equivalent. For each of the two tasks, our +baselines include methods that use learned property predictors to incorporate +target metrics directly into the training objective and we discuss +complications that arise with this methodology. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2212.02750 +
+
+
+
+
+ + ☆ ZeroLeak: Using LLMs for Scalable and Cost Effective Side-Channel + Patching + + +
+ Security critical software, e.g., OpenSSL, comes with numerous side-channel +leakages left unpatched due to a lack of resources or experts. The situation +will only worsen as the pace of code development accelerates, with developers +relying on Large Language Models (LLMs) to automatically generate code. In this +work, we explore the use of LLMs in generating patches for vulnerable code with +microarchitectural side-channel leakages. For this, we investigate the +generative abilities of powerful LLMs by carefully crafting prompts following a +zero-shot learning approach. All generated code is dynamically analyzed by +leakage detection tools, which are capable of pinpointing information leakage +at the instruction level leaked either from secret dependent accesses or +branches or vulnerable Spectre gadgets, respectively. Carefully crafted prompts +are used to generate candidate replacements for vulnerable code, which are then +analyzed for correctness and for leakage resilience. From a cost/performance +perspective, the GPT4-based configuration costs in API calls a mere few cents +per vulnerability fixed. Our results show that LLM-based patching is far more +cost-effective and thus provides a scalable solution. Finally, the framework we +propose will improve in time, especially as vulnerability detection tools and +LLMs mature. + +
+
+
+
+
+ + ☆ Bayesian Exploration Networks + + +
+ Bayesian reinforcement learning (RL) offers a principled and elegant approach +for sequential decision making under uncertainty. Most notably, Bayesian agents +do not face an exploration/exploitation dilemma, a major pathology of +frequentist methods. A key challenge for Bayesian RL is the computational +complexity of learning Bayes-optimal policies, which is only tractable in toy +domains. In this paper we propose a novel model-free approach to address this +challenge. Rather than modelling uncertainty in high-dimensional state +transition distributions as model-based approaches do, we model uncertainty in +a one-dimensional Bellman operator. Our theoretical analysis reveals that +existing model-free approaches either do not propagate epistemic uncertainty +through the MDP or optimise over a set of contextual policies instead of all +history-conditioned policies. Both approximations yield policies that can be +arbitrarily Bayes-suboptimal. To overcome these issues, we introduce the +Bayesian exploration network (BEN) which uses normalising flows to model both +the aleatoric uncertainty (via density estimation) and epistemic uncertainty +(via variational inference) in the Bellman operator. In the limit of complete +optimisation, BEN learns true Bayes-optimal policies, but like in variational +expectation-maximisation, partial optimisation renders our approach tractable. +Empirical results demonstrate that BEN can learn true Bayes-optimal policies in +tasks where existing model-free approaches fail. + +
+
+
+
+
+ + ☆ Federated Learning of Causal Effects from Incomplete Observational Data + + +
+ Decentralized and incomplete data sources are prevalent in real-world +applications, posing a formidable challenge for causal inference. These sources +cannot be consolidated into a single entity owing to privacy constraints, and +the presence of missing values within them can potentially introduce bias to +the causal estimands. We introduce a new approach for federated causal +inference from incomplete data, enabling the estimation of causal effects from +multiple decentralized and incomplete data sources. Our approach disentangles +the loss function into multiple components, each corresponding to a specific +data source with missing values. Our approach accounts for the missing data +under the missing at random assumption, while also estimating higher-order +statistics of the causal estimands. Our method recovers the conditional +distribution of missing confounders given the observed confounders from the +decentralized data sources to identify causal effects. Our framework estimates +heterogeneous causal effects without the sharing of raw training data among +sources, which helps to mitigate privacy risks. The efficacy of our approach is +demonstrated through a collection of simulated and real-world instances, +illustrating its potential and practicality. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ The intersection of video capsule endoscopy and artificial intelligence: + addressing unique challenges using machine learning + + +
+ Introduction: Technical burdens and time-intensive review processes limit the +practical utility of video capsule endoscopy (VCE). Artificial intelligence +(AI) is poised to address these limitations, but the intersection of AI and VCE +reveals challenges that must first be overcome. We identified five challenges +to address. Challenge #1: VCE data are stochastic and contains significant +artifact. Challenge #2: VCE interpretation is cost-intensive. Challenge #3: VCE +data are inherently imbalanced. Challenge #4: Existing VCE AIMLT are +computationally cumbersome. Challenge #5: Clinicians are hesitant to accept +AIMLT that cannot explain their process. + Methods: An anatomic landmark detection model was used to test the +application of convolutional neural networks (CNNs) to the task of classifying +VCE data. We also created a tool that assists in expert annotation of VCE data. +We then created more elaborate models using different approaches including a +multi-frame approach, a CNN based on graph representation, and a few-shot +approach based on meta-learning. + Results: When used on full-length VCE footage, CNNs accurately identified +anatomic landmarks (99.1%), with gradient weighted-class activation mapping +showing the parts of each frame that the CNN used to make its decision. The +graph CNN with weakly supervised learning (accuracy 89.9%, sensitivity of +91.1%), the few-shot model (accuracy 90.8%, precision 91.4%, sensitivity +90.9%), and the multi-frame model (accuracy 97.5%, precision 91.5%, sensitivity +94.8%) performed well. Discussion: Each of these five challenges is addressed, +in part, by one of our AI-based models. Our goal of producing high performance +using lightweight models that aim to improve clinician confidence was achieved. + +
+
+
+
+
+ + ☆ Financial News Analytics Using Fine-Tuned Llama 2 GPT Model + + +
+ The paper considers the possibility to fine-tune Llama 2 Large Language Model +(LLM) for the multitask analysis of financial news. For fine-tuning, the +PEFT/LoRA based approach was used. In the study, the model was fine-tuned for +the following tasks: analysing a text from financial market perspectives, +highlighting main points of a text, summarizing a text and extracting named +entities with appropriate sentiments. The obtained results show that the +fine-tuned Llama 2 model can perform a multitask financial news analysis with a +specified structure of response, part of response can be a structured text and +another part of data can have JSON format for further processing. Extracted +sentiments for named entities can be considered as predictive features in +supervised machine learning models with quantitative target variables. + +
+
+
+
+
+ + ☆ Training Neural Networks with Universal Adiabatic Quantum Computing + + +
+ The training of neural networks (NNs) is a computationally intensive task +requiring significant time and resources. This paper presents a novel approach +to NN training using Adiabatic Quantum Computing (AQC), a paradigm that +leverages the principles of adiabatic evolution to solve optimisation problems. +We propose a universal AQC method that can be implemented on gate quantum +computers, allowing for a broad range of Hamiltonians and thus enabling the +training of expressive neural networks. We apply this approach to various +neural networks with continuous, discrete, and binary weights. Our results +indicate that AQC can very efficiently find the global minimum of the loss +function, offering a promising alternative to classical training methods. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Extreme Risk Mitigation in Reinforcement Learning using Extreme Value + Theory + + +
+ Risk-sensitive reinforcement learning (RL) has garnered significant attention +in recent years due to the growing interest in deploying RL agents in +real-world scenarios. A critical aspect of risk awareness involves modeling +highly rare risk events (rewards) that could potentially lead to catastrophic +outcomes. These infrequent occurrences present a formidable challenge for +data-driven methods aiming to capture such risky events accurately. While +risk-aware RL techniques do exist, their level of risk aversion heavily relies +on the precision of the state-action value function estimation when modeling +these rare occurrences. Our work proposes to enhance the resilience of RL +agents when faced with very rare and risky events by focusing on refining the +predictions of the extreme values predicted by the state-action value function +distribution. To achieve this, we formulate the extreme values of the +state-action value function distribution as parameterized distributions, +drawing inspiration from the principles of extreme value theory (EVT). This +approach effectively addresses the issue of infrequent occurrence by leveraging +EVT-based parameterization. Importantly, we theoretically demonstrate the +advantages of employing these parameterized distributions in contrast to other +risk-averse algorithms. Our evaluations show that the proposed method +outperforms other risk averse RL algorithms on a diverse range of benchmark +tasks, each encompassing distinct risk scenarios. + +
+
+
+
+
+ + ♻ ☆ Anderson Acceleration For Bioinformatics-Based Machine Learning IJCAI + + +
+ Anderson acceleration (AA) is a well-known method for accelerating the +convergence of iterative algorithms, with applications in various fields +including deep learning and optimization. Despite its popularity in these +areas, the effectiveness of AA in classical machine learning classifiers has +not been thoroughly studied. Tabular data, in particular, presents a unique +challenge for deep learning models, and classical machine learning models are +known to perform better in these scenarios. However, the convergence analysis +of these models has received limited attention. To address this gap in +research, we implement a support vector machine (SVM) classifier variant that +incorporates AA to speed up convergence. We evaluate the performance of our SVM +with and without Anderson acceleration on several datasets from the biology +domain and demonstrate that the use of AA significantly improves convergence +and reduces the training loss as the number of iterations increases. Our +findings provide a promising perspective on the potential of Anderson +acceleration in the training of simple machine learning classifiers and +underscore the importance of further research in this area. By showing the +effectiveness of AA in this setting, we aim to inspire more studies that +explore the applications of AA in classical machine learning. + +
+
+ comment: Accepted in KDH-2023: Knowledge Discovery in Healthcare Data (IJCAI + Workshop) +
+
+
+
+
+ + ♻ ☆ FIESTA: Autoencoders for accurate fiber segmentation in tractography + + +
+ White matter bundle segmentation is a cornerstone of modern tractography to +study the brain's structural connectivity in domains such as neurological +disorders, neurosurgery, and aging. In this study, we present FIESTA (FIbEr +Segmentation in Tractography using Autoencoders), a reliable and robust, fully +automated, and easily semi-automatically calibrated pipeline based on deep +autoencoders that can dissect and fully populate white matter bundles. This +pipeline is built upon previous works that demonstrated how autoencoders can be +used successfully for streamline filtering, bundle segmentation, and streamline +generation in tractography. Our proposed method improves bundle segmentation +coverage by recovering hard-to-track bundles with generative sampling through +the latent space seeding of the subject bundle and the atlas bundle. A latent +space of streamlines is learned using autoencoder-based modeling combined with +contrastive learning. Using an atlas of bundles in standard space (MNI), our +proposed method segments new tractograms using the autoencoder latent distance +between each tractogram streamline and its closest neighbor bundle in the atlas +of bundles. Intra-subject bundle reliability is improved by recovering +hard-to-track streamlines, using the autoencoder to generate new streamlines +that increase the spatial coverage of each bundle while remaining anatomically +correct. Results show that our method is more reliable than state-of-the-art +automated virtual dissection methods such as RecoBundles, RecoBundlesX, +TractSeg, White Matter Analysis and XTRACT. Our framework allows for the +transition from one anatomical bundle definition to another with marginal +calibration efforts. Overall, these results show that our framework improves +the practicality and usability of current state-of-the-art bundle segmentation +framework. + +
+
+ comment: 36 pages, 13 figures, accepted in NeuroImage +
+
+
+
+
+ + ♻ ☆ A Survey on Blood Pressure Measurement Technologies: Addressing + Potential Sources of Bias + + +
+ Regular blood pressure (BP) monitoring in clinical and ambulatory settings +plays a crucial role in the prevention, diagnosis, treatment, and management of +cardiovascular diseases. Recently, the widespread adoption of ambulatory BP +measurement devices has been driven predominantly by the increased prevalence +of hypertension and its associated risks and clinical conditions. Recent +guidelines advocate for regular BP monitoring as part of regular clinical +visits or even at home. This increased utilization of BP measurement +technologies has brought up significant concerns, regarding the accuracy of +reported BP values across settings. + In this survey, focusing mainly on cuff-based BP monitoring technologies, we +highlight how BP measurements can demonstrate substantial biases and variances +due to factors such as measurement and device errors, demographics, and body +habitus. With these inherent biases, the development of a new generation of +cuff-based BP devices which use artificial-intelligence (AI) has significant +potential. We present future avenues where AI-assisted technologies can +leverage the extensive clinical literature on BP-related studies together with +the large collections of BP records available in electronic health records. +These resources can be combined with machine learning approaches, including +deep learning and Bayesian inference, to remove BP measurement biases and to +provide individualized BP-related cardiovascular risk indexes. + +
+
+
+
+
+ + ♻ ☆ Exact Bayesian Inference on Discrete Models via Probability Generating + Functions: A Probabilistic Programming Approach + + +
+ We present an exact Bayesian inference method for discrete statistical +models, which can find exact solutions to many discrete inference problems, +even with infinite support and continuous priors. To express such models, we +introduce a probabilistic programming language that supports discrete and +continuous sampling, discrete observations, affine functions, (stochastic) +branching, and conditioning on events. Our key tool is probability generating +functions: they provide a compact closed-form representation of distributions +that are definable by programs, thus enabling the exact computation of +posterior probabilities, expectation, variance, and higher moments. Our +inference method is provably correct, fully automated and uses automatic +differentiation (specifically, Taylor polynomials), but does not require +computer algebra. Our experiments show that its performance on a range of +real-world examples is competitive with approximate Monte Carlo methods, while +avoiding approximation errors. + +
+
+
+
+
+ + ♻ ☆ Improving Sample Quality of Diffusion Models Using Self-Attention + Guidance ICCV 2023 + + +
+ Denoising diffusion models (DDMs) have attracted attention for their +exceptional generation quality and diversity. This success is largely +attributed to the use of class- or text-conditional diffusion guidance methods, +such as classifier and classifier-free guidance. In this paper, we present a +more comprehensive perspective that goes beyond the traditional guidance +methods. From this generalized perspective, we introduce novel condition- and +training-free strategies to enhance the quality of generated images. As a +simple solution, blur guidance improves the suitability of intermediate samples +for their fine-scale information and structures, enabling diffusion models to +generate higher quality samples with a moderate guidance scale. Improving upon +this, Self-Attention Guidance (SAG) uses the intermediate self-attention maps +of diffusion models to enhance their stability and efficacy. Specifically, SAG +adversarially blurs only the regions that diffusion models attend to at each +iteration and guides them accordingly. Our experimental results show that our +SAG improves the performance of various diffusion models, including ADM, IDDPM, +Stable Diffusion, and DiT. Moreover, combining SAG with conventional guidance +methods leads to further improvement. + +
+
+ comment: Accepted to ICCV 2023. Project Page: + https://ku-cvlab.github.io/Self-Attention-Guidance +
+
+
+
+
+ + ♻ ☆ Towards Efficient and Comprehensive Urban Spatial-Temporal Prediction: A + Unified Library and Performance Benchmark + + +
+ As deep learning technology advances and more urban spatial-temporal data +accumulates, an increasing number of deep learning models are being proposed to +solve urban spatial-temporal prediction problems. However, there are +limitations in the existing field, including open-source data being in various +formats and difficult to use, few papers making their code and data openly +available, and open-source models often using different frameworks and +platforms, making comparisons challenging. A standardized framework is urgently +needed to implement and evaluate these methods. To address these issues, we +provide a comprehensive review of urban spatial-temporal prediction and propose +a unified storage format for spatial-temporal data called atomic files. We also +propose LibCity, an open-source library that offers researchers a credible +experimental tool and a convenient development framework. In this library, we +have reproduced 65 spatial-temporal prediction models and collected 55 +spatial-temporal datasets, allowing researchers to conduct comprehensive +experiments conveniently. Using LibCity, we conducted a series of experiments +to validate the effectiveness of different models and components, and we +summarized promising future technology developments and research directions for +spatial-temporal prediction. By enabling fair model comparisons, designing a +unified data storage format, and simplifying the process of developing new +models, LibCity is poised to make significant contributions to the +spatial-temporal prediction field. + +
+
+
+
+
+ + ♻ ☆ Near Optimal Adversarial Attack on UCB Bandits ICML 2023 + + +
+ I study a stochastic multi-arm bandit problem where rewards are subject to +adversarial corruption. I propose a novel attack strategy that manipulates a +learner employing the UCB algorithm into pulling some non-optimal target arm $T +- o(T)$ times with a cumulative cost that scales as $\widehat{O}(\sqrt{\log +T})$, where $T$ is the number of rounds. I also prove the first lower bound on +the cumulative attack cost. The lower bound matches the upper bound up to +$O(\log \log T)$ factors, showing the proposed attack strategy to be near +optimal. + +
+
+ comment: Appeared at ICML 2023 AdvML Workshop +
+
+
+
+
+ + ♻ ☆ Transforming to Yoked Neural Networks to Improve ANN Structure + + +
+ Most existing classical artificial neural networks (ANN) are designed as a +tree structure to imitate neural networks. In this paper, we argue that the +connectivity of a tree is not sufficient to characterize a neural network. The +nodes of the same level of a tree cannot be connected with each other, i.e., +these neural unit cannot share information with each other, which is a major +drawback of ANN. Although ANN has been significantly improved in recent years +to more complex structures, such as the directed acyclic graph (DAG), these +methods also have unidirectional and acyclic bias for ANN. In this paper, we +propose a method to build a bidirectional complete graph for the nodes in the +same level of an ANN, which yokes the nodes of the same level to formulate a +neural module. We call our model as YNN in short. YNN promotes the information +transfer significantly which obviously helps in improving the performance of +the method. Our YNN can imitate neural networks much better compared with the +traditional ANN. In this paper, we analyze the existing structural bias of ANN +and propose a model YNN to efficiently eliminate such structural bias. In our +model, nodes also carry out aggregation and transformation of features, and +edges determine the flow of information. We further impose auxiliary sparsity +constraint to the distribution of connectedness, which promotes the learned +structure to focus on critical connections. Finally, based on the optimized +structure, we also design small neural module structure based on the minimum +cut technique to reduce the computational burden of the YNN model. This +learning process is compatible with the existing networks and different tasks. +The obtained quantitative experimental results reflect that the learned +connectivity is superior to the traditional NN structure. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2008.08261 by other authors +
+
+
+
+
+ + ♻ ☆ Farm-wide virtual load monitoring for offshore wind structures via + Bayesian neural networks + + +
+ Offshore wind structures are subject to deterioration mechanisms throughout +their operational lifetime. Even if the deterioration evolution of structural +elements can be estimated through physics-based deterioration models, the +uncertainties involved in the process hurdle the selection of lifecycle +management decisions. In this scenario, the collection of relevant information +through an efficient monitoring system enables the reduction of uncertainties, +ultimately driving more optimal lifecycle decisions. However, a full monitoring +instrumentation implemented on all wind turbines in a farm might become +unfeasible due to practical and economical constraints. Besides, certain load +monitoring systems often become defective after a few years of marine +environment exposure. Addressing the aforementioned concerns, a farm-wide +virtual load monitoring scheme directed by a fleet-leader wind turbine offers +an attractive solution. Fetched with data retrieved from a fully-instrumented +wind turbine, a model can be trained and then deployed, thus yielding load +predictions of non-fully monitored wind turbines, from which only standard data +remains available. In this paper, we propose a virtual load monitoring +framework formulated via Bayesian neural networks (BNNs) and we provide +relevant implementation details needed for the construction, training, and +deployment of BNN data-based virtual monitoring models. As opposed to their +deterministic counterparts, BNNs intrinsically announce the uncertainties +associated with generated load predictions and allow to detect inaccurate load +estimations generated for non-fully monitored wind turbines. The proposed +virtual load monitoring is thoroughly tested through an experimental campaign +in an operational offshore wind farm and the results demonstrate the +effectiveness of BNN models for fleet-leader-based farm-wide virtual +monitoring. + +
+
+
+
+
+ + ♻ ☆ Algorithmic progress in computer vision + + +
+ We investigate algorithmic progress in image classification on ImageNet, +perhaps the most well-known test bed for computer vision. We estimate a model, +informed by work on neural scaling laws, and infer a decomposition of progress +into the scaling of compute, data, and algorithms. Using Shapley values to +attribute performance improvements, we find that algorithmic improvements have +been roughly as important as the scaling of compute for progress computer +vision. Our estimates indicate that algorithmic innovations mostly take the +form of compute-augmenting algorithmic advances (which enable researchers to +get better performance from less compute), not data-augmenting algorithmic +advances. We find that compute-augmenting algorithmic advances are made at a +pace more than twice as fast as the rate usually associated with Moore's law. +In particular, we estimate that compute-augmenting innovations halve compute +requirements every nine months (95\% confidence interval: 4 to 25 months). + +
+
+
+
+
+ + ♻ ☆ Symplectic model reduction of Hamiltonian systems using data-driven + quadratic manifolds + + +
+ This work presents two novel approaches for the symplectic model reduction of +high-dimensional Hamiltonian systems using data-driven quadratic manifolds. +Classical symplectic model reduction approaches employ linear symplectic +subspaces for representing the high-dimensional system states in a +reduced-dimensional coordinate system. While these approximations respect the +symplectic nature of Hamiltonian systems, linear basis approximations can +suffer from slowly decaying Kolmogorov $N$-width, especially in wave-type +problems, which then requires a large basis size. We propose two different +model reduction methods based on recently developed quadratic manifolds, each +presenting its own advantages and limitations. The addition of quadratic terms +to the state approximation, which sits at the heart of the proposed +methodologies, enables us to better represent intrinsic low-dimensionality in +the problem at hand. Both approaches are effective for issuing predictions in +settings well outside the range of their training data while providing more +accurate solutions than the linear symplectic reduced-order models. + +
+
+
+
+
+ + ♻ ☆ Leveraging Global Binary Masks for Structure Segmentation in Medical + Images + + +
+ Deep learning (DL) models for medical image segmentation are highly +influenced by intensity variations of input images and lack generalization due +to primarily utilizing pixels' intensity information for inference. Acquiring +sufficient training data is another challenge limiting models' applications. We +proposed to leverage the consistency of organs' anatomical shape and position +information in medical images. We introduced a framework leveraging recurring +anatomical patterns through global binary masks for organ segmentation. Two +scenarios were studied.1) Global binary masks were the only model's (i.e. +U-Net) input, forcing exclusively encoding organs' position and shape +information for segmentation/localization.2) Global binary masks were +incorporated as an additional channel functioning as position/shape clues to +mitigate training data scarcity. Two datasets of the brain and heart CT images +with their ground-truth were split into (26:10:10) and (12:3:5) for training, +validation, and test respectively. Training exclusively on global binary masks +led to Dice scores of 0.77(0.06) and 0.85(0.04), with the average Euclidian +distance of 3.12(1.43)mm and 2.5(0.93)mm relative to the center of mass of the +ground truth for the brain and heart structures respectively. The outcomes +indicate that a surprising degree of position and shape information is encoded +through global binary masks. Incorporating global binary masks led to +significantly higher accuracy relative to the model trained on only CT images +in small subsets of training data; the performance improved by 4.3-125.3% and +1.3-48.1% for 1-8 training cases of the brain and heart datasets respectively. +The findings imply the advantages of utilizing global binary masks for building +generalizable models and to compensate for training data scarcity. + +
+
+
+
+
+ + ♻ ☆ A Survey on Dataset Distillation: Approaches, Applications and Future + Directions + + +
+ Dataset distillation is attracting more attention in machine learning as +training sets continue to grow and the cost of training state-of-the-art models +becomes increasingly high. By synthesizing datasets with high information +density, dataset distillation offers a range of potential applications, +including support for continual learning, neural architecture search, and +privacy protection. Despite recent advances, we lack a holistic understanding +of the approaches and applications. Our survey aims to bridge this gap by first +proposing a taxonomy of dataset distillation, characterizing existing +approaches, and then systematically reviewing the data modalities, and related +applications. In addition, we summarize the challenges and discuss future +directions for this field of research. + +
+
+
+
+
+ + ♻ ☆ Unifying Gradients to Improve Real-world Robustness for Deep Networks + + +
+ The wide application of deep neural networks (DNNs) demands an increasing +amount of attention to their real-world robustness, i.e., whether a DNN resists +black-box adversarial attacks, among which score-based query attacks (SQAs) are +most threatening since they can effectively hurt a victim network with the only +access to model outputs. Defending against SQAs requires a slight but artful +variation of outputs due to the service purpose for users, who share the same +output information with SQAs. In this paper, we propose a real-world defense by +Unifying Gradients (UniG) of different data so that SQAs could only probe a +much weaker attack direction that is similar for different samples. Since such +universal attack perturbations have been validated as less aggressive than the +input-specific perturbations, UniG protects real-world DNNs by indicating +attackers a twisted and less informative attack direction. We implement UniG +efficiently by a Hadamard product module which is plug-and-play. According to +extensive experiments on 5 SQAs, 2 adaptive attacks and 7 defense baselines, +UniG significantly improves real-world robustness without hurting clean +accuracy on CIFAR10 and ImageNet. For instance, UniG maintains a model of +77.80% accuracy under 2500-query Square attack while the state-of-the-art +adversarially-trained model only has 67.34% on CIFAR10. Simultaneously, UniG +outperforms all compared baselines in terms of clean accuracy and achieves the +smallest modification of the model output. The code is released at +https://github.com/snowien/UniG-pytorch. + +
+
+
+
+
+ + ♻ ☆ Min-Max Optimization under Delays + + +
+ Delays and asynchrony are inevitable in large-scale machine-learning problems +where communication plays a key role. As such, several works have extensively +analyzed stochastic optimization with delayed gradients. However, as far as we +are aware, no analogous theory is available for min-max optimization, a topic +that has gained recent popularity due to applications in adversarial +robustness, game theory, and reinforcement learning. Motivated by this gap, we +examine the performance of standard min-max optimization algorithms with +delayed gradient updates. First, we show (empirically) that even small delays +can cause prominent algorithms like Extra-gradient (\texttt{EG}) to diverge on +simple instances for which \texttt{EG} guarantees convergence in the absence of +delays. Our empirical study thus suggests the need for a careful analysis of +delayed versions of min-max optimization algorithms. Accordingly, under +suitable technical assumptions, we prove that Gradient Descent-Ascent +(\texttt{GDA}) and \texttt{EG} with delayed updates continue to guarantee +convergence to saddle points for convex-concave and strongly convex-strongly +concave settings. Our complexity bounds reveal, in a transparent manner, the +slow-down in convergence caused by delays. + +
+
+
+
+
+ + ♻ ☆ An Accelerated Block Proximal Framework with Adaptive Momentum for + Nonconvex and Nonsmooth Optimization + + +
+ We propose an accelerated block proximal linear framework with adaptive +momentum (ABPL$^+$) for nonconvex and nonsmooth optimization. We analyze the +potential causes of the extrapolation step failing in some algorithms, and +resolve this issue by enhancing the comparison process that evaluates the +trade-off between the proximal gradient step and the linear extrapolation step +in our algorithm. Furthermore, we extends our algorithm to any scenario +involving updating block variables with positive integers, allowing each cycle +to randomly shuffle the update order of the variable blocks. Additionally, +under mild assumptions, we prove that ABPL$^+$ can monotonically decrease the +function value without strictly restricting the extrapolation parameters and +step size, demonstrates the viability and effectiveness of updating these +blocks in a random order, and we also more obviously and intuitively +demonstrate that the derivative set of the sequence generated by our algorithm +is a critical point set. Moreover, we demonstrate the global convergence as +well as the linear and sublinear convergence rates of our algorithm by +utilizing the Kurdyka-Lojasiewicz (K{\L}) condition. To enhance the +effectiveness and flexibility of our algorithm, we also expand the study to the +imprecise version of our algorithm and construct an adaptive extrapolation +parameter strategy, which improving its overall performance. We apply our +algorithm to multiple non-negative matrix factorization with the $\ell_0$ norm, +nonnegative tensor decomposition with the $\ell_0$ norm, and perform extensive +numerical experiments to validate its effectiveness and efficiency. + +
+
+
+
+
+ + ♻ ☆ Individual Privacy Accounting with Gaussian Differential Privacy + + +
+ Individual privacy accounting enables bounding differential privacy (DP) loss +individually for each participant involved in the analysis. This can be +informative as often the individual privacy losses are considerably smaller +than those indicated by the DP bounds that are based on considering worst-case +bounds at each data access. In order to account for the individual privacy +losses in a principled manner, we need a privacy accountant for adaptive +compositions of randomised mechanisms, where the loss incurred at a given data +access is allowed to be smaller than the worst-case loss. This kind of analysis +has been carried out for the R\'enyi differential privacy (RDP) by Feldman and +Zrnic (2021), however not yet for the so-called optimal privacy accountants. We +make first steps in this direction by providing a careful analysis using the +Gaussian differential privacy which gives optimal bounds for the Gaussian +mechanism, one of the most versatile DP mechanisms. This approach is based on +determining a certain supermartingale for the hockey-stick divergence and on +extending the R\'enyi divergence-based fully adaptive composition results by +Feldman and Zrnic. We also consider measuring the individual +$(\varepsilon,\delta)$-privacy losses using the so-called privacy loss +distributions. With the help of the Blackwell theorem, we can then make use of +the RDP analysis to construct an approximative individual +$(\varepsilon,\delta)$-accountant. + +
+
+ comment: 31 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Interneurons accelerate learning dynamics in recurrent neural networks + for statistical adaptation + + +
+ Early sensory systems in the brain rapidly adapt to fluctuating input +statistics, which requires recurrent communication between neurons. +Mechanistically, such recurrent communication is often indirect and mediated by +local interneurons. In this work, we explore the computational benefits of +mediating recurrent communication via interneurons compared with direct +recurrent connections. To this end, we consider two mathematically tractable +recurrent linear neural networks that statistically whiten their inputs -- one +with direct recurrent connections and the other with interneurons that mediate +recurrent communication. By analyzing the corresponding continuous synaptic +dynamics and numerically simulating the networks, we show that the network with +interneurons is more robust to initialization than the network with direct +recurrent connections in the sense that the convergence time for the synaptic +dynamics in the network with interneurons (resp. direct recurrent connections) +scales logarithmically (resp. linearly) with the spectrum of their +initialization. Our results suggest that interneurons are computationally +useful for rapid adaptation to changing input statistics. Interestingly, the +network with interneurons is an overparameterized solution of the whitening +objective for the network with direct recurrent connections, so our results can +be viewed as a recurrent linear neural network analogue of the implicit +acceleration phenomenon observed in overparameterized feedforward linear neural +networks. + +
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Evaluation of ChatGPT on Biomedical Tasks: A Zero-Shot Comparison with + Fine-Tuned Generative Transformers ACL 2023 + + +
+ ChatGPT is a large language model developed by OpenAI. Despite its impressive +performance across various tasks, no prior work has investigated its capability +in the biomedical domain yet. To this end, this paper aims to evaluate the +performance of ChatGPT on various benchmark biomedical tasks, such as relation +extraction, document classification, question answering, and summarization. To +the best of our knowledge, this is the first work that conducts an extensive +evaluation of ChatGPT in the biomedical domain. Interestingly, we find based on +our evaluation that in biomedical datasets that have smaller training sets, +zero-shot ChatGPT even outperforms the state-of-the-art fine-tuned generative +transformer models, such as BioGPT and BioBART. This suggests that ChatGPT's +pre-training on large text corpora makes it quite specialized even in the +biomedical domain. Our findings demonstrate that ChatGPT has the potential to +be a valuable tool for various tasks in the biomedical domain that lack large +annotated data. + +
+
+ comment: Accepted by BioNLP@ACL 2023 +
+
+
+
+
+ + ♻ ☆ Efficient Sensor Placement from Regression with Sparse Gaussian + Processes in Continuous and Discrete Spaces + + +
+ The sensor placement problem is a common problem that arises when monitoring +correlated phenomena, such as temperature and precipitation. Existing +approaches to this problem typically use discrete optimization methods, which +are computationally expensive and cannot scale to large problems. We address +the sensor placement problem in correlated environments by reducing it to a +regression problem that can be efficiently solved using sparse Gaussian +processes (SGPs). Our approach can handle both discrete sensor placement +problems-where sensors are limited to a subset of a given set of locations-and +continuous sensor placement problems-where sensors can be placed anywhere in a +bounded continuous region. We further generalize our approach to handle sensors +with a non-point field of view and integrated observations. Our experimental +results on three real-world datasets show that our approach generates sensor +placements that result in reconstruction quality that is consistently on par or +better than the prior state-of-the-art approach while being significantly +faster. Our computationally efficient approach enables both large-scale sensor +placement and fast robotic sensor placement for informative path planning +algorithms. + +
+
+ comment: 10 pages, 4 figures, preprint, appendix +
+
+
+
+
+ + ♻ ☆ Universal Soldier: Using Universal Adversarial Perturbations for + Detecting Backdoor Attacks + + +
+ Deep learning models achieve excellent performance in numerous machine +learning tasks. Yet, they suffer from security-related issues such as +adversarial examples and poisoning (backdoor) attacks. A deep learning model +may be poisoned by training with backdoored data or by modifying inner network +parameters. Then, a backdoored model performs as expected when receiving a +clean input, but it misclassifies when receiving a backdoored input stamped +with a pre-designed pattern called "trigger". Unfortunately, it is difficult to +distinguish between clean and backdoored models without prior knowledge of the +trigger. This paper proposes a backdoor detection method by utilizing a special +type of adversarial attack, universal adversarial perturbation (UAP), and its +similarities with a backdoor trigger. We observe an intuitive phenomenon: UAPs +generated from backdoored models need fewer perturbations to mislead the model +than UAPs from clean models. UAPs of backdoored models tend to exploit the +shortcut from all classes to the target class, built by the backdoor trigger. +We propose a novel method called Universal Soldier for Backdoor detection (USB) +and reverse engineering potential backdoor triggers via UAPs. Experiments on +345 models trained on several datasets show that USB effectively detects the +injected backdoor and provides comparable or better results than +state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ HypBO: Expert-Guided Chemist-in-the-Loop Bayesian Search for New + Materials + + +
+ Robotics and automation offer massive accelerations for solving intractable, +multivariate scientific problems such as materials discovery, but the available +search spaces can be dauntingly large. Bayesian optimization (BO) has emerged +as a popular sample-efficient optimization engine, thriving in tasks where no +analytic form of the target function/property is known. Here we exploit expert +human knowledge in the form of hypotheses to direct Bayesian searches more +quickly to promising regions of chemical space. Previous methods have used +underlying distributions derived from existing experimental measurements, which +is unfeasible for new, unexplored scientific tasks. Also, such distributions +cannot capture intricate hypotheses. Our proposed method, which we call HypBO, +uses expert human hypotheses to generate an improved seed of samples. +Unpromising seeds are automatically discounted, while promising seeds are used +to augment the surrogate model data, thus achieving better-informed sampling. +This process continues in a global versus local search fashion, organized in a +bilevel optimization framework. We validate the performance of our method on a +range of synthetic functions and demonstrate its practical utility on a real +chemical design task where the use of expert hypotheses accelerates the search +performance significantly. + +
+
+
+
+
+ + ♻ ☆ On the Generalization of PINNs outside the training domain and the + Hyperparameters influencing it + + +
+ Physics-Informed Neural Networks (PINNs) are Neural Network architectures +trained to emulate solutions of differential equations without the necessity of +solution data. They are currently ubiquitous in the scientific literature due +to their flexible and promising settings. However, very little of the available +research provides practical studies that aim for a better quantitative +understanding of such architecture and its functioning. In this paper, we +perform an empirical analysis of the behavior of PINN predictions outside their +training domain. The primary goal is to investigate the scenarios in which a +PINN can provide consistent predictions outside the training area. +Thereinafter, we assess whether the algorithmic setup of PINNs can influence +their potential for generalization and showcase the respective effect on the +prediction. The results obtained in this study returns insightful and at times +counterintuitive perspectives which can be highly relevant for architectures +which combines PINNs with domain decomposition and/or adaptive training +strategies. + +
+
+
+
+
+ + ♻ ☆ The Polynomial Method is Universal for Distribution-Free Correlational + SQ Learning + + +
+ We consider the problem of distribution-free learning for Boolean function +classes in the PAC and agnostic models. Generalizing a beautiful work of Malach +and Shalev-Shwartz (2022) that gave tight correlational SQ (CSQ) lower bounds +for learning DNF formulas, we give new proofs that lower bounds on the +threshold or approximate degree of any function class directly imply CSQ lower +bounds for PAC or agnostic learning respectively. While such bounds implicitly +follow by combining prior results by Feldman (2008, 2012) and Sherstov (2008, +2011), to our knowledge the precise statements we give had not appeared in this +form before. Moreover, our proofs are simple and largely self-contained. + These lower bounds match corresponding positive results using upper bounds on +the threshold or approximate degree in the SQ model for PAC or agnostic +learning, and in this sense these results show that the polynomial method is a +universal, best-possible approach for distribution-free CSQ learning. + +
+
+ comment: v3: Improved discussion of relation to prior work +
+
+
+
+
+ + ♻ ☆ A temporally and spatially local spike-based backpropagation algorithm + to enable training in hardware + + +
+ Spiking Neural Networks (SNNs) have emerged as a hardware efficient +architecture for classification tasks. The challenge of spike-based encoding +has been the lack of a universal training mechanism performed entirely using +spikes. There have been several attempts to adopt the powerful backpropagation +(BP) technique used in non-spiking artificial neural networks (ANN): (1) SNNs +can be trained by externally computed numerical gradients. (2) A major +advancement towards native spike-based learning has been the use of approximate +Backpropagation using spike-time dependent plasticity (STDP) with phased +forward/backward passes. However, the transfer of information between such +phases for gradient and weight update calculation necessitates external memory +and computational access. This is a challenge for standard neuromorphic +hardware implementations. In this paper, we propose a stochastic SNN based +Back-Prop (SSNN-BP) algorithm that utilizes a composite neuron to +simultaneously compute the forward pass activations and backward pass gradients +explicitly with spikes. Although signed gradient values are a challenge for +spike-based representation, we tackle this by splitting the gradient signal +into positive and negative streams. We show that our method approaches BP ANN +baseline with sufficiently long spike-trains. Finally, we show that the +well-performing softmax cross-entropy loss function can be implemented through +inhibitory lateral connections enforcing a Winner Take All (WTA) rule. Our SNN +with a 2-layer network shows excellent generalization through comparable +performance to ANNs with equivalent architecture and regularization parameters +on static image datasets like MNIST, Fashion-MNIST, Extended MNIST, and +temporally encoded image datasets like Neuromorphic MNIST datasets. Thus, +SSNN-BP enables BP compatible with purely spike-based neuromorphic hardware. + +
+
+
+
+
+ + ♻ ☆ Dealing with Small Datasets for Deep Learning in Medical Imaging: An + Evaluation of Self-Supervised Pre-Training on CT Scans Comparing Contrastive + and Masked Autoencoder Methods for Convolutional Models + + +
+ Deep learning in medical imaging has the potential to minimize the risk of +diagnostic errors, reduce radiologist workload, and accelerate diagnosis. +Training such deep learning models requires large and accurate datasets, with +annotations for all training samples. However, in the medical imaging domain, +annotated datasets for specific tasks are often small due to the high +complexity of annotations, limited access, or the rarity of diseases. To +address this challenge, deep learning models can be pre-trained on large image +datasets without annotations using methods from the field of self-supervised +learning. After pre-training, small annotated datasets are sufficient to +fine-tune the models for a specific task. The most popular self-supervised +pre-training approaches in medical imaging are based on contrastive learning. +However, recent studies in natural image processing indicate a strong potential +for masked autoencoder approaches. Our work compares state-of-the-art +contrastive learning methods with the recently introduced masked autoencoder +approach "SparK" for convolutional neural networks (CNNs) on medical images. +Therefore we pre-train on a large unannotated CT image dataset and fine-tune on +several CT classification tasks. Due to the challenge of obtaining sufficient +annotated training data in medical imaging, it is of particular interest to +evaluate how the self-supervised pre-training methods perform when fine-tuning +on small datasets. By experimenting with gradually reducing the training +dataset size for fine-tuning, we find that the reduction has different effects +depending on the type of pre-training chosen. The SparK pre-training method is +more robust to the training dataset size than the contrastive methods. Based on +our results, we propose the SparK pre-training for medical imaging tasks with +only small annotated datasets. + +
+
+ comment: This paper is under review. The code will be released if accepted +
+
+
+
+
+ + ♻ ☆ FlexFringe: Modeling Software Behavior by Learning Probabilistic + Automata + + +
+ We present the efficient implementations of probabilistic deterministic +finite automaton learning methods available in FlexFringe. These implement +well-known strategies for state-merging including several modifications to +improve their performance in practice. We show experimentally that these +algorithms obtain competitive results and significant improvements over a +default implementation. We also demonstrate how to use FlexFringe to learn +interpretable models from software logs and use these for anomaly detection. +Although less interpretable, we show that learning smaller more convoluted +models improves the performance of FlexFringe on anomaly detection, +outperforming an existing solution based on neural nets. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Training with Autoencoders for Visual Anomaly Detection + + +
+ Deep autoencoders provide an effective tool for learning non-linear +dimensionality reduction in an unsupervised way. Recently, they have been used +for the task of anomaly detection in the visual domain. By optimizing for the +reconstruction error using anomaly-free examples, the common belief is that a +corresponding network should fail to accurately reconstruct anomalous regions +in the application phase. This goal is typically addressed by controlling the +capacity of the network, either by reducing the size of the bottleneck layer or +by enforcing sparsity constraints on the activations. However, neither of these +techniques does explicitly penalize reconstruction of anomalous signals often +resulting in poor detection. We tackle this problem by adapting a +self-supervised learning regime that allows the use of discriminative +information during training but focuses on the data manifold of normal +examples. We emphasize that inference with our approach is very efficient +during training and prediction requiring a single forward pass for each input +image. Our experiments on the MVTec AD dataset demonstrate high detection and +localization performance. On the texture-subset, in particular, our approach +consistently outperforms recent anomaly detection methods by a significant +margin. + +
+
+
+
+
+ + ♻ ☆ Riemannian Hamiltonian methods for min-max optimization on manifolds + + +
+ In this paper, we study min-max optimization problems on Riemannian +manifolds. We introduce a Riemannian Hamiltonian function, minimization of +which serves as a proxy for solving the original min-max problems. Under the +Riemannian Polyak--{\L}ojasiewicz condition on the Hamiltonian function, its +minimizer corresponds to the desired min-max saddle point. We also provide +cases where this condition is satisfied. For geodesic-bilinear optimization in +particular, solving the proxy problem leads to the correct search direction +towards global optimality, which becomes challenging with the min-max +formulation. To minimize the Hamiltonian function, we propose Riemannian +Hamiltonian methods (RHM) and present their convergence analyses. We extend RHM +to include consensus regularization and to the stochastic setting. We +illustrate the efficacy of the proposed RHM in applications such as subspace +robust Wasserstein distance, robust training of neural networks, and generative +adversarial networks. + +
+
+ comment: Extended version with proofs +
+
+
+
+
+ + ♻ To Compress or Not to Compress- Self-Supervised Learning and Information + Theory: A Review + + +
+ \begin{abstract} Deep neural networks excel in supervised learning tasks but +are constrained by the need for extensive labeled data. Self-supervised +learning emerges as a promising alternative, allowing models to learn without +explicit labels. Information theory, and notably the information bottleneck +principle, has been pivotal in shaping deep neural networks. This principle +focuses on optimizing the trade-off between compression and preserving relevant +information, providing a foundation for efficient network design in supervised +contexts. However, its precise role and adaptation in self-supervised learning +remain unclear. In this work, we scrutinize various self-supervised learning +approaches from an information-theoretic perspective, introducing a unified +framework that encapsulates the self-supervised information-theoretic learning +problem. We weave together existing research into a cohesive narrative, delve +into contemporary self-supervised methodologies, and spotlight potential +research avenues and inherent challenges. Additionally, we discuss the +empirical evaluation of information-theoretic quantities and their estimation +methods. Overall, this paper furnishes an exhaustive review of the intersection +of information theory, self-supervised learning, and deep neural networks. + +
+
+
+
+
+ + ♻ ☆ Efficient-Adam: Communication-Efficient Distributed Adam + + +
+ Distributed adaptive stochastic gradient methods have been widely used for +large-scale nonconvex optimization, such as training deep learning models. +However, their communication complexity on finding $\varepsilon$-stationary +points has rarely been analyzed in the nonconvex setting. In this work, we +present a novel communication-efficient distributed Adam in the +parameter-server model for stochastic nonconvex optimization, dubbed {\em +Efficient-Adam}. Specifically, we incorporate a two-way quantization scheme +into Efficient-Adam to reduce the communication cost between the workers and +server. Simultaneously, we adopt a two-way error feedback strategy to reduce +the biases caused by the two-way quantization on both the server and workers, +respectively. In addition, we establish the iteration complexity for the +proposed Efficient-Adam with a class of quantization operators, and further +characterize its communication complexity between the server and workers when +an $\varepsilon$-stationary point is achieved. Finally, we apply Efficient-Adam +to solve a toy stochastic convex optimization problem and train deep learning +models on real-world vision and language tasks. Extensive experiments together +with a theoretical guarantee justify the merits of Efficient Adam. + +
+
+ comment: IEEE Transactions on Signal Processing +
+
+
+
+
+ + ♻ ☆ HyperTab: Hypernetwork Approach for Deep Learning on Small Tabular + Datasets + + +
+ Deep learning has achieved impressive performance in many domains, such as +computer vision and natural language processing, but its advantage over +classical shallow methods on tabular datasets remains questionable. It is +especially challenging to surpass the performance of tree-like ensembles, such +as XGBoost or Random Forests, on small-sized datasets (less than 1k samples). +To tackle this challenge, we introduce HyperTab, a hypernetwork-based approach +to solving small sample problems on tabular datasets. By combining the +advantages of Random Forests and neural networks, HyperTab generates an +ensemble of neural networks, where each target model is specialized to process +a specific lower-dimensional view of the data. Since each view plays the role +of data augmentation, we virtually increase the number of training samples +while keeping the number of trainable parameters unchanged, which prevents +model overfitting. We evaluated HyperTab on more than 40 tabular datasets of a +varying number of samples and domains of origin, and compared its performance +with shallow and deep learning models representing the current +state-of-the-art. We show that HyperTab consistently outranks other methods on +small data (with a statistically significant difference) and scores comparable +to them on larger datasets. + We make a python package with the code available to download at +https://pypi.org/project/hypertab/ + +
+
+
+
+
+ + ♻ ☆ Breaking the Communication-Privacy-Accuracy Tradeoff with + $f$-Differential Privacy + + +
+ We consider a federated data analytics problem in which a server coordinates +the collaborative data analysis of multiple users with privacy concerns and +limited communication capability. The commonly adopted compression schemes +introduce information loss into local data while improving communication +efficiency, and it remains an open problem whether such discrete-valued +mechanisms provide any privacy protection. In this paper, we study the local +differential privacy guarantees of discrete-valued mechanisms with finite +output space through the lens of $f$-differential privacy (DP). More +specifically, we advance the existing literature by deriving tight $f$-DP +guarantees for a variety of discrete-valued mechanisms, including the binomial +noise and the binomial mechanisms that are proposed for privacy preservation, +and the sign-based methods that are proposed for data compression, in +closed-form expressions. We further investigate the amplification in privacy by +sparsification and propose a ternary stochastic compressor. By leveraging +compression for privacy amplification, we improve the existing methods by +removing the dependency of accuracy (in terms of mean square error) on +communication cost in the popular use case of distributed mean estimation, +therefore breaking the three-way tradeoff between privacy, communication, and +accuracy. Finally, we discuss the Byzantine resilience of the proposed +mechanism and its application in federated learning. + +
+
+
+
+
+ + ♻ ☆ Bridging the Gap between Chemical Reaction Pretraining and Conditional + Molecule Generation with a Unified Model + + +
+ Chemical reactions are the fundamental building blocks of drug design and +organic chemistry research. In recent years, there has been a growing need for +a large-scale deep-learning framework that can efficiently capture the basic +rules of chemical reactions. In this paper, we have proposed a unified +framework that addresses both the reaction representation learning and molecule +generation tasks, which allows for a more holistic approach. Inspired by the +organic chemistry mechanism, we develop a novel pretraining framework that +enables us to incorporate inductive biases into the model. Our framework +achieves state-of-the-art results on challenging downstream tasks. By +possessing chemical knowledge, our generative framework overcome the +limitations of current molecule generation models that rely on a small number +of reaction templates. In the extensive experiments, our model generates +synthesizable drug-like structures of high quality. Overall, our work presents +a significant step toward a large-scale deep-learning framework for a variety +of reaction-based applications. + +
+
+
+
+
+ + ♻ ☆ Tackling Face Verification Edge Cases: In-Depth Analysis and + Human-Machine Fusion Approach + + +
+ Nowadays, face recognition systems surpass human performance on several +datasets. However, there are still edge cases that the machine can't correctly +classify. This paper investigates the effect of a combination of machine and +human operators in the face verification task. First, we look closer at the +edge cases for several state-of-the-art models to discover common datasets' +challenging settings. Then, we conduct a study with 60 participants on these +selected tasks with humans and provide an extensive analysis. Finally, we +demonstrate that combining machine and human decisions can further improve the +performance of state-of-the-art face verification systems on various benchmark +datasets. Code and data are publicly available on GitHub. + +
+
+
+
+
+ + ♻ ☆ Equal Treatment: Measuring Fairness using Explanation Distributions + + +
+ Liberalism-oriented political philosophy reasons that all individuals should +be treated equally independently of their protected characteristics. Related +work in machine learning has translated the concept of equal treatment into +terms of equal outcome and measured it as demographic parity (also called +statistical parity). Our analysis reveals that the two concepts of equal +outcome and equal treatment diverge; therefore, demographic parity does not +faithfully represent the notion of equal treatment. We propose a new +formalization for equal treatment by (i) considering the influence of feature +values on predictions, such as computed by Shapley values explaining +classifications, (ii) defining distributions of explanations, and (iii) +comparing explanation distributions between populations with different +protected characteristics. We show the theoretical properties of our notion of +equal treatment and devise a classifier two-sample test based on the AUC of an +equal treatment inspector. We study our formalization of equal treatment on +synthetic and natural data. We release explanationspace, an open-source Python +package with methods and tutorials. + +
+
+
+
+
+ + ♻ ☆ Towards Top-Down Automated Development in Limited Scopes: A + Neuro-Symbolic Framework from Expressibles to Executables + + +
+ Deep code generation is a topic of deep learning for software engineering +(DL4SE), which adopts neural models to generate code for the intended +functions. Since end-to-end neural methods lack domain knowledge and software +hierarchy awareness, they tend to perform poorly w.r.t project-level tasks. To +systematically explore the potential improvements of code generation, we let it +participate in the whole top-down development from \emph{expressibles} to +\emph{executables}, which is possible in limited scopes. In the process, it +benefits from massive samples, features, and knowledge. As the foundation, we +suggest building a taxonomy on code data, namely code taxonomy, leveraging the +categorization of code information. Moreover, we introduce a three-layer +semantic pyramid (SP) to associate text data and code data. It identifies the +information of different abstraction levels, and thus introduces the domain +knowledge on development and reveals the hierarchy of software. Furthermore, we +propose a semantic pyramid framework (SPF) as the approach, focusing on +software of high modularity and low complexity. SPF divides the code generation +process into stages and reserves spots for potential interactions. In addition, +we conceived preliminary applications in software development to confirm the +neuro-symbolic framework. + +
+
+ comment: 5 pages, 3 figures, 2 tables, accepted by ESEC/FSE 2023, the + camera-ready version +
+
+
+
+
+ + ♻ ☆ A multiobjective continuation method to compute the regularization path + of deep neural networks + + +
+ Sparsity is a highly desired feature in deep neural networks (DNNs) since it +ensures numerical efficiency, improves the interpretability of models (due to +the smaller number of relevant features), and robustness. In machine learning +approaches based on linear models, it is well known that there exists a +connecting path between the sparsest solution in terms of the $\ell^1$ norm +(i.e., zero weights) and the non-regularized solution, which is called the +regularization path. Very recently, there was a first attempt to extend the +concept of regularization paths to DNNs by means of treating the empirical loss +and sparsity ($\ell^1$ norm) as two conflicting criteria and solving the +resulting multiobjective optimization problem. However, due to the +non-smoothness of the $\ell^1$ norm and the high number of parameters, this +approach is not very efficient from a computational perspective. To overcome +this limitation, we present an algorithm that allows for the approximation of +the entire Pareto front for the above-mentioned objectives in a very efficient +manner. We present numerical examples using both deterministic and stochastic +gradients. We furthermore demonstrate that knowledge of the regularization path +allows for a well-generalizing network parametrization. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Exact Manifold Gaussian Variational Bayes + + +
+ We propose an optimization algorithm for Variational Inference (VI) in +complex models. Our approach relies on natural gradient updates where the +variational space is a Riemann manifold. We develop an efficient algorithm for +Gaussian Variational Inference that implicitly satisfies the positive definite +constraint on the variational covariance matrix. Our Exact manifold Gaussian +Variational Bayes (EMGVB) provides exact but simple update rules and is +straightforward to implement. Due to its black-box nature, EMGVB stands as a +ready-to-use solution for VI in complex models. Over five datasets, we +empirically validate our feasible approach on different statistical, +econometric, and deep learning models, discussing its performance with respect +to baseline methods. + +
+
+
+
+
+ + ♻ ☆ PromptBench: Towards Evaluating the Robustness of Large Language Models + on Adversarial Prompts + + +
+ The increasing reliance on Large Language Models (LLMs) across academia and +industry necessitates a comprehensive understanding of their robustness to +prompts. In response to this vital need, we introduce PromptBench, a robustness +benchmark designed to measure LLMs' resilience to adversarial prompts. This +study uses a plethora of adversarial textual attacks targeting prompts across +multiple levels: character, word, sentence, and semantic. These prompts are +then employed in diverse tasks, such as sentiment analysis, natural language +inference, reading comprehension, machine translation, and math +problem-solving. Our study generates 4,032 adversarial prompts, meticulously +evaluated over 8 tasks and 13 datasets, with 567,084 test samples in total. Our +findings demonstrate that contemporary LLMs are vulnerable to adversarial +prompts. Furthermore, we present comprehensive analysis to understand the +mystery behind prompt robustness and its transferability. We then offer +insightful robustness analysis and pragmatic recommendations for prompt +composition, beneficial to both researchers and everyday users. We make our +code, prompts, and methodologies to generate adversarial prompts publicly +accessible, thereby enabling and encouraging collaborative exploration in this +pivotal field: https://github.com/microsoft/promptbench. + +
+
+ comment: Technical report; updated with new experiments and related work; 27 + pages; code is at: https://github.com/microsoft/promptbench +
+
+
+
+
+ + ♻ ☆ Expectation-Complete Graph Representations with Homomorphisms ICML 2023 + + +
+ We investigate novel random graph embeddings that can be computed in expected +polynomial time and that are able to distinguish all non-isomorphic graphs in +expectation. Previous graph embeddings have limited expressiveness and either +cannot distinguish all graphs or cannot be computed efficiently for every +graph. To be able to approximate arbitrary functions on graphs, we are +interested in efficient alternatives that become arbitrarily expressive with +increasing resources. Our approach is based on Lov\'asz' characterisation of +graph isomorphism through an infinite dimensional vector of homomorphism +counts. Our empirical evaluation shows competitive results on several benchmark +graph learning tasks. + +
+
+ comment: accepted for publication at ICML 2023 +
+
+
+
+
+ + ♻ ☆ Augmenting Reinforcement Learning with Transformer-based Scene + Representation Learning for Decision-making of Autonomous Driving + + +
+ Decision-making for urban autonomous driving is challenging due to the +stochastic nature of interactive traffic participants and the complexity of +road structures. Although reinforcement learning (RL)-based decision-making +scheme is promising to handle urban driving scenarios, it suffers from low +sample efficiency and poor adaptability. In this paper, we propose Scene-Rep +Transformer to improve the RL decision-making capabilities with better scene +representation encoding and sequential predictive latent distillation. +Specifically, a multi-stage Transformer (MST) encoder is constructed to model +not only the interaction awareness between the ego vehicle and its neighbors +but also intention awareness between the agents and their candidate routes. A +sequential latent Transformer (SLT) with self-supervised learning objectives is +employed to distill the future predictive information into the latent scene +representation, in order to reduce the exploration space and speed up training. +The final decision-making module based on soft actor-critic (SAC) takes as +input the refined latent scene representation from the Scene-Rep Transformer +and outputs driving actions. The framework is validated in five challenging +simulated urban scenarios with dense traffic, and its performance is manifested +quantitatively by the substantial improvements in data efficiency and +performance in terms of success rate, safety, and efficiency. The qualitative +results reveal that our framework is able to extract the intentions of neighbor +agents to help make decisions and deliver more diversified driving behaviors. + +
+
+
+
+
+ + ♻ ☆ Feature Unlearning for Pre-trained GANs and VAEs + + +
+ We tackle the problem of feature unlearning from a pre-trained image +generative model: GANs and VAEs. Unlike a common unlearning task where an +unlearning target is a subset of the training set, we aim to unlearn a specific +feature, such as hairstyle from facial images, from the pre-trained generative +models. As the target feature is only presented in a local region of an image, +unlearning the entire image from the pre-trained model may result in losing +other details in the remaining region of the image. To specify which features +to unlearn, we collect randomly generated images that contain the target +features. We then identify a latent representation corresponding to the target +feature and then use the representation to fine-tune the pre-trained model. +Through experiments on MNIST and CelebA datasets, we show that target features +are successfully removed while keeping the fidelity of the original models. +Further experiments with an adversarial attack show that the unlearned model is +more robust under the presence of malicious parties. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Manifold Linearizing and Clustering + + +
+ We consider the problem of simultaneously clustering and learning a linear +representation of data lying close to a union of low-dimensional manifolds, a +fundamental task in machine learning and computer vision. When the manifolds +are assumed to be linear subspaces, this reduces to the classical problem of +subspace clustering, which has been studied extensively over the past two +decades. Unfortunately, many real-world datasets such as natural images can not +be well approximated by linear subspaces. On the other hand, numerous works +have attempted to learn an appropriate transformation of the data, such that +data is mapped from a union of general non-linear manifolds to a union of +linear subspaces (with points from the same manifold being mapped to the same +subspace). However, many existing works have limitations such as assuming +knowledge of the membership of samples to clusters, requiring high sampling +density, or being shown theoretically to learn trivial representations. In this +paper, we propose to optimize the Maximal Coding Rate Reduction metric with +respect to both the data representation and a novel doubly stochastic cluster +membership, inspired by state-of-the-art subspace clustering results. We give a +parameterization of such a representation and membership, allowing efficient +mini-batching and one-shot initialization. Experiments on CIFAR-10, -20, -100, +and TinyImageNet-200 datasets show that the proposed method is much more +accurate and scalable than state-of-the-art deep clustering methods, and +further learns a latent linear representation of the data. + +
+
+
+
+
+ + ♻ ☆ Federated Learning in Big Model Era: Domain-Specific Multimodal Large + Models + + +
+ Multimodal data, which can comprehensively perceive and recognize the +physical world, has become an essential path towards general artificial +intelligence. However, multimodal large models trained on public datasets often +underperform in specific industrial domains. This paper proposes a multimodal +federated learning framework that enables multiple enterprises to utilize +private domain data to collaboratively train large models for vertical domains, +achieving intelligent services across scenarios. The authors discuss in-depth +the strategic transformation of federated learning in terms of intelligence +foundation and objectives in the era of big model, as well as the new +challenges faced in heterogeneous data, model aggregation, performance and cost +trade-off, data privacy, and incentive mechanism. The paper elaborates a case +study of leading enterprises contributing multimodal data and expert knowledge +to city safety operation management , including distributed deployment and +efficient coordination of the federated learning platform, technical +innovations on data quality improvement based on large model capabilities and +efficient joint fine-tuning approaches. Preliminary experiments show that +enterprises can enhance and accumulate intelligent capabilities through +multimodal model federated learning, thereby jointly creating an smart city +model that provides high-quality intelligent services covering energy +infrastructure safety, residential community security, and urban operation +management. The established federated learning cooperation ecosystem is +expected to further aggregate industry, academia, and research resources, +realize large models in multiple vertical domains, and promote the large-scale +industrial application of artificial intelligence and cutting-edge research on +multimodal federated learning. + +
+
+
+
+
+ + ♻ ☆ BallGAN: 3D-aware Image Synthesis with a Spherical Background ICCV 2023 + + +
+ 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be +rendered in arbitrary perspectives to produce images. Although previous methods +produce realistic images, they suffer from unstable training or degenerate +solutions where the 3D geometry is unnatural. We hypothesize that the 3D +geometry is underdetermined due to the insufficient constraint, i.e., being +classified as real image to the discriminator is not enough. To solve this +problem, we propose to approximate the background as a spherical surface and +represent a scene as a union of the foreground placed in the sphere and the +thin spherical background. It reduces the degree of freedom in the background +field. Accordingly, we modify the volume rendering equation and incorporate +dedicated constraints to design a novel 3D-aware GAN framework named BallGAN. +BallGAN has multiple advantages as follows. 1) It produces more reasonable 3D +geometry; the images of a scene across different viewpoints have better +photometric consistency and fidelity than the state-of-the-art methods. 2) The +training becomes much more stable. 3) The foreground can be separately rendered +on top of different arbitrary backgrounds. + +
+
+ comment: ICCV 2023, Project Page: https://minjung-s.github.io/ballgan +
+
+
+
+
+ + ♻ ☆ On Uniformly Optimal Algorithms for Best Arm Identification in Two-Armed + Bandits with Fixed Budget + + +
+ We study the problem of best-arm identification with fixed budget in +stochastic two-arm bandits with Bernoulli rewards. We prove that surprisingly, +there is no algorithm that (i) performs as well as the algorithm sampling each +arm equally (this algorithm is referred to as the {\it uniform sampling} +algorithm) on all instances, and that (ii) strictly outperforms this algorithm +on at least one instance. In short, there is no algorithm better than the +uniform sampling algorithm. Towards this result, we introduce the natural class +of {\it consistent} and {\it stable} algorithms, and show that any algorithm +that performs as well as the uniform sampling algorithm on all instances +belongs to this class. The proof is completed by deriving a lower bound on the +error rate satisfied by any consistent and stable algorithm, and by showing +that the uniform sampling algorithm matches this lower bound. Our results +provide a solution to the two open problems presented in \cite{qin2022open}. + +
+
+
+
+
+ + ♻ ☆ MoCLIM: Towards Accurate Cancer Subtyping via Multi-Omics Contrastive + Learning with Omics-Inference Modeling CIKM'23 + + +
+ Precision medicine fundamentally aims to establish causality between +dysregulated biochemical mechanisms and cancer subtypes. Omics-based cancer +subtyping has emerged as a revolutionary approach, as different level of omics +records the biochemical products of multistep processes in cancers. This paper +focuses on fully exploiting the potential of multi-omics data to improve cancer +subtyping outcomes, and hence developed MoCLIM, a representation learning +framework. MoCLIM independently extracts the informative features from distinct +omics modalities. Using a unified representation informed by contrastive +learning of different omics modalities, we can well-cluster the subtypes, given +cancer, into a lower latent space. This contrast can be interpreted as a +projection of inter-omics inference observed in biological networks. +Experimental results on six cancer datasets demonstrate that our approach +significantly improves data fit and subtyping performance in fewer +high-dimensional cancer instances. Moreover, our framework incorporates various +medical evaluations as the final component, providing high interpretability in +medical analysis. + +
+
+ comment: CIKM'23 Long/Full Papers +
+
+
+
+
+ + ♻ ☆ Synthesize High-dimensional Longitudinal Electronic Health Records via + Hierarchical Autoregressive Language Model + + +
+ Synthetic electronic health records (EHRs) that are both realistic and +preserve privacy can serve as an alternative to real EHRs for machine learning +(ML) modeling and statistical analysis. However, generating high-fidelity and +granular electronic health record (EHR) data in its original, +highly-dimensional form poses challenges for existing methods due to the +complexities inherent in high-dimensional data. In this paper, we propose +Hierarchical Autoregressive Language mOdel (HALO) for generating longitudinal +high-dimensional EHR, which preserve the statistical properties of real EHR and +can be used to train accurate ML models without privacy concerns. Our HALO +method, designed as a hierarchical autoregressive model, generates a +probability density function of medical codes, clinical visits, and patient +records, allowing for the generation of realistic EHR data in its original, +unaggregated form without the need for variable selection or aggregation. +Additionally, our model also produces high-quality continuous variables in a +longitudinal and probabilistic manner. We conducted extensive experiments and +demonstrate that HALO can generate high-fidelity EHR data with high-dimensional +disease code probabilities (d > 10,000), disease co-occurrence probabilities +within visits (d > 1,000,000), and conditional probabilities across consecutive +visits (d > 5,000,000) and achieve above 0.9 R2 correlation in comparison to +real EHR data. This performance then enables downstream ML models trained on +its synthetic data to achieve comparable accuracy to models trained on real +data (0.938 AUROC with HALO data vs. 0.943 with real data). Finally, using a +combination of real and synthetic data enhances the accuracy of ML models +beyond that achieved by using only real EHR data. + +
+
+
+
+
+ + ♻ ☆ Natural Language is All a Graph Needs + + +
+ The emergence of large-scale pre-trained language models, such as ChatGPT, +has revolutionized various research fields in artificial intelligence. +Transformers-based large language models (LLMs) have gradually replaced CNNs +and RNNs to unify fields of computer vision and natural language processing. +Compared with the data that exists relatively independently such as images, +videos or texts, graph is a type of data that contains rich structural and +relational information. Meanwhile, natural language, as one of the most +expressive mediums, excels in describing complex structures. However, existing +work on incorporating graph learning problems into the generative language +modeling framework remains very limited. As the importance of large language +models continues to grow, it becomes essential to explore whether LLMs can also +replace GNNs as the foundation model for graphs. In this paper, we propose +InstructGLM (Instruction-finetuned Graph Language Model), systematically design +highly scalable prompts based on natural language instructions, and use natural +language to describe the geometric structure and node features of the graph for +instruction tuning an LLM to perform learning and inference on graphs in a +generative manner. Our method exceeds all competitive GNN baselines on +ogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of +our method and sheds light on generative large language models as the +foundation model for graph machine learning. + +
+
+ comment: 21 pages, 2 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Adversarial Training Using Feedback Loops + + +
+ Deep neural networks (DNN) have found wide applicability in numerous fields +due to their ability to accurately learn very complex input-output relations. +Despite their accuracy and extensive use, DNNs are highly susceptible to +adversarial attacks due to limited generalizability. For future progress in the +field, it is essential to build DNNs that are robust to any kind of +perturbations to the data points. In the past, many techniques have been +proposed to robustify DNNs using first-order derivative information of the +network. + This paper proposes a new robustification approach based on control theory. A +neural network architecture that incorporates feedback control, named Feedback +Neural Networks, is proposed. The controller is itself a neural network, which +is trained using regular and adversarial data such as to stabilize the system +outputs. The novel adversarial training approach based on the feedback control +architecture is called Feedback Looped Adversarial Training (FLAT). Numerical +results on standard test problems empirically show that our FLAT method is more +effective than the state-of-the-art to guard against adversarial attacks. + +
+
+
+
+
+ + ♻ ☆ Efficient Adaptive Activation Rounding for Post-Training Quantization + + +
+ Post-training quantization attracts increasing attention due to its +convenience in deploying quantized neural networks. Although +rounding-to-nearest remains the prevailing method for DNN quantization, prior +research has demonstrated its suboptimal nature when applied to weight +quantization. They propose optimizing weight rounding schemes by leveraging +output error rather than the traditional weight quantization error. Our study +reveals that similar rounding challenges also extend to activation +quantization. Despite the easy generalization, the challenges lie in the +dynamic nature of activation. Adaptive rounding is expected for varying +activations and the method is subjected to runtime overhead. To tackle this, we +propose the AQuant quantization framework with a novel perspective to reduce +output error by adjusting rounding schemes of activations. Instead of using the +constant rounding border 0.5 of the rounding-to-nearest operation, we make the +border become a function w.r.t. the activation value to change the activation +rounding by the adaptive border. To deal with the runtime overhead, we use a +coarse-grained version of the border function. Finally, we introduce our +framework to optimize the border function. Extensive experiments show that +AQuant achieves notable improvements compared to state-of-the-art works and +pushes the accuracy of ResNet-18 up to 60.31% under the 2-bit weight and +activation quantization. + +
+
+
+
+
+ + ♻ ☆ Graph Ladling: Shockingly Simple Parallel GNN Training without + Intermediate Communication ICML 2023 + + +
+ Graphs are omnipresent and GNNs are a powerful family of neural networks for +learning over graphs. Despite their popularity, scaling GNNs either by +deepening or widening suffers from prevalent issues of unhealthy gradients, +over-smoothening, information squashing, which often lead to sub-standard +performance. In this work, we are interested in exploring a principled way to +scale GNNs capacity without deepening or widening, which can improve its +performance across multiple small and large graphs. Motivated by the recent +intriguing phenomenon of model soups, which suggest that fine-tuned weights of +multiple large-language pre-trained models can be merged to a better minima, we +argue to exploit the fundamentals of model soups to mitigate the aforementioned +issues of memory bottleneck and trainability during GNNs scaling. More +specifically, we propose not to deepen or widen current GNNs, but instead +present a data-centric perspective of model soups tailored for GNNs, i.e., to +build powerful GNNs. By dividing giant graph data, we build multiple +independently and parallelly trained weaker GNNs (soup ingredient) without any +intermediate communication, and combine their strength using a greedy +interpolation soup procedure to achieve state-of-the-art performance. Compared +to concurrent distributed GNN training works such as Jiong et. al. 2023, we +train each soup ingredient by sampling different subgraphs per epoch and their +respective sub-models are merged only after being fully trained (rather than +intermediately so). Moreover, we provide a wide variety of model soup +preparation techniques by leveraging state-of-the-art graph sampling and graph +partitioning approaches that can handle large graphs. Codes are available at: +\url{https://github.com/VITA-Group/graph_ladling}. + +
+
+ comment: Accepted in ICML 2023. Included comparison with a concurrent work + (Jiong et. al. 2023) which independently presents similar ideas, among other + SOTA distributed GNN training works +
+
+
+
+
+ + ♻ ☆ Black Box Variational Inference with a Deterministic Objective: Faster, + More Accurate, and Even More Black Box + + +
+ Automatic differentiation variational inference (ADVI) offers fast and +easy-to-use posterior approximation in multiple modern probabilistic +programming languages. However, its stochastic optimizer lacks clear +convergence criteria and requires tuning parameters. Moreover, ADVI inherits +the poor posterior uncertainty estimates of mean-field variational Bayes +(MFVB). We introduce "deterministic ADVI" (DADVI) to address these issues. +DADVI replaces the intractable MFVB objective with a fixed Monte Carlo +approximation, a technique known in the stochastic optimization literature as +the "sample average approximation" (SAA). By optimizing an approximate but +deterministic objective, DADVI can use off-the-shelf second-order optimization, +and, unlike standard mean-field ADVI, is amenable to more accurate posterior +covariances via linear response (LR). In contrast to existing worst-case +theory, we show that, on certain classes of common statistical problems, DADVI +and the SAA can perform well with relatively few samples even in very high +dimensions, though we also show that such favorable results cannot extend to +variational approximations that are too expressive relative to mean-field ADVI. +We show on a variety of real-world problems that DADVI reliably finds good +solutions with default settings (unlike ADVI) and, together with LR +covariances, is typically faster and more accurate than standard ADVI. + +
+
+
+
+
+ + ♻ ☆ On the in vivo recognition of kidney stones using machine learning + + +
+ Determining the type of kidney stones allows urologists to prescribe a +treatment to avoid recurrence of renal lithiasis. An automated in-vivo +image-based classification method would be an important step towards an +immediate identification of the kidney stone type required as a first phase of +the diagnosis. In the literature it was shown on ex-vivo data (i.e., in very +controlled scene and image acquisition conditions) that an automated kidney +stone classification is indeed feasible. This pilot study compares the kidney +stone recognition performances of six shallow machine learning methods and +three deep-learning architectures which were tested with in-vivo images of the +four most frequent urinary calculi types acquired with an endoscope during +standard ureteroscopies. This contribution details the database construction +and the design of the tested kidney stones classifiers. Even if the best +results were obtained by the Inception v3 architecture (weighted precision, +recall and F1-score of 0.97, 0.98 and 0.97, respectively), it is also shown +that choosing an appropriate colour space and texture features allows a shallow +machine learning method to approach closely the performances of the most +promising deep-learning methods (the XGBoost classifier led to weighted +precision, recall and F1-score values of 0.96). This paper is the first one +that explores the most discriminant features to be extracted from images +acquired during ureteroscopies. + +
+
+ comment: Paper submitted to IEEE Access +
+
+
+
+
+ + ♻ ☆ Rapid building damage assessment workflow: An implementation for the + 2023 Rolling Fork, Mississippi tornado event ICCV + + +
+ Rapid and accurate building damage assessments from high-resolution satellite +imagery following a natural disaster is essential to inform and optimize first +responder efforts. However, performing such building damage assessments in an +automated manner is non-trivial due to the challenges posed by variations in +disaster-specific damage, diversity in satellite imagery, and the dearth of +extensive, labeled datasets. To circumvent these issues, this paper introduces +a human-in-the-loop workflow for rapidly training building damage assessment +models after a natural disaster. This article details a case study using this +workflow, executed in partnership with the American Red Cross during a tornado +event in Rolling Fork, Mississippi in March, 2023. The output from our +human-in-the-loop modeling process achieved a precision of 0.86 and recall of +0.80 for damaged buildings when compared to ground truth data collected +post-disaster. This workflow was implemented end-to-end in under 2 hours per +satellite imagery scene, highlighting its potential for real-time deployment. + +
+
+ comment: Accepted at the 2023 ICCV Humanitarian Assistance and Disaster + Response workshop +
+
+
+
+
+ + ♻ ☆ Can Authorship Representation Learning Capture Stylistic Features? ACL 2023 + + +
+ Automatically disentangling an author's style from the content of their +writing is a longstanding and possibly insurmountable problem in computational +linguistics. At the same time, the availability of large text corpora furnished +with author labels has recently enabled learning authorship representations in +a purely data-driven manner for authorship attribution, a task that ostensibly +depends to a greater extent on encoding writing style than encoding content. +However, success on this surrogate task does not ensure that such +representations capture writing style since authorship could also be correlated +with other latent variables, such as topic. In an effort to better understand +the nature of the information these representations convey, and specifically to +validate the hypothesis that they chiefly encode writing style, we +systematically probe these representations through a series of targeted +experiments. The results of these experiments suggest that representations +learned for the surrogate authorship prediction task are indeed sensitive to +writing style. As a consequence, authorship representations may be expected to +be robust to certain kinds of data shift, such as topic drift over time. +Additionally, our findings may open the door to downstream applications that +require stylistic representations, such as style transfer. + +
+
+ comment: appearing at TACL 2023 +
+
+
+
+
+ + ♻ ☆ Coarse race data conceals disparities in clinical risk score performance + + +
+ Healthcare data in the United States often records only a patient's coarse +race group: for example, both Indian and Chinese patients are typically coded +as "Asian." It is unknown, however, whether this coarse coding conceals +meaningful disparities in the performance of clinical risk scores across +granular race groups. Here we show that it does. Using data from 418K emergency +department visits, we assess clinical risk score performance disparities across +26 granular groups for three outcomes, five risk scores, and four performance +metrics. Across outcomes and metrics, we show that the risk scores exhibit +significant granular performance disparities within coarse race groups. In +fact, variation in performance within coarse groups often *exceeds* the +variation between coarse groups. We explore why these disparities arise, +finding that outcome rates, feature distributions, and the relationships +between features and outcomes all vary significantly across granular groups. +Our results suggest that healthcare providers, hospital systems, and machine +learning researchers should strive to collect, release, and use granular race +data in place of coarse race data, and that existing analyses may significantly +underestimate racial disparities in performance. + +
+
+ comment: Published at MLHC 2023. v2 includes minor changes from the + camera-ready, such as a link to code. Code is available at + https://github.com/rmovva/granular-race-disparities_MLHC23 +
+
+
+
+
+ + ♻ ☆ Internally Rewarded Reinforcement Learning ICML 2023 + + +
+ We study a class of reinforcement learning problems where the reward signals +for policy learning are generated by an internal reward model that is dependent +on and jointly optimized with the policy. This interdependence between the +policy and the reward model leads to an unstable learning process because +reward signals from an immature reward model are noisy and impede policy +learning, and conversely, an under-optimized policy impedes reward estimation +learning. We call this learning setting $\textit{Internally Rewarded +Reinforcement Learning}$ (IRRL) as the reward is not provided directly by the +environment but $\textit{internally}$ by a reward model. In this paper, we +formally formulate IRRL and present a class of problems that belong to IRRL. We +theoretically derive and empirically analyze the effect of the reward function +in IRRL and based on these analyses propose the clipped linear reward function. +Experimental results show that the proposed reward function can consistently +stabilize the training process by reducing the impact of reward noise, which +leads to faster convergence and higher performance compared with baselines in +diverse tasks. + +
+
+ comment: Accepted at ICML 2023. Update: adopt the term "reward model" instead + of using "critic" to prevent confusion with the term "critic" in actor-critic + algorithms. Project webpage at https://ir-rl.github.io +
+
+
+
+
+ + ♻ ☆ Robust Design and Evaluation of Predictive Algorithms under Unobserved + Confounding + + +
+ Predictive algorithms inform consequential decisions in settings where the +outcome is selectively observed given some choices made by human decision +makers. There often exists unobserved confounders that affected the decision +maker's choice and the outcome. We propose a unified methodology for the robust +design and evaluation of predictive algorithms in selectively observed data +under such unobserved confounding. Our approach imposes general assumptions on +how much the outcome may vary on average between unselected and selected units +conditional on observed covariates and identified nuisance parameters, +formalizing popular empirical strategies for imputing missing data such as +proxy outcomes and instrumental variables. We develop debiased machine learning +estimators for the bounds on a large class of predictive performance estimands, +such as the conditional likelihood of the outcome, a predictive algorithm's +mean square error, true/false positive rate, and many others, under these +assumptions. In an administrative dataset from a large Australian financial +institution, we illustrate how varying assumptions on unobserved confounding +leads to meaningful changes in default risk predictions and evaluations of +credit scores across sensitive groups. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Diffusion Models + + +
+ While modern machine learning models rely on increasingly large training +datasets, data is often limited in privacy-sensitive domains. Generative models +trained with differential privacy (DP) on sensitive data can sidestep this +challenge, providing access to synthetic data instead. We build on the recent +success of diffusion models (DMs) and introduce Differentially Private +Diffusion Models (DPDMs), which enforce privacy using differentially private +stochastic gradient descent (DP-SGD). We investigate the DM parameterization +and the sampling algorithm, which turn out to be crucial ingredients in DPDMs, +and propose noise multiplicity, a powerful modification of DP-SGD tailored to +the training of DMs. We validate our novel DPDMs on image generation benchmarks +and achieve state-of-the-art performance in all experiments. Moreover, on +standard benchmarks, classifiers trained on DPDM-generated synthetic data +perform on par with task-specific DP-SGD-trained classifiers, which has not +been demonstrated before for DP generative models. Project page and code: +https://nv-tlabs.github.io/DPDM. + +
+
+ comment: Accepted at TMLR (https://openreview.net/forum?id=ZPpQk7FJXF) +
+
+
+
+
+ + ♻ ☆ Dynamics of Local Elasticity During Training of Neural Nets + + +
+ In the recent past, a property of neural training trajectories in +weight-space had been isolated, that of "local elasticity" (denoted as $S_{\rm +rel}$). Local elasticity attempts to quantify the propagation of the influence +of a sampled data point on the prediction at another data. In this work, we +embark on a comprehensive study of the existing notion of $S_{\rm rel}$ and +also propose a new definition that addresses the limitations that we point out +for the original definition in the classification setting. On various +state-of-the-art neural network training on SVHN, CIFAR-10 and CIFAR-100 we +demonstrate how our new proposal of $S_{\rm rel}$, as opposed to the original +definition, much more sharply detects the property of the weight updates +preferring to make prediction changes within the same class as the sampled +data. + In neural regression experiments we demonstrate that the original $S_{\rm +rel}$ reveals a $2-$phase behavior -- that the training proceeds via an initial +elastic phase when $S_{\rm rel}$ changes rapidly and an eventual inelastic +phase when $S_{\rm rel}$ remains large. We show that some of these properties +can be analytically reproduced in various instances of doing regression via +gradient flows on model predictor classes. + +
+
+ comment: 40 pages (single column), the experiments have been significantly + improved than the previous version +
+
+
+
+
+ + ♻ ☆ PDSketch: Integrated Planning Domain Programming and Learning NeurIPS 2022 + + +
+ This paper studies a model learning and online planning approach towards +building flexible and general robots. Specifically, we investigate how to +exploit the locality and sparsity structures in the underlying environmental +transition model to improve model generalization, data-efficiency, and +runtime-efficiency. We present a new domain definition language, named +PDSketch. It allows users to flexibly define high-level structures in the +transition models, such as object and feature dependencies, in a way similar to +how programmers use TensorFlow or PyTorch to specify kernel sizes and hidden +dimensions of a convolutional neural network. The details of the transition +model will be filled in by trainable neural networks. Based on the defined +structures and learned parameters, PDSketch automatically generates +domain-independent planning heuristics without additional training. The derived +heuristics accelerate the performance-time planning for novel goals. + +
+
+ comment: Minor typo fixes. NeurIPS 2022. Project page: + https://pdsketch.csail.mit.edu +
+
+
+
+
+ + ♻ ☆ Grammar-Based Grounded Lexicon Learning NeurIPS 2021 + + +
+ We present Grammar-Based Grounded Lexicon Learning (G2L2), a lexicalist +approach toward learning a compositional and grounded meaning representation of +language from grounded data, such as paired images and texts. At the core of +G2L2 is a collection of lexicon entries, which map each word to a tuple of a +syntactic type and a neuro-symbolic semantic program. For example, the word +shiny has a syntactic type of adjective; its neuro-symbolic semantic program +has the symbolic form {\lambda}x. filter(x, SHINY), where the concept SHINY is +associated with a neural network embedding, which will be used to classify +shiny objects. Given an input sentence, G2L2 first looks up the lexicon entries +associated with each token. It then derives the meaning of the sentence as an +executable neuro-symbolic program by composing lexical meanings based on +syntax. The recovered meaning programs can be executed on grounded inputs. To +facilitate learning in an exponentially-growing compositional space, we +introduce a joint parsing and expected execution algorithm, which does local +marginalization over derivations to reduce the training time. We evaluate G2L2 +on two domains: visual reasoning and language-driven navigation. Results show +that G2L2 can generalize from small amounts of data to novel compositions of +words. + +
+
+ comment: Minor typo fixes. NeurIPS 2021. Project page: + https://g2l2.csail.mit.edu/ +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Can Linguistic Knowledge Improve Multimodal Alignment in Vision-Language + Pretraining? + + +
+ The multimedia community has shown a significant interest in perceiving and +representing the physical world with multimodal pretrained neural network +models, and among them, the visual-language pertaining (VLP) is, currently, the +most captivating topic. However, there have been few endeavors dedicated to the +exploration of 1) whether essential linguistic knowledge (e.g., semantics and +syntax) can be extracted during VLP, and 2) how such linguistic knowledge +impact or enhance the multimodal alignment. In response, here we aim to +elucidate the impact of comprehensive linguistic knowledge, including semantic +expression and syntactic structure, on multimodal alignment. Specifically, we +design and release the SNARE, the first large-scale multimodal alignment +probing benchmark, to detect the vital linguistic components, e.g., lexical, +semantic, and syntax knowledge, containing four tasks: Semantic structure, +Negation logic, Attribute ownership, and Relationship composition. Based on our +proposed probing benchmarks, our holistic analyses of five advanced VLP models +illustrate that the VLP model: i) shows insensitivity towards complex syntax +structures and relies on content words for sentence comprehension; ii) +demonstrates limited comprehension of combinations between sentences and +negations; iii) faces challenges in determining the presence of actions or +spatial relationships within visual information and struggles with verifying +the correctness of triple combinations. We make our benchmark and code +available at \url{https://github.com/WangFei-2019/SNARE/}. + +
+
+
+
+
+ + ☆ Masked Feature Modelling: Feature Masking for the Unsupervised + Pre-training of a Graph Attention Network Block for Bottom-up Video Event + Recognition + + +
+ In this paper, we introduce Masked Feature Modelling (MFM), a novel approach +for the unsupervised pre-training of a Graph Attention Network (GAT) block. MFM +utilizes a pretrained Visual Tokenizer to reconstruct masked features of +objects within a video, leveraging the MiniKinetics dataset. We then +incorporate the pre-trained GAT block into a state-of-the-art bottom-up +supervised video-event recognition architecture, ViGAT, to improve the model's +starting point and overall accuracy. Experimental evaluations on the YLI-MED +dataset demonstrate the effectiveness of MFM in improving event recognition +performance. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Exploring Transferability of Multimodal Adversarial Samples for + Vision-Language Pre-training Models with Contrastive Learning + + +
+ Vision-language pre-training models (VLP) are vulnerable, especially to +multimodal adversarial samples, which can be crafted by adding imperceptible +perturbations on both original images and texts. However, under the black-box +setting, there have been no works to explore the transferability of multimodal +adversarial attacks against the VLP models. In this work, we take CLIP as the +surrogate model and propose a gradient-based multimodal attack method to +generate transferable adversarial samples against the VLP models. By applying +the gradient to optimize the adversarial images and adversarial texts +simultaneously, our method can better search for and attack the vulnerable +images and text information pairs. To improve the transferability of the +attack, we utilize contrastive learning including image-text contrastive +learning and intra-modal contrastive learning to have a more generalized +understanding of the underlying data distribution and mitigate the overfitting +of the surrogate model so that the generated multimodal adversarial samples +have a higher transferability for VLP models. Extensive experiments validate +the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ Emotion-Aligned Contrastive Learning Between Images and Music + + +
+ Traditional music search engines rely on retrieval methods that match natural +language queries with music metadata. There have been increasing efforts to +expand retrieval methods to consider the audio characteristics of music itself, +using queries of various modalities including text, video, and speech. Most +approaches aim to match general music semantics to the input queries, while +only a few focus on affective qualities. We address the task of retrieving +emotionally-relevant music from image queries by proposing a framework for +learning an affective alignment between images and music audio. Our approach +focuses on learning an emotion-aligned joint embedding space between images and +music. This joint embedding space is learned via emotion-supervised contrastive +learning, using an adapted cross-modal version of the SupCon loss. We directly +evaluate the joint embeddings with cross-modal retrieval tasks (image-to-music +and music-to-image) based on emotion labels. In addition, we investigate the +generalizability of the learned music embeddings with automatic music tagging +as a downstream task. Our experiments show that our approach successfully +aligns images and music, and that the learned embedding space is effective for +cross-modal retrieval applications. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Spherical Vision Transformer for 360-degree Video Saliency Prediction BMVC 2023 + + +
+ The growing interest in omnidirectional videos (ODVs) that capture the full +field-of-view (FOV) has gained 360-degree saliency prediction importance in +computer vision. However, predicting where humans look in 360-degree scenes +presents unique challenges, including spherical distortion, high resolution, +and limited labelled data. We propose a novel vision-transformer-based model +for omnidirectional videos named SalViT360 that leverages tangent image +representations. We introduce a spherical geometry-aware spatiotemporal +self-attention mechanism that is capable of effective omnidirectional video +understanding. Furthermore, we present a consistency-based unsupervised +regularization term for projection-based 360-degree dense-prediction models to +reduce artefacts in the predictions that occur after inverse projection. Our +approach is the first to employ tangent images for omnidirectional saliency +prediction, and our experimental results on three ODV saliency datasets +demonstrate its effectiveness compared to the state-of-the-art. + +
+
+ comment: 12 pages, 4 figures, accepted to BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Flow-Guided Controllable Line Drawing Generation + + +
+ In this paper, we investigate the problem of automatically controllable +artistic character line drawing generation from photographs by proposing a +Vector Flow Aware and Line Controllable Image-to-Image Translation +architecture, which can be viewed as an appealing intersection between +Artificial Intelligence and Arts. Specifically, we first present an +Image-to-Flow network (I2FNet) to efficiently and robustly create the vector +flow field in a learning-based manner, which can provide a direction guide for +drawing lines. Then, we introduce our well-designed Double Flow Generator (DFG) +framework to fuse features from learned vector flow and input image flow +guaranteeing the spatial coherence of lines. Meanwhile, in order to allow for +controllable character line drawing generation, we integrate a Line Control +Matrix (LCM) into DFG and train a Line Control Regressor (LCR) to synthesize +drawings with different styles by elaborately controlling the level of details, +such as thickness, smoothness, and continuity, of lines. Finally, we design a +Fourier Transformation Loss to further constrain the character line generation +from the frequency domain view of the point. Quantitative and qualitative +experiments demonstrate that our approach can obtain superior performance in +producing high-resolution character line-drawing images with perceptually +realistic characteristics. + +
+
+
+
+
+ + ♻ ☆ Structure-CLIP: Towards Scene Graph Knowledge to Enhance Multi-modal + Structured Representations + + +
+ Large-scale vision-language pre-training has achieved significant performance +in multi-modal understanding and generation tasks. However, existing methods +often perform poorly on image-text matching tasks that require structured +representations, i.e., representations of objects, attributes, and relations. +Previous models cannot make a distinction between ``An astronaut rides a horse" +and ``A horse rides an astronaut". This is because they fail to fully leverage +structured knowledge when learning representations in multi-modal scenarios. In +this paper, we present an end-to-end framework Structure-CLIP, which integrates +Scene Graph Knowledge (SGK) to enhance multi-modal structured representations. +Firstly, we use scene graphs to guide the construction of semantic negative +examples, which results in an increased emphasis on learning structured +representations. Moreover, a Knowledge-Enhance Encoder (KEE) is proposed to +leverage SGK as input to further enhance structured representations. To verify +the effectiveness of the proposed framework, we pre-train our model with the +aforementioned approaches and conduct experiments on downstream tasks. +Experimental results demonstrate that Structure-CLIP achieves state-of-the-art +(SOTA) performance on VG-Attribution and VG-Relation datasets, with 12.5% and +4.1% ahead of the multi-modal SOTA model respectively. Meanwhile, the results +on MSCOCO indicate that Structure-CLIP significantly enhances the structured +representations while maintaining the ability of general representations. Our +code will be available soon. + +
+
+ comment: Version 2.0. Improve grammar and experiments +
+
+
+
+
+ + ♻ ☆ Explainable Multimodal Emotion Reasoning + + +
+ Multimodal emotion recognition is an active research topic in artificial +intelligence. Its primary objective is to integrate multi-modalities (such as +acoustic, visual, and lexical clues) to identify human emotional states. +Current works generally assume accurate emotion labels for benchmark datasets +and focus on developing more effective architectures. But due to the inherent +subjectivity of emotions, existing datasets often lack high annotation +consistency, resulting in potentially inaccurate labels. Consequently, models +built on these datasets may struggle to meet the demands of practical +applications. To address this issue, it is crucial to enhance the reliability +of emotion annotations. In this paper, we propose a novel task called +``\textbf{Explainable Multimodal Emotion Reasoning (EMER)}''. In contrast to +previous works that primarily focus on predicting emotions, EMER takes a step +further by providing explanations for these predictions. The prediction is +considered correct as long as the reasoning process behind the predicted +emotion is plausible. This paper presents our initial efforts on EMER, where we +introduce a benchmark dataset, establish baseline models, and define evaluation +metrics. Meanwhile, we observe the necessity of integrating multi-faceted +capabilities to deal with EMER. Therefore, we propose the first multimodal +large language model (LLM) in affective computing, called \textbf{AffectGPT}. +We aim to tackle the long-standing challenge of label ambiguity and chart a +path toward more reliable techniques. Furthermore, EMER offers an opportunity +to evaluate the audio-video-text understanding capabilities of recent +multimodal LLM. To facilitate further research, we make the code and data +available at: https://github.com/zeroQiaoba/AffectGPT. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 56 + +
+
+
+ + ☆ D4: Improving LLM Pretraining via Document De-Duplication and + Diversification + + +
+ Over recent years, an increasing amount of compute and data has been poured +into training large language models (LLMs), usually by doing one-pass learning +on as many tokens as possible randomly selected from large-scale web corpora. +While training on ever-larger portions of the internet leads to consistent +performance improvements, the size of these improvements diminishes with scale, +and there has been little work exploring the effect of data selection on +pre-training and downstream performance beyond simple de-duplication methods +such as MinHash. Here, we show that careful data selection (on top of +de-duplicated data) via pre-trained model embeddings can speed up training (20% +efficiency gains) and improves average downstream accuracy on 16 NLP tasks (up +to 2%) at the 6.7B model scale. Furthermore, we show that repeating data +intelligently consistently outperforms baseline training (while repeating +random data performs worse than baseline training). Our results indicate that +clever data selection can significantly improve LLM pre-training, calls into +question the common practice of training for a single epoch on as much data as +possible, and demonstrates a path to keep improving our models past the limits +of randomly sampling web data. + +
+
+
+
+
+ + ☆ Simple is Better and Large is Not Enough: Towards Ensembling of + Foundational Language Models SC + + +
+ Foundational Language Models (FLMs) have advanced natural language processing +(NLP) research. Current researchers are developing larger FLMs (e.g., XLNet, +T5) to enable contextualized language representation, classification, and +generation. While developing larger FLMs has been of significant advantage, it +is also a liability concerning hallucination and predictive uncertainty. +Fundamentally, larger FLMs are built on the same foundations as smaller FLMs +(e.g., BERT); hence, one must recognize the potential of smaller FLMs which can +be realized through an ensemble. In the current research, we perform a reality +check on FLMs and their ensemble on benchmark and real-world datasets. We +hypothesize that the ensembling of FLMs can influence the individualistic +attention of FLMs and unravel the strength of coordination and cooperation of +different FLMs. We utilize BERT and define three other ensemble techniques: +{Shallow, Semi, and Deep}, wherein the Deep-Ensemble introduces a +knowledge-guided reinforcement learning approach. We discovered that the +suggested Deep-Ensemble BERT outperforms its large variation i.e. BERTlarge, by +a factor of many times using datasets that show the usefulness of NLP in +sensitive fields, such as mental health. + +
+
+ comment: Accepted at the 10th Mid-Atlantic Student Colloquium on Speech, + Language and Learning (MASC-SLL 2023) +
+
+
+
+
+ + ☆ Prompt2Model: Generating Deployable Models from Natural Language + Instructions + + +
+ Large language models (LLMs) enable system builders today to create competent +NLP systems through prompting, where they only need to describe the task in +natural language and provide a few examples. However, in other ways, LLMs are a +step backward from traditional special-purpose NLP models; they require +extensive computational resources for deployment and can be gated behind APIs. +In this paper, we propose Prompt2Model, a general-purpose method that takes a +natural language task description like the prompts provided to LLMs, and uses +it to train a special-purpose model that is conducive to deployment. This is +done through a multi-step process of retrieval of existing datasets and +pretrained models, dataset generation using LLMs, and supervised fine-tuning on +these retrieved and generated datasets. Over three tasks, we demonstrate that +given the same few-shot prompt as input, Prompt2Model trains models that +outperform the results of a strong LLM, gpt-3.5-turbo, by an average of 20% +while being up to 700 times smaller. We also show that this data can be used to +obtain reliable performance estimates of model performance, enabling model +developers to assess model reliability before deployment. Prompt2Model is +available open-source at https://github.com/neulab/prompt2model. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ How to Protect Copyright Data in Optimization of Large Language Models? + + +
+ Large language models (LLMs) and generative AI have played a transformative +role in computer research and applications. Controversy has arisen as to +whether these models output copyrighted data, which can occur if the data the +models are trained on is copyrighted. LLMs are built on the transformer neural +network architecture, which in turn relies on a mathematical computation called +Attention that uses the softmax function. + In this paper, we show that large language model training and optimization +can be seen as a softmax regression problem. We then establish a method of +efficiently performing softmax regression, in a way that prevents the +regression function from generating copyright data. This establishes a +theoretical method of training large language models in a way that avoids +generating copyright data. + +
+
+
+
+
+ + ☆ Diffusion Language Models Can Perform Many Tasks with Scaling and + Instruction-Finetuning + + +
+ The recent surge of generative AI has been fueled by the generative power of +diffusion probabilistic models and the scalable capabilities of large language +models. Despite their potential, it remains elusive whether diffusion language +models can solve general language tasks comparable to their autoregressive +counterparts. This paper demonstrates that scaling diffusion models w.r.t. +data, sizes, and tasks can effectively make them strong language learners. We +build competent diffusion language models at scale by first acquiring knowledge +from massive data via masked language modeling pretraining thanks to their +intrinsic connections. We then reprogram pretrained masked language models into +diffusion language models via diffusive adaptation, wherein task-specific +finetuning and instruction finetuning are explored to unlock their versatility +in solving general language tasks. Experiments show that scaling diffusion +language models consistently improves performance across downstream language +tasks. We further discover that instruction finetuning can elicit zero-shot and +few-shot in-context learning abilities that help tackle many unseen tasks by +following natural language instructions, and show promise in advanced and +challenging abilities such as reasoning + +
+
+
+
+
+ + ☆ The Challenges of Machine Learning for Trust and Safety: A Case Study on + Misinformation Detection + + +
+ We examine the disconnect between scholarship and practice in applying +machine learning to trust and safety problems, using misinformation detection +as a case study. We systematize literature on automated detection of +misinformation across a corpus of 270 well-cited papers in the field. We then +examine subsets of papers for data and code availability, design missteps, +reproducibility, and generalizability. We find significant shortcomings in the +literature that call into question claimed performance and practicality. +Detection tasks are often meaningfully distinct from the challenges that online +services actually face. Datasets and model evaluation are often +non-representative of real-world contexts, and evaluation frequently is not +independent of model training. Data and code availability is poor. Models do +not generalize well to out-of-domain data. Based on these results, we offer +recommendations for evaluating machine learning applications to trust and +safety problems. Our aim is for future work to avoid the pitfalls that we +identify. + +
+
+
+
+
+ + ☆ Curriculum Learning with Adam: The Devil Is in the Wrong Details + + +
+ Curriculum learning (CL) posits that machine learning models -- similar to +humans -- may learn more efficiently from data that match their current +learning progress. However, CL methods are still poorly understood and, in +particular for natural language processing (NLP), have achieved only limited +success. In this paper, we explore why. Starting from an attempt to replicate +and extend a number of recent curriculum methods, we find that their results +are surprisingly brittle when applied to NLP. A deep dive into the +(in)effectiveness of the curricula in some scenarios shows us why: when +curricula are employed in combination with the popular Adam optimisation +algorithm, they oftentimes learn to adapt to suboptimally chosen optimisation +parameters for this algorithm. We present a number of different case studies +with different common hand-crafted and automated CL approaches to illustrate +this phenomenon, and we find that none of them outperforms optimisation with +only Adam with well-chosen hyperparameters. As such, our results contribute to +understanding why CL methods work, but at the same time urge caution when +claiming positive results. + +
+
+
+
+
+ + ☆ Evaluation of Faithfulness Using the Longest Supported Subsequence + + +
+ As increasingly sophisticated language models emerge, their trustworthiness +becomes a pivotal issue, especially in tasks such as summarization and +question-answering. Ensuring their responses are contextually grounded and +faithful is challenging due to the linguistic diversity and the myriad of +possible answers. In this paper, we introduce a novel approach to evaluate +faithfulness of machine-generated text by computing the longest noncontinuous +substring of the claim that is supported by the context, which we refer to as +the Longest Supported Subsequence (LSS). Using a new human-annotated dataset, +we finetune a model to generate LSS. We introduce a new method of evaluation +and demonstrate that these metrics correlate better with human ratings when LSS +is employed, as opposed to when it is not. Our proposed metric demonstrates an +18% enhancement over the prevailing state-of-the-art metric for faithfulness on +our dataset. Our metric consistently outperforms other metrics on a +summarization dataset across six different models. Finally, we compare several +popular Large Language Models (LLMs) for faithfulness using this metric. We +release the human-annotated dataset built for predicting LSS and our fine-tuned +model for evaluating faithfulness. + +
+
+
+
+
+ + ☆ Semantic Change Detection for the Romanian Language + + +
+ Automatic semantic change methods try to identify the changes that appear +over time in the meaning of words by analyzing their usage in diachronic +corpora. In this paper, we analyze different strategies to create static and +contextual word embedding models, i.e., Word2Vec and ELMo, on real-world +English and Romanian datasets. To test our pipeline and determine the +performance of our models, we first evaluate both word embedding models on an +English dataset (SEMEVAL-CCOHA). Afterward, we focus our experiments on a +Romanian dataset, and we underline different aspects of semantic changes in +this low-resource language, such as meaning acquisition and loss. The +experimental results show that, depending on the corpus, the most important +factors to consider are the choice of model and the distance to calculate a +score for detecting semantic change. + +
+
+
+
+
+ + ☆ Instruction Position Matters in Sequence Generation with Large Language + Models + + +
+ Large language models (LLMs) are capable of performing conditional sequence +generation tasks, such as translation or summarization, through instruction +fine-tuning. The fine-tuning data is generally sequentially concatenated from a +specific task instruction, an input sentence, and the corresponding response. +Considering the locality modeled by the self-attention mechanism of LLMs, these +models face the risk of instruction forgetting when generating responses for +long input sentences. To mitigate this issue, we propose enhancing the +instruction-following capability of LLMs by shifting the position of task +instructions after the input sentences. Theoretical analysis suggests that our +straightforward method can alter the model's learning focus, thereby +emphasizing the training of instruction-following capabilities. Concurrently, +experimental results demonstrate that our approach consistently outperforms +traditional settings across various model scales (1B / 7B / 13B) and different +sequence generation tasks (translation and summarization), without any +additional data or annotation costs. Notably, our method significantly improves +the zero-shot performance on conditional sequence generation, e.g., up to 9.7 +BLEU points on WMT zero-shot translation tasks. + +
+
+ comment: Codes and results are at + https://github.com/Adaxry/Post-Instruction/tree/main +
+
+
+
+
+ + ☆ Out of the Cage: How Stochastic Parrots Win in Cyber Security + Environments + + +
+ Large Language Models (LLMs) have gained widespread popularity across diverse +domains involving text generation, summarization, and various natural language +processing tasks. Despite their inherent limitations, LLM-based designs have +shown promising capabilities in planning and navigating open-world scenarios. +This paper introduces a novel application of pre-trained LLMs as agents within +cybersecurity network environments, focusing on their utility for sequential +decision-making processes. + We present an approach wherein pre-trained LLMs are leveraged as attacking +agents in two reinforcement learning environments. Our proposed agents +demonstrate similar or better performance against state-of-the-art agents +trained for thousands of episodes in most scenarios and configurations. In +addition, the best LLM agents perform similarly to human testers of the +environment without any additional training process. This design highlights the +potential of LLMs to efficiently address complex decision-making tasks within +cybersecurity. + Furthermore, we introduce a new network security environment named +NetSecGame. The environment is designed to eventually support complex +multi-agent scenarios within the network security domain. The proposed +environment mimics real network attacks and is designed to be highly modular +and adaptable for various scenarios. + +
+
+ comment: Under review. 10 pages plus appendices, 7 figures, 4 tables +
+
+
+
+
+ + ☆ InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4 + + +
+ Multimodal large language models acquire their instruction-following +capabilities through a two-stage training process: pre-training on image-text +pairs and fine-tuning on supervised vision-language instruction data. Recent +studies have shown that large language models can achieve satisfactory results +even with a limited amount of high-quality instruction-following data. In this +paper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset +comprising only 200 examples, amounting to approximately 6% of the +instruction-following data used in the alignment dataset for MiniGPT-4. We +first propose several metrics to access the quality of multimodal instruction +data. Based on these metrics, we present a simple and effective data selector +to automatically identify and filter low-quality vision-language data. By +employing this method, InstructionGPT-4 outperforms the original MiniGPT-4 on +various evaluations (e.g., visual question answering, GPT-4 preference). +Overall, our findings demonstrate that less but high-quality instruction tuning +data is efficient to enable multimodal large language models to generate better +output. + +
+
+
+
+
+ + ☆ FlexKBQA: A Flexible LLM-Powered Framework for Few-Shot Knowledge Base + Question Answering + + +
+ Knowledge base question answering (KBQA) is a critical yet challenging task +due to the vast number of entities within knowledge bases and the diversity of +natural language questions posed by users. Unfortunately, the performance of +most KBQA models tends to decline significantly in real-world scenarios where +high-quality annotated data is insufficient. To mitigate the burden associated +with manual annotation, we introduce FlexKBQA by utilizing Large Language +Models (LLMs) as program translators for addressing the challenges inherent in +the few-shot KBQA task. Specifically, FlexKBQA leverages automated algorithms +to sample diverse programs, such as SPARQL queries, from the knowledge base, +which are subsequently converted into natural language questions via LLMs. This +synthetic dataset facilitates training a specialized lightweight model for the +KB. Additionally, to reduce the barriers of distribution shift between +synthetic data and real user questions, FlexKBQA introduces an executionguided +self-training method to iterative leverage unlabeled user questions. +Furthermore, we explore harnessing the inherent reasoning capability of LLMs to +enhance the entire framework. Consequently, FlexKBQA delivers substantial +flexibility, encompassing data annotation, deployment, and being domain +agnostic. Through extensive experiments on GrailQA, WebQSP, and KQA Pro, we +observe that under the few-shot even the more challenging zero-shot scenarios, +FlexKBQA achieves impressive results with a few annotations, surpassing all +previous baselines and even approaching the performance of supervised models, +achieving a remarkable 93% performance relative to the fully-supervised models. +We posit that FlexKBQA represents a significant advancement towards exploring +better integration of large and lightweight models. The code is open-sourced. + +
+
+
+
+
+ + ☆ Aligning Language Models with Offline Reinforcement Learning from Human + Feedback + + +
+ Learning from human preferences is crucial for language models (LMs) to +effectively cater to human needs and societal values. Previous research has +made notable progress by leveraging human feedback to follow instructions. +However, these approaches rely primarily on online reinforcement learning (RL) +techniques like Proximal Policy Optimization (PPO), which have been proven +unstable and challenging to tune for language models. Moreover, PPO requires +complex distributed system implementation, hindering the efficiency of +large-scale distributed training. In this study, we propose an offline +reinforcement learning from human feedback (RLHF) framework to align LMs using +pre-generated samples without interacting with RL environments. Specifically, +we explore maximum likelihood estimation (MLE) with filtering, reward-weighted +regression (RWR), and Decision Transformer (DT) to align language models to +human preferences. By employing a loss function similar to supervised +fine-tuning, our methods ensure more stable model training than PPO with a +simple machine learning system~(MLSys) and much fewer (around 12.3\%) computing +resources. Experimental results demonstrate the DT alignment outperforms other +Offline RLHF methods and is better than PPO. + +
+
+
+
+
+ + ☆ CgT-GAN: CLIP-guided Text GAN for Image Captioning ACM MM 2023 + + +
+ The large-scale visual-language pre-trained model, Contrastive Language-Image +Pre-training (CLIP), has significantly improved image captioning for scenarios +without human-annotated image-caption pairs. Recent advanced CLIP-based image +captioning without human annotations follows a text-only training paradigm, +i.e., reconstructing text from shared embedding space. Nevertheless, these +approaches are limited by the training/inference gap or huge storage +requirements for text embeddings. Given that it is trivial to obtain images in +the real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates +images into the training process to enable the model to "see" real visual +modality. Particularly, we use adversarial training to teach CgT-GAN to mimic +the phrases of an external text corpus and CLIP-based reward to provide +semantic guidance. The caption generator is jointly rewarded based on the +caption naturalness to human language calculated from the GAN's discriminator +and the semantic guidance reward computed by the CLIP-based reward module. In +addition to the cosine similarity as the semantic guidance reward (i.e., +CLIP-cos), we further introduce a novel semantic guidance reward called +CLIP-agg, which aligns the generated caption with a weighted text embedding by +attentively aggregating the entire corpus. Experimental results on three +subtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms +state-of-the-art methods significantly across all metrics. Code is available at +https://github.com/Lihr747/CgtGAN. + +
+
+ comment: Accepted at ACM MM 2023 +
+
+
+
+
+ + ☆ IncreLoRA: Incremental Parameter Allocation Method for + Parameter-Efficient Fine-tuning + + +
+ With the increasing size of pre-trained language models (PLMs), fine-tuning +all the parameters in the model is not efficient, especially when there are a +large number of downstream tasks, which incur significant training and storage +costs. Many parameter-efficient fine-tuning (PEFT) approaches have been +proposed, among which, Low-Rank Adaptation (LoRA) is a representative approach +that injects trainable rank decomposition matrices into every target module. +Yet LoRA ignores the importance of parameters in different modules. To address +this problem, many works have been proposed to prune the parameters of LoRA. +However, under limited training conditions, the upper bound of the rank of the +pruned parameter matrix is still affected by the preset values. We, therefore, +propose IncreLoRA, an incremental parameter allocation method that adaptively +adds trainable parameters during training based on the importance scores of +each module. This approach is different from the pruning method as it is not +limited by the initial number of training parameters, and each parameter matrix +has a higher rank upper bound for the same training overhead. We conduct +extensive experiments on GLUE to demonstrate the effectiveness of IncreLoRA. +The results show that our method owns higher parameter efficiency, especially +when under the low-resource settings where our method significantly outperforms +the baselines. Our code is publicly available. + +
+
+
+
+
+ + ☆ Hybrid Retrieval and Multi-stage Text Ranking Solution at TREC 2022 Deep + Learning Track + + +
+ Large-scale text retrieval technology has been widely used in various +practical business scenarios. This paper presents our systems for the TREC 2022 +Deep Learning Track. We explain the hybrid text retrieval and multi-stage text +ranking method adopted in our solution. The retrieval stage combined the two +structures of traditional sparse retrieval and neural dense retrieval. In the +ranking stage, in addition to the full interaction-based ranking model built on +large pre-trained language model, we also proposes a lightweight sub-ranking +module to further enhance the final text ranking performance. Evaluation +results demonstrate the effectiveness of our proposed approach. Our models +achieve the 1st and 4th rank on the test set of passage ranking and document +ranking respectively. + +
+
+ comment: TREC 2022 Deep Learning Track +
+
+
+
+
+ + ☆ Large Multilingual Models Pivot Zero-Shot Multimodal Learning across + Languages + + +
+ Recently there has been a significant surge in multimodal learning in terms +of both image-to-text and text-to-image generation. However, the success is +typically limited to English, leaving other languages largely behind. Building +a competitive counterpart in other languages is highly challenging due to the +low-resource nature of non-English multimodal data (i.e., lack of large-scale, +high-quality image-text data). In this work, we propose MPM, an effective +training paradigm for training large multimodal models in low-resource +languages. MPM demonstrates that Multilingual language models can Pivot +zero-shot Multimodal learning across languages. Specifically, based on a strong +multilingual large language model, multimodal models pretrained on English-only +image-text data can well generalize to other languages in a zero-shot manner +for both image-to-text and text-to-image generation, even surpassing models +trained on image-text data in native languages. Taking Chinese as a practice of +MPM, we build large multimodal models VisCPM in image-to-text and text-to-image +generation, which achieve state-of-the-art (open-source) performance in +Chinese. To facilitate future research, we open-source codes and model weights +at https://github.com/OpenBMB/VisCPM.git. + +
+
+ comment: https://github.com/OpenBMB/VisCPM.git +
+
+
+
+
+ + ☆ PREFER: Prompt Ensemble Learning via Feedback-Reflect-Refine + + +
+ As an effective tool for eliciting the power of Large Language Models (LLMs), +prompting has recently demonstrated unprecedented abilities across a variety of +complex tasks. To further improve the performance, prompt ensemble has +attracted substantial interest for tackling the hallucination and instability +of LLMs. However, existing methods usually adopt a two-stage paradigm, which +requires a pre-prepared set of prompts with substantial manual effort, and is +unable to perform directed optimization for different weak learners. In this +paper, we propose a simple, universal, and automatic method named PREFER (Pompt +Ensemble learning via Feedback-Reflect-Refine) to address the stated +limitations. Specifically, given the fact that weak learners are supposed to +focus on hard examples during boosting, PREFER builds a feedback mechanism for +reflecting on the inadequacies of existing weak learners. Based on this, the +LLM is required to automatically synthesize new prompts for iterative +refinement. Moreover, to enhance stability of the prompt effect evaluation, we +propose a novel prompt bagging method involving forward and backward thinking, +which is superior to majority voting and is beneficial for both feedback and +weight calculation in boosting. Extensive experiments demonstrate that our +PREFER achieves state-of-the-art performance in multiple types of tasks by a +significant margin. We have made our code publicly available. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ From Quantity to Quality: Boosting LLM Performance with Self-Guided Data + Selection for Instruction Tuning + + +
+ In the realm of Large Language Models, the balance between instruction data +quality and quantity has become a focal point. Recognizing this, we introduce a +self-guided methodology for LLMs to autonomously discern and select cherry +samples from vast open-source datasets, effectively minimizing manual curation +and potential cost for instruction tuning an LLM. Our key innovation, the +Instruction-Following Difficulty (IFD) metric, emerges as a pivotal tool to +identify discrepancies between a model's expected responses and its autonomous +generation prowess. Through the adept application of IFD, cherry samples are +pinpointed, leading to a marked uptick in model training efficiency. Empirical +validations on renowned datasets like Alpaca and WizardLM underpin our +findings; with a mere 10% of conventional data input, our strategy showcases +improved results. This synthesis of self-guided cherry-picking and the IFD +metric signifies a transformative leap in the optimization of LLMs, promising +both efficiency and resource-conscious advancements. + +
+
+
+
+
+ + ☆ Prompt-Based Length Controlled Generation with Reinforcement Learning + + +
+ Recently, large language models (LLMs) like ChatGPT and GPT-4 have attracted +great attention given their surprising improvement and performance. Length +controlled generation of LLMs emerges as an important topic, which also enables +users to fully leverage the capability of LLMs in more real-world scenarios +like generating a proper answer or essay of a desired length. In addition, the +autoregressive generation in LLMs is extremely time-consuming, while the +ability of controlling this generated length can arbitrarily reduce the +inference cost by limiting the length, and thus satisfy different needs. +Therefore, we aim to propose a prompt-based length control method to achieve +this length controlled generation, which can also be widely applied in +GPT-style LLMs. In particular, we adopt reinforcement learning with the reward +signal given by either trainable or rule-based reward model, which further +affects the generation of LLMs via rewarding a pre-defined target length. +Experiments show that our method significantly improves the accuracy of +prompt-based length control for summarization task on popular datasets like +CNNDM and NYT. We believe this length-controllable ability can provide more +potentials towards the era of LLMs. + +
+
+
+
+
+ + ☆ Knowledge-injected Prompt Learning for Chinese Biomedical Entity + Normalization + + +
+ The Biomedical Entity Normalization (BEN) task aims to align raw, +unstructured medical entities to standard entities, thus promoting data +coherence and facilitating better downstream medical applications. Recently, +prompt learning methods have shown promising results in this task. However, +existing research falls short in tackling the more complex Chinese BEN task, +especially in the few-shot scenario with limited medical data, and the vast +potential of the external medical knowledge base has yet to be fully harnessed. +To address these challenges, we propose a novel Knowledge-injected Prompt +Learning (PL-Knowledge) method. Specifically, our approach consists of five +stages: candidate entity matching, knowledge extraction, knowledge encoding, +knowledge injection, and prediction output. By effectively encoding the +knowledge items contained in medical entities and incorporating them into our +tailor-made knowledge-injected templates, the additional knowledge enhances the +model's ability to capture latent relationships between medical entities, thus +achieving a better match with the standard entities. We extensively evaluate +our model on a benchmark dataset in both few-shot and full-scale scenarios. Our +method outperforms existing baselines, with an average accuracy boost of +12.96\% in few-shot and 0.94\% in full-data cases, showcasing its excellence in +the BEN task. + +
+
+
+
+
+ + ☆ Reranking Passages with Coarse-to-Fine Neural Retriever using + List-Context Information + + +
+ Passage reranking is a crucial task in many applications, particularly when +dealing with large-scale documents. Traditional neural architectures are +limited in retrieving the best passage for a question because they usually +match the question to each passage separately, seldom considering contextual +information in other passages that can provide comparison and reference +information. This paper presents a list-context attention mechanism to augment +the passage representation by incorporating the list-context information from +other candidates. The proposed coarse-to-fine (C2F) neural retriever addresses +the out-of-memory limitation of the passage attention mechanism by dividing the +list-context modeling process into two sub-processes, allowing for efficient +encoding of context information from a large number of candidate answers. This +method can be generally used to encode context information from any number of +candidate answers in one pass. Different from most multi-stage information +retrieval architectures, this model integrates the coarse and fine rankers into +the joint optimization process, allowing for feedback between the two layers to +update the model simultaneously. Experiments demonstrate the effectiveness of +the proposed approach. + +
+
+
+
+
+ + ☆ From Instructions to Intrinsic Human Values -- A Survey of Alignment + Goals for Big Models + + +
+ Big models, exemplified by Large Language Models (LLMs), are models typically +pre-trained on massive data and comprised of enormous parameters, which not +only obtain significantly improved performance across diverse tasks but also +present emergent capabilities absent in smaller models. However, the growing +intertwining of big models with everyday human lives poses potential risks and +might cause serious social harm. Therefore, many efforts have been made to +align LLMs with humans to make them better follow user instructions and satisfy +human preferences. Nevertheless, `what to align with' has not been fully +discussed, and inappropriate alignment goals might even backfire. In this +paper, we conduct a comprehensive survey of different alignment goals in +existing work and trace their evolution paths to help identify the most +essential goal. Particularly, we investigate related works from two +perspectives: the definition of alignment goals and alignment evaluation. Our +analysis encompasses three distinct levels of alignment goals and reveals a +goal transformation from fundamental abilities to value orientation, indicating +the potential of intrinsic human values as the alignment goal for enhanced +LLMs. Based on such results, we further discuss the challenges of achieving +such intrinsic value alignment and provide a collection of available resources +for future research on the alignment of big models. + +
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ Graecia capta ferum victorem cepit. Detecting Latin Allusions to Ancient + Greek Literature + + +
+ Intertextual allusions hold a pivotal role in Classical Philology, with Latin +authors frequently referencing Ancient Greek texts. Until now, the automatic +identification of these intertextual references has been constrained to +monolingual approaches, seeking parallels solely within Latin or Greek texts. +In this study, we introduce SPhilBERTa, a trilingual Sentence-RoBERTa model +tailored for Classical Philology, which excels at cross-lingual semantic +comprehension and identification of identical sentences across Ancient Greek, +Latin, and English. We generate new training data by automatically translating +English texts into Ancient Greek. Further, we present a case study, +demonstrating SPhilBERTa's capability to facilitate automated detection of +intertextual parallels. Our models and resources are available at +https://github.com/Heidelberg-NLP/ancient-language-models. + +
+
+ comment: Paper accepted for publication at the First Workshop on Ancient + Language Processing (ALP) 2023; 9 pages, 5 tables +
+
+
+
+
+ + ☆ Topical-Chat: Towards Knowledge-Grounded Open-Domain Conversations INTERSPEECH 2019 + + +
+ Building socialbots that can have deep, engaging open-domain conversations +with humans is one of the grand challenges of artificial intelligence (AI). To +this end, bots need to be able to leverage world knowledge spanning several +domains effectively when conversing with humans who have their own world +knowledge. Existing knowledge-grounded conversation datasets are primarily +stylized with explicit roles for conversation partners. These datasets also do +not explore depth or breadth of topical coverage with transitions in +conversations. We introduce Topical-Chat, a knowledge-grounded human-human +conversation dataset where the underlying knowledge spans 8 broad topics and +conversation partners don't have explicitly defined roles, to help further +research in open-domain conversational AI. We also train several +state-of-the-art encoder-decoder conversational models on Topical-Chat and +perform automated and human evaluation for benchmarking. + +
+
+ comment: arXiving an old paper accepted at INTERSPEECH 2019 +
+
+
+
+
+ + ☆ EVE: Efficient Vision-Language Pre-training with Masked Prediction and + Modality-Aware MoE + + +
+ Building scalable vision-language models to learn from diverse, multimodal +data remains an open challenge. In this paper, we introduce an Efficient +Vision-languagE foundation model, namely EVE, which is one unified multimodal +Transformer pre-trained solely by one unified pre-training task. Specifically, +EVE encodes both vision and language within a shared Transformer network +integrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which +capture modality-specific information by selectively switching to different +experts. To unify pre-training tasks of vision and language, EVE performs +masked signal modeling on image-text pairs to reconstruct masked signals, i.e., +image pixels and text tokens, given visible signals. This simple yet effective +pre-training objective accelerates training by 3.5x compared to the model +pre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing +to the combination of the unified architecture and pre-training task, EVE is +easy to scale up, enabling better downstream performance with fewer resources +and faster training speed. Despite its simplicity, EVE achieves +state-of-the-art performance on various vision-language downstream tasks, +including visual question answering, visual reasoning, and image-text +retrieval. + +
+
+
+
+
+ + ☆ Audio Generation with Multiple Conditional Diffusion Model AAAI 2024 + + +
+ Text-based audio generation models have limitations as they cannot encompass +all the information in audio, leading to restricted controllability when +relying solely on text. To address this issue, we propose a novel model that +enhances the controllability of existing pre-trained text-to-audio models by +incorporating additional conditions including content (timestamp) and style +(pitch contour and energy contour) as supplements to the text. This approach +achieves fine-grained control over the temporal order, pitch, and energy of +generated audio. To preserve the diversity of generation, we employ a trainable +control condition encoder that is enhanced by a large language model and a +trainable Fusion-Net to encode and fuse the additional conditions while keeping +the weights of the pre-trained text-to-audio model frozen. Due to the lack of +suitable datasets and evaluation metrics, we consolidate existing datasets into +a new dataset comprising the audio and corresponding conditions and use a +series of evaluation metrics to evaluate the controllability performance. +Experimental results demonstrate that our model successfully achieves +fine-grained control to accomplish controllable audio generation. Audio samples +and our dataset are publicly available at +https://conditionaudiogen.github.io/conditionaudiogen/ + +
+
+ comment: Submitted to AAAI 2024 +
+
+
+
+
+ + ☆ Audio Difference Captioning Utilizing Similarity-Discrepancy + Disentanglement + + +
+ We proposed Audio Difference Captioning (ADC) as a new extension task of +audio captioning for describing the semantic differences between input pairs of +similar but slightly different audio clips. The ADC solves the problem that +conventional audio captioning sometimes generates similar captions for similar +audio clips, failing to describe the difference in content. We also propose a +cross-attention-concentrated transformer encoder to extract differences by +comparing a pair of audio clips and a similarity-discrepancy disentanglement to +emphasize the difference in the latent space. To evaluate the proposed methods, +we built an AudioDiffCaps dataset consisting of pairs of similar but slightly +different audio clips with human-annotated descriptions of their differences. +The experiment with the AudioDiffCaps dataset showed that the proposed methods +solve the ADC task effectively and improve the attention weights to extract the +difference by visualizing them in the transformer encoder. + +
+
+ comment: Accepted to DCASE2023 Workshop +
+
+
+
+
+ + ☆ Bridging the Gap: Deciphering Tabular Data Using Large Language Model + + +
+ In the realm of natural language processing, the understanding of tabular +data has perpetually stood as a focal point of scholarly inquiry. The emergence +of expansive language models, exemplified by the likes of ChatGPT, has ushered +in a wave of endeavors wherein researchers aim to harness these models for +tasks related to table-based question answering. Central to our investigative +pursuits is the elucidation of methodologies that amplify the aptitude of such +large language models in discerning both the structural intricacies and +inherent content of tables, ultimately facilitating their capacity to provide +informed responses to pertinent queries. To this end, we have architected a +distinctive module dedicated to the serialization of tables for seamless +integration with expansive language models. Additionally, we've instituted a +corrective mechanism within the model to rectify potential inaccuracies. +Experimental results indicate that, although our proposed method trails the +SOTA by approximately 11.7% in overall metrics, it surpasses the SOTA by about +1.2% in tests on specific datasets. This research marks the first application +of large language models to table-based question answering tasks, enhancing the +model's comprehension of both table structures and content. + +
+
+
+
+
+ + ☆ Cabrita: closing the gap for foreign languages + + +
+ The strategy of training the model from scratch in a specific language or +domain serves two essential purposes: i) enhancing performance in the +particular linguistic or domain context, and ii) ensuring effective +tokenization. The main limitation inherent to this approach lies in the +associated cost, which can reach six to seven-digit dollar values, depending on +the model size and the number of parameters involved. + The main solution to overcome the cost challenge is to rely on available +pre-trained models, which, despite recent advancements such as the LLaMA and +LLaMA-2 models, still demonstrate inefficiency for certain specific domain +problems or prove ineffective in scenarios involving conversational memory +resources, given the large number of tokens required to represent text. + To overcome this issue, we present a methodology named Cabrita, which, as our +research demonstrates, successfully addresses the performance and efficient +tokenization problem, all at an affordable cost. We believe that this +methodology can be applied to any transformer-like architecture model. To +validate the study, we conducted continuous pre-training exclusively using +Portuguese text on a 3-billion-parameter model known as OpenLLaMA, resulting in +a model named openCabrita 3B. The openCabrita 3B also features a new tokenizer +that results in a significant reduction in the number of tokens required to +represent the text. In our assessment, for few-shot learning tasks, we achieved +similar results with this 3B model compared to a traditional continuous +pre-training approach as well as to 7B models English pre-trained models. + +
+
+ comment: 9 pages, 1 figure +
+
+
+
+
+ + ☆ Are ChatGPT and GPT-4 Good Poker Players? -- A Pre-Flop Analysis + + +
+ Since the introduction of ChatGPT and GPT-4, these models have been tested +across a large number of tasks. Their adeptness across domains is evident, but +their aptitude in playing games and specifically their aptitude in the realm of +poker has remained unexplored. Poker is a game that requires decision making +under uncertainty and incomplete information. In this paper, we put ChatGPT and +GPT-4 through the poker test and evaluate their poker skills. Our findings +reveal that while both models display an advanced understanding of poker, +encompassing concepts like the valuation of starting hands, playing positions +and other intricacies of game theory optimal (GTO) poker, both ChatGPT and +GPT-4 are NOT game theory optimal poker players. + Through a series of experiments, we first discover the characteristics of +optimal prompts and model parameters for playing poker with these models. Our +observations then unveil the distinct playing personas of the two models. We +first conclude that GPT-4 is a more advanced poker player than ChatGPT. This +exploration then sheds light on the divergent poker tactics of the two models: +ChatGPT's conservativeness juxtaposed against GPT-4's aggression. In poker +vernacular, when tasked to play GTO poker, ChatGPT plays like a Nit, which +means that it has a propensity to only engage with premium hands and folds a +majority of hands. When subjected to the same directive, GPT-4 plays like a +maniac, showcasing a loose and aggressive style of play. Both strategies, +although relatively advanced, are not game theory optimal. + +
+
+
+
+
+ + ☆ Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature + + +
+ Distributed Ledger Technologies (DLTs) have rapidly evolved, necessitating +comprehensive insights into their diverse components. However, a systematic +literature review that emphasizes the Environmental, Sustainability, and +Governance (ESG) components of DLT remains lacking. To bridge this gap, we +selected 107 seed papers to build a citation network of 63,083 references and +refined it to a corpus of 24,539 publications for analysis. Then, we labeled +the named entities in 46 papers according to twelve top-level categories +derived from an established technology taxonomy and enhanced the taxonomy by +pinpointing DLT's ESG elements. Leveraging transformer-based language models, +we fine-tuned a pre-trained language model for a Named Entity Recognition (NER) +task using our labeled dataset. We used our fine-tuned language model to +distill the corpus to 505 key papers, facilitating a literature review via +named entities and temporal graph analysis on DLT evolution in the context of +ESG. Our contributions are a methodology to conduct a machine learning-driven +systematic literature review in the DLT field, placing a special emphasis on +ESG aspects. Furthermore, we present a first-of-its-kind NER dataset, composed +of 54,808 named entities, designed for DLT and ESG-related explorations. + +
+
+
+
+
+ + ☆ Toward American Sign Language Processing in the Real World: Data, Tasks, + and Methods + + +
+ Sign language, which conveys meaning through gestures, is the chief means of +communication among deaf people. Recognizing sign language in natural settings +presents significant challenges due to factors such as lighting, background +clutter, and variations in signer characteristics. In this thesis, I study +automatic sign language processing in the wild, using signing videos collected +from the Internet. This thesis contributes new datasets, tasks, and methods. +Most chapters of this thesis address tasks related to fingerspelling, an +important component of sign language and yet has not been studied widely by +prior work. I present three new large-scale ASL datasets in the wild: +ChicagoFSWild, ChicagoFSWild+, and OpenASL. Using ChicagoFSWild and +ChicagoFSWild+, I address fingerspelling recognition, which consists of +transcribing fingerspelling sequences into text. I propose an end-to-end +approach based on iterative attention that allows recognition from a raw video +without explicit hand detection. I further show that using a Conformer-based +network jointly modeling handshape and mouthing can bring performance close to +that of humans. Next, I propose two tasks for building real-world +fingerspelling-based applications: fingerspelling detection and search. For +fingerspelling detection, I introduce a suite of evaluation metrics and a new +detection model via multi-task training. To address the problem of searching +for fingerspelled keywords in raw sign language videos, we propose a novel +method that jointly localizes and matches fingerspelling segments to text. +Finally, I will describe a benchmark for large-vocabulary open-domain sign +language translation based on OpenASL. To address the challenges of sign +language translation in realistic settings, we propose a set of techniques +including sign search as a pretext task for pre-training and fusion of mouthing +and handshape features. + +
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ With a Little Help from your own Past: Prototypical Memory Networks for + Image Captioning ICCV 2023 + + +
+ Image captioning, like many tasks involving vision and language, currently +relies on Transformer-based architectures for extracting the semantics in an +image and translating it into linguistically coherent descriptions. Although +successful, the attention operator only considers a weighted summation of +projections of the current input sample, therefore ignoring the relevant +semantic information which can come from the joint observation of other +samples. In this paper, we devise a network which can perform attention over +activations obtained while processing other training samples, through a +prototypical memory model. Our memory models the distribution of past keys and +values through the definition of prototype vectors which are both +discriminative and compact. Experimentally, we assess the performance of the +proposed model on the COCO dataset, in comparison with carefully designed +baselines and state-of-the-art approaches, and by investigating the role of +each of the proposed components. We demonstrate that our proposal can increase +the performance of an encoder-decoder Transformer by 3.7 CIDEr points both when +training in cross-entropy only and when fine-tuning with self-critical sequence +training. Source code and trained models are available at: +https://github.com/aimagelab/PMA-Net. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Vision Transformer Adapters for Generalizable Multitask Learning ICCV 2023 + + +
+ We introduce the first multitasking vision transformer adapters that learn +generalizable task affinities which can be applied to novel tasks and domains. +Integrated into an off-the-shelf vision transformer backbone, our adapters can +simultaneously solve multiple dense vision tasks in a parameter-efficient +manner, unlike existing multitasking transformers that are parametrically +expensive. In contrast to concurrent methods, we do not require retraining or +fine-tuning whenever a new task or domain is added. We introduce a task-adapted +attention mechanism within our adapter framework that combines gradient-based +task similarities with attention-based ones. The learned task affinities +generalize to the following settings: zero-shot task transfer, unsupervised +domain adaptation, and generalization without fine-tuning to novel domains. We +demonstrate that our approach outperforms not only the existing convolutional +neural network-based multitasking methods but also the vision transformer-based +ones. Our project page is at \url{https://ivrl.github.io/VTAGML}. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Diagnosing Infeasible Optimization Problems Using Large Language Models + + +
+ Decision-making problems can be represented as mathematical optimization +models, finding wide applications in fields such as economics, engineering and +manufacturing, transportation, and health care. Optimization models are +mathematical abstractions of the problem of making the best decision while +satisfying a set of requirements or constraints. One of the primary barriers to +deploying these models in practice is the challenge of helping practitioners +understand and interpret such models, particularly when they are infeasible, +meaning no decision satisfies all the constraints. Existing methods for +diagnosing infeasible optimization models often rely on expert systems, +necessitating significant background knowledge in optimization. In this paper, +we introduce OptiChat, a first-of-its-kind natural language-based system +equipped with a chatbot GUI for engaging in interactive conversations about +infeasible optimization models. OptiChat can provide natural language +descriptions of the optimization model itself, identify potential sources of +infeasibility, and offer suggestions to make the model feasible. The +implementation of OptiChat is built on GPT-4, which interfaces with an +optimization solver to identify the minimal subset of constraints that render +the entire optimization problem infeasible, also known as the Irreducible +Infeasible Subset (IIS). We utilize few-shot learning, expert chain-of-thought, +key-retrieve, and sentiment prompts to enhance OptiChat's reliability. Our +experiments demonstrate that OptiChat assists both expert and non-expert users +in improving their understanding of the optimization models, enabling them to +quickly identify the sources of infeasibility. + +
+
+
+
+
+ + ♻ ☆ Tryage: Real-time, intelligent Routing of User Prompts to Large Language + Models + + +
+ The introduction of the transformer architecture and the self-attention +mechanism has led to an explosive production of language models trained on +specific downstream tasks and data domains. With over 200, 000 models in the +Hugging Face ecosystem, users grapple with selecting and optimizing models to +suit multifaceted workflows and data domains while addressing computational, +security, and recency concerns. There is an urgent need for machine learning +frameworks that can eliminate the burden of model selection and customization +and unleash the incredible power of the vast emerging model library for end +users. Here, we propose a context-aware routing system, Tryage, that leverages +a language model router for optimal selection of expert models from a model +library based on analysis of individual input prompts. Inspired by the thalamic +router in the brain, Tryage employs a perceptive router to predict down-stream +model performance on prompts and, then, makes a routing decision using an +objective function that integrates performance predictions with user goals and +constraints that are incorporated through flags (e.g., model size, model +recency). Tryage allows users to explore a Pareto front and automatically +trade-off between task accuracy and secondary goals including minimization of +model size, recency, security, verbosity, and readability. Across heterogeneous +data sets that include code, text, clinical data, and patents, the Tryage +framework surpasses Gorilla and GPT3.5 turbo in dynamic model selection +identifying the optimal model with an accuracy of 50.9% , compared to 23.6% by +GPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how +routing models can be applied to program and control the behavior of +multi-model LLM systems to maximize efficient use of the expanding and evolving +language model ecosystem. + +
+
+
+
+
+ + ♻ ☆ How Good Are Large Language Models at Out-of-Distribution Detection? + + +
+ Out-of-distribution (OOD) detection plays a vital role in enhancing the +reliability of machine learning (ML) models. The emergence of large language +models (LLMs) has catalyzed a paradigm shift within the ML community, +showcasing their exceptional capabilities across diverse natural language +processing tasks. While existing research has probed OOD detection with +relative small-scale Transformers like BERT, RoBERTa and GPT-2, the stark +differences in scales, pre-training objectives, and inference paradigms call +into question the applicability of these findings to LLMs. This paper embarks +on a pioneering empirical investigation of OOD detection in the domain of LLMs, +focusing on LLaMA series ranging from 7B to 65B in size. We thoroughly evaluate +commonly-used OOD detectors, scrutinizing their performance in both zero-grad +and fine-tuning scenarios. Notably, we alter previous discriminative +in-distribution fine-tuning into generative fine-tuning, aligning the +pre-training objective of LLMs with downstream tasks. Our findings unveil that +a simple cosine distance OOD detector demonstrates superior efficacy, +outperforming other OOD detectors. We provide an intriguing explanation for +this phenomenon by highlighting the isotropic nature of the embedding spaces of +LLMs, which distinctly contrasts with the anisotropic property observed in +smaller BERT family models. The new insight enhances our understanding of how +LLMs detect OOD data, thereby enhancing their adaptability and reliability in +dynamic environments. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Low-Resource Authorship Style Transfer: Can Non-Famous Authors Be + Imitated? + + +
+ Authorship style transfer involves altering text to match the style of a +target author whilst preserving the original meaning. Existing unsupervised +approaches like STRAP have largely focused on style transfer to target authors +with many examples of their writing style in books, speeches, or other +published works. This high-resource training data requirement (often greater +than 100,000 words) makes these approaches primarily useful for style transfer +to published authors, politicians, or other well-known figures and authorship +styles, while style transfer to non-famous authors has not been well-studied. +We introduce the \textit{low-resource authorship style transfer} task, a more +challenging class of authorship style transfer where only a limited amount of +text in the target author's style may exist. In our experiments, we +specifically choose source and target authors from Reddit and style transfer +their Reddit posts, limiting ourselves to just 16 posts (on average ~500 words) +of the target author's style. Style transfer accuracy is typically measured by +how often a classifier or human judge will classify an output as written by the +target author. Recent authorship representations models excel at authorship +identification even with just a few writing samples, making automatic +evaluation of this task possible for the first time through evaluation metrics +we propose. Our results establish an in-context learning technique we develop +as the strongest baseline, though we find current approaches do not yet achieve +mastery of this challenging task. We release our data and implementations to +encourage further investigation. + +
+
+
+
+
+ + ♻ ☆ Large Language Model as a User Simulator + + +
+ The unparalleled performance of closed-sourced ChatGPT has sparked efforts +towards its democratization, with notable strides made by leveraging real user +and ChatGPT conversations, as evidenced by Vicuna. However, while current +endeavors like Baize and UltraChat aim to auto-generate conversational data due +to challenges in gathering human participation, they primarily rely on ChatGPT +to simulate human behaviors based on directives rather than genuine human +learning. This results in a limited scope, diminished diversity, and an absence +of genuine multi-round conversational dynamics. To address the above issues, we +innovatively target human questions extracted from genuine human-machine +conversations as a learning goal and train a user simulator, UserGPT, to +produce a high-quality human-centric synthetic conversation dataset, RealChat. +Subsequently, this dataset trains our assistant model, ReaLM. Experimentally, +ReaLM outpaces baseline models in both Vicuna-Bench and MT-Bench by pairwise +comparison when considering equivalent training set sizes, and manual +evaluation also shows that our model is highly competitive. Impressively, when +fine-tuned with the latest LLaMA 2 model, ReaLM secured a leading score of 6.33 +in the MT-Bench, outshining the contemporary same-scale models, including the +LLaMA-2-7B-chat model. Further in-depth analysis demonstrates the scalability +and transferability of our approach. A preliminary exploration into the +interplay between training set data quality and resultant model performance is +also undertaken, laying a robust groundwork for future investigations. The code +is available at https://github.com/FreedomIntelligence/ReaLM. + +
+
+
+
+
+ + ♻ ☆ Domain Specific Question Answering Over Knowledge Graphs Using Logical + Programming and Large Language Models + + +
+ Answering questions over domain-specific graphs requires a tailored approach +due to the limited number of relations and the specific nature of the domain. +Our approach integrates classic logical programming languages into large +language models (LLMs), enabling the utilization of logical reasoning +capabilities to tackle the KGQA task. By representing the questions as Prolog +queries, which are readable and near close to natural language in +representation, we facilitate the generation of programmatically derived +answers. To validate the effectiveness of our approach, we evaluate it using a +well-known benchmark dataset, MetaQA. Our experimental results demonstrate that +our method achieves accurate identification of correct answer entities for all +test questions, even when trained on a small fraction of annotated data. +Overall, our work presents a promising approach to addressing question +answering over domain-specific graphs, offering an explainable and robust +solution by incorporating logical programming languages. + +
+
+
+
+
+ + ♻ ☆ Exploring the Landscape of Natural Language Processing Research + + +
+ As an efficient approach to understand, generate, and process natural +language texts, research in natural language processing (NLP) has exhibited a +rapid spread and wide adoption in recent years. Given the increasing research +work in this area, several NLP-related approaches have been surveyed in the +research community. However, a comprehensive study that categorizes established +topics, identifies trends, and outlines areas for future research remains +absent. Contributing to closing this gap, we have systematically classified and +analyzed research papers in the ACL Anthology. As a result, we present a +structured overview of the research landscape, provide a taxonomy of fields of +study in NLP, analyze recent developments in NLP, summarize our findings, and +highlight directions for future work. + +
+
+ comment: Extended version of the paper published at the 14th International + Conference on Recent Advances in Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ Comparison of Machine Learning Methods for Assigning Software Issues to + Team Members + + +
+ Software issues contain units of work to fix, improve, or create new threads +during the development and facilitate communication among the team members. +Assigning an issue to the most relevant team member and determining a category +of an issue is a tedious and challenging task. Wrong classifications cause +delays and rework in the project and trouble among the team members. This paper +proposes a set of carefully curated linguistic features for shallow machine +learning methods and compares the performance of shallow and ensemble methods +with deep language models. Unlike the state-of-the-art, we assign issues to +four roles (designer, developer, tester, and leader) rather than to specific +individuals or teams to contribute to the generality of our solution. We also +consider the level of experience of the developers to reflect the industrial +practices in our solution formulation. We collect and annotate five industrial +data sets from one of the top three global television producers to evaluate our +proposal and compare it with deep language models. Our data sets contain 5324 +issues in total. We show that an ensemble classifier of shallow techniques +achieves 0.92 for issue assignment in accuracy which is statistically +comparable to the state-of-the-art deep language models. The contributions +include the public sharing of five annotated industrial issue data sets, the +development of a clear and comprehensive feature set, the introduction of a +novel label set, and the validation of the efficacy of an ensemble classifier +of shallow machine learning techniques. + +
+
+
+
+
+ + ♻ ☆ BAN-PL: a Novel Polish Dataset of Banned Harmful and Offensive Content + from Wykop.pl web service + + +
+ Advances in automated detection of offensive language online, including hate +speech and cyberbullying, require improved access to publicly available +datasets comprising social media content. In this paper, we introduce BAN-PL, +the first open dataset in the Polish language that encompasses texts flagged as +harmful and subsequently removed by professional moderators. The dataset +encompasses a total of 691,662 pieces of content from a popular social +networking service, Wykop, often referred to as the "Polish Reddit", including +both posts and comments, and is evenly distributed into two distinct classes: +"harmful" and "neutral". We provide a comprehensive description of the data +collection and preprocessing procedures, as well as highlight the linguistic +specificity of the data. The BAN-PL dataset, along with advanced preprocessing +scripts for, i.a., unmasking profanities, will be publicly available. + +
+
+
+
+
+ + ♻ ☆ SONAR: Sentence-Level Multimodal and Language-Agnostic Representations + + +
+ We introduce SONAR, a new multilingual and multimodal fixed-size sentence +embedding space. Our single text encoder, covering 200 languages, substantially +outperforms existing sentence embeddings such as LASER3 and LabSE on the xsim +and xsim++ multilingual similarity search tasks. Speech segments can be +embedded in the same SONAR embedding space using language-specific speech +encoders trained in a teacher-student setting on speech transcription data. Our +encoders outperform existing speech encoders on similarity search tasks. We +also provide a text decoder for 200 languages, which allows us to perform +text-to-text and speech-to-text machine translation, including for zero-shot +language and modality combinations. Our text-to-text results are competitive +compared to the state-of-the-art NLLB~1B model, despite the fixed-size +bottleneck representation. Our zero-shot speech-to-text translation results +compare favorably with strong supervised baselines such as Whisper. + +
+
+
+
+
+ + ♻ ☆ Forward-Backward Reasoning in Large Language Models for Verification + + +
+ Chain-of-Though (CoT) prompting has shown promising performance in various +reasoning tasks. Recently, Self-Consistency \citep{wang2023selfconsistency} +proposes to sample a diverse set of reasoning chains which may lead to +different answers while the answer that receives the most votes is selected. In +this paper, we propose a novel method to use backward reasoning in verifying +candidate answers. We mask a token in the question by ${\bf x}$ and ask the LLM +to predict the masked token when a candidate answer is provided by \textit{a +simple template}, i.e., "\textit{\textbf{If we know the answer of the above +question is \{a candidate answer\}, what is the value of unknown variable ${\bf +x}$?}}" Intuitively, the LLM is expected to predict the masked token +successfully if the provided candidate answer is correct. We further propose +FOBAR to combine forward and backward reasoning for estimating the probability +of candidate answers. We conduct extensive experiments on six data sets and +three LLMs. Experimental results demonstrate that FOBAR achieves +state-of-the-art performance on various reasoning benchmarks. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Self-consistency for open-ended generations + + +
+ Large Language Models (LLMs) can exhibit considerable variation in the +quality of their sampled outputs. Reranking and selecting the best generation +from the sampled set is a popular way of obtaining strong gains in generation +quality. In this paper, we present a novel approach for reranking LLM +generations. Unlike other techniques that might involve additional inferences +or training a specialized reranker, our approach relies on easy to compute +pairwise statistics between the generations that have minimal compute overhead. +We show that our approach can be formalized as an extension of self-consistency +and analyze its performance in that framework, theoretically as well as via +simulations. We show strong improvements for selecting the best $k$ generations +for code generation tasks as well as robust improvements for best generation +for the tasks of autoformalization, and summarization. While our approach only +assumes black-box access to LLMs, we show that additional access to token +probabilities can improve performance even further. + +
+
+
+
+
+ + ♻ ☆ Making first order linear logic a generating grammar + + +
+ It is known that different categorial grammars have surface representation in +a fragment of first order multiplicative linear logic (MLL1). We show that the +fragment of interest is equivalent to the recently introduced extended tensor +type calculus (ETTC). ETTC is a calculus of specific typed terms, which +represent tuples of strings, more precisely bipartite graphs decorated with +strings. Types are derived from linear logic formulas, and rules correspond to +concrete operations on these string-labeled graphs, so that they can be +conveniently visualized. This provides the above mentioned fragment of MLL1 +that is relevant for language modeling not only with some alternative syntax +and intuitive geometric representation, but also with an intrinsic deductive +system, which has been absent. + In this work we consider a non-trivial notationally enriched variation of the +previously introduced {\bf ETTC}, which allows more concise and transparent +computations. We present both a cut-free sequent calculus and a natural +deduction formalism. + +
+
+ comment: Revised and extended version with detailed proofs. arXiv admin note: + substantial text overlap with arXiv:2112.15253 +
+
+
+
+
+ + ♻ ☆ A Structured Span Selector NAACL 2022 + + +
+ Many natural language processing tasks, e.g., coreference resolution and +semantic role labeling, require selecting text spans and making decisions about +them. A typical approach to such tasks is to score all possible spans and +greedily select spans for task-specific downstream processing. This approach, +however, does not incorporate any inductive bias about what sort of spans ought +to be selected, e.g., that selected spans tend to be syntactic constituents. In +this paper, we propose a novel grammar-based structured span selection model +which learns to make use of the partial span-level annotation provided for such +problems. Compared to previous approaches, our approach gets rid of the +heuristic greedy span selection scheme, allowing us to model the downstream +task on an optimal set of spans. We evaluate our model on two popular span +prediction tasks: coreference resolution and semantic role labeling. We show +empirical improvements on both. + +
+
+ comment: NAACL 2022 camera-ready +
+
+
+
+
+ + ♻ ☆ Chain-of-Thought Prompt Distillation for Multimodal Named Entity + Recognition and Multimodal Relation Extraction + + +
+ Multimodal Named Entity Recognition (MNER) and Multimodal Relation Extraction +(MRE) necessitate the fundamental reasoning capacity for intricate linguistic +and multimodal comprehension. In this study, we explore distilling the +reasoning ability of large language models (LLMs) into a more compact student +model by generating a \textit{chain of thought} (CoT) -- a sequence of +intermediate reasoning steps. Specifically, we commence by exemplifying the +elicitation of such reasoning ability from LLMs through CoT prompts covering +multi-grain (noun, sentence, multimodality) and data-augmentation (style, +entity, image) dimensions. Subsequently, we present a novel conditional prompt +distillation method to assimilate the commonsense reasoning ability from LLMs, +thereby enhancing the utility of the student model in addressing text-only +inputs without the requisite addition of image and CoT knowledge. Extensive +experiments reveal that our approach attains state-of-the-art accuracy and +manifests a plethora of advantages concerning interpretability, data +efficiency, and cross-domain generalization on MNER and MRE datasets. + +
+
+ comment: modification +
+
+
+
+
+ + ♻ ☆ MemoChat: Tuning LLMs to Use Memos for Consistent Long-Range Open-Domain + Conversation + + +
+ We propose MemoChat, a pipeline for refining instructions that enables large +language models (LLMs) to effectively employ self-composed memos for +maintaining consistent long-range open-domain conversations. We demonstrate a +long-range open-domain conversation through iterative +"memorization-retrieval-response" cycles. This requires us to carefully design +tailored tuning instructions for each distinct stage. The instructions are +reconstructed from a collection of public datasets to teach the LLMs to +memorize and retrieve past dialogues with structured memos, leading to enhanced +consistency when participating in future conversations. We invite experts to +manually annotate a test set designed to evaluate the consistency of long-range +conversations questions. Experiments on three testing scenarios involving both +open-source and API-accessible chatbots at scale verify the efficacy of +MemoChat, which outperforms strong baselines. Our codes, data and models are +available here: https://github.com/LuJunru/MemoChat. + +
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge +generative models and have sparked a revolutionary impact on various aspects of +human life. However, the practical implementation of these models has also +exposed inherent risks, highlighting their dual nature and raising concerns +regarding their trustworthiness. Despite the abundance of literature on this +subject, a comprehensive survey specifically delving into the intersection of +large-scale generative models and their trustworthiness remains largely absent. +To bridge this gap, This paper investigates both the long-standing and emerging +threats associated with these models across four fundamental dimensions: +privacy, security, fairness, and responsibility. In this way, we construct an +extensive map outlining the trustworthiness of these models, while also +providing practical recommendations and identifying future directions. These +efforts are crucial for promoting the trustworthy deployment of these models, +ultimately benefiting society as a whole. + +
+
+ comment: Draft Version +
+
+
+
+
+ + ♻ ☆ NLP as a Lens for Causal Analysis and Perception Mining to Infer Mental + Health on Social Media + + +
+ Interactions among humans on social media often convey intentions behind +their actions, yielding a psychological language resource for Mental Health +Analysis (MHA) of online users. The success of Computational Intelligence +Techniques (CIT) for inferring mental illness from such social media resources +points to NLP as a lens for causal analysis and perception mining. However, we +argue that more consequential and explainable research is required for optimal +impact on clinical psychology practice and personalized mental healthcare. To +bridge this gap, we posit two significant dimensions: (1) Causal analysis to +illustrate a cause and effect relationship in the user generated text; (2) +Perception mining to infer psychological perspectives of social effects on +online users intentions. Within the scope of Natural Language Processing (NLP), +we further explore critical areas of inquiry associated with these two +dimensions, specifically through recent advancements in discourse analysis. +This position paper guides the community to explore solutions in this space and +advance the state of practice in developing conversational agents for inferring +mental health from social media. We advocate for a more explainable approach +toward modeling computational psychology problems through the lens of language +as we observe an increased number of research contributions in dataset and +problem formulation for causal relation extraction and perception enhancements +while inferring mental states. + +
+
+
+
+
+ + ♻ ☆ A Human-on-the-Loop Optimization Autoformalism Approach for + Sustainability + + +
+ This paper outlines a natural conversational approach to solving personalized +energy-related problems using large language models (LLMs). We focus on +customizable optimization problems that necessitate repeated solving with +slight variations in modeling and are user-specific, hence posing a challenge +to devising a one-size-fits-all model. We put forward a strategy that augments +an LLM with an optimization solver, enhancing its proficiency in understanding +and responding to user specifications and preferences while providing nonlinear +reasoning capabilities. Our approach pioneers the novel concept of human-guided +optimization autoformalism, translating a natural language task specification +automatically into an optimization instance. This enables LLMs to analyze, +explain, and tackle a variety of instance-specific energy-related problems, +pushing beyond the limits of current prompt-based techniques. + Our research encompasses various commonplace tasks in the energy sector, from +electric vehicle charging and Heating, Ventilation, and Air Conditioning (HVAC) +control to long-term planning problems such as cost-benefit evaluations for +installing rooftop solar photovoltaics (PVs) or heat pumps. This pilot study +marks an essential stride towards the context-based formulation of optimization +using LLMs, with the potential to democratize optimization processes. As a +result, stakeholders are empowered to optimize their energy consumption, +promoting sustainable energy practices customized to personal needs and +preferences. + +
+
+
+
+
+ + ♻ ☆ SeamlessM4T-Massively Multilingual & Multimodal Machine Translation + + +
+ What does it take to create the Babel Fish, a tool that can help individuals +translate speech between any two languages? While recent breakthroughs in +text-based models have pushed machine translation coverage beyond 200 +languages, unified speech-to-speech translation models have yet to achieve +similar strides. More specifically, conventional speech-to-speech translation +systems rely on cascaded systems that perform translation progressively, +putting high-performing unified systems out of reach. To address these gaps, we +introduce SeamlessM4T, a single model that supports speech-to-speech +translation, speech-to-text translation, text-to-speech translation, +text-to-text translation, and automatic speech recognition for up to 100 +languages. To build this, we used 1 million hours of open speech audio data to +learn self-supervised speech representations with w2v-BERT 2.0. Subsequently, +we created a multimodal corpus of automatically aligned speech translations. +Filtered and combined with human-labeled and pseudo-labeled data, we developed +the first multilingual system capable of translating from and into English for +both speech and text. On FLEURS, SeamlessM4T sets a new standard for +translations into multiple target languages, achieving an improvement of 20% +BLEU over the previous SOTA in direct speech-to-text translation. Compared to +strong cascaded models, SeamlessM4T improves the quality of into-English +translation by 1.3 BLEU points in speech-to-text and by 2.6 ASR-BLEU points in +speech-to-speech. Tested for robustness, our system performs better against +background noises and speaker variations in speech-to-text tasks compared to +the current SOTA model. Critically, we evaluated SeamlessM4T on gender bias and +added toxicity to assess translation safety. Finally, all contributions in this +work are open-sourced and accessible at +https://github.com/facebookresearch/seamless_communication + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 135 + +
+
+
+ + ☆ CHORUS: Learning Canonicalized 3D Human-Object Spatial Relations from + Unbounded Synthesized Images ICCV 2023 + + +
+ We present a method for teaching machines to understand and model the +underlying spatial common sense of diverse human-object interactions in 3D in a +self-supervised way. This is a challenging task, as there exist specific +manifolds of the interactions that can be considered human-like and natural, +but the human pose and the geometry of objects can vary even for similar +interactions. Such diversity makes the annotating task of 3D interactions +difficult and hard to scale, which limits the potential to reason about that in +a supervised way. One way of learning the 3D spatial relationship between +humans and objects during interaction is by showing multiple 2D images captured +from different viewpoints when humans interact with the same type of objects. +The core idea of our method is to leverage a generative model that produces +high-quality 2D images from an arbitrary text prompt input as an "unbounded" +data generator with effective controllability and view diversity. Despite its +imperfection of the image quality over real images, we demonstrate that the +synthesized images are sufficient to learn the 3D human-object spatial +relations. We present multiple strategies to leverage the synthesized images, +including (1) the first method to leverage a generative image model for 3D +human-object spatial relation learning; (2) a framework to reason about the 3D +spatial relations from inconsistent 2D cues in a self-supervised manner via 3D +occupancy reasoning with pose canonicalization; (3) semantic clustering to +disambiguate different types of interactions with the same object types; and +(4) a novel metric to assess the quality of 3D spatial learning of interaction. +Project Page: https://jellyheadandrew.github.io/projects/chorus + +
+
+ comment: Accepted to ICCV 2023 (Oral Presentation). Project Page: + https://jellyheadandrew.github.io/projects/chorus +
+
+
+
+
+ + ☆ A Generative Approach for Image Registration of Visible-Thermal (VT) + Cancer Faces MICCAI 2023 + + +
+ Since thermal imagery offers a unique modality to investigate pain, the U.S. +National Institutes of Health (NIH) has collected a large and diverse set of +cancer patient facial thermograms for AI-based pain research. However, +differing angles from camera capture between thermal and visible sensors has +led to misalignment between Visible-Thermal (VT) images. We modernize the +classic computer vision task of image registration by applying and modifying a +generative alignment algorithm to register VT cancer faces, without the need +for a reference or alignment parameters. By registering VT faces, we +demonstrate that the quality of thermal images produced in the generative AI +downstream task of Visible-to-Thermal (V2T) image translation significantly +improves up to 52.5\%, than without registration. Images in this paper have +been approved by the NIH NCI for public dissemination. + +
+
+ comment: 2nd Annual Artificial Intelligence over Infrared Images for Medical + Applications Workshop (AIIIMA) at the 26th International Conference on + Medical Image Computing and Computer Assisted Intervention (MICCAI 2023) +
+
+
+
+
+ + ☆ MolGrapher: Graph-based Visual Recognition of Chemical Structures + + +
+ The automatic analysis of chemical literature has immense potential to +accelerate the discovery of new materials and drugs. Much of the critical +information in patent documents and scientific articles is contained in +figures, depicting the molecule structures. However, automatically parsing the +exact chemical structure is a formidable challenge, due to the amount of +detailed information, the diversity of drawing styles, and the need for +training data. In this work, we introduce MolGrapher to recognize chemical +structures visually. First, a deep keypoint detector detects the atoms. Second, +we treat all candidate atoms and bonds as nodes and put them in a graph. This +construct allows a natural graph representation of the molecule. Last, we +classify atom and bond nodes in the graph with a Graph Neural Network. To +address the lack of real training data, we propose a synthetic data generation +pipeline producing diverse and realistic results. In addition, we introduce a +large-scale benchmark of annotated real molecule images, USPTO-30K, to spur +research on this critical topic. Extensive experiments on five datasets show +that our approach significantly outperforms classical and learning-based +methods in most settings. Code, models, and datasets are available. + +
+
+
+
+
+ + ☆ SPPNet: A Single-Point Prompt Network for Nuclei Image Segmentation + + +
+ Image segmentation plays an essential role in nuclei image analysis. +Recently, the segment anything model has made a significant breakthrough in +such tasks. However, the current model exists two major issues for cell +segmentation: (1) the image encoder of the segment anything model involves a +large number of parameters. Retraining or even fine-tuning the model still +requires expensive computational resources. (2) in point prompt mode, points +are sampled from the center of the ground truth and more than one set of points +is expected to achieve reliable performance, which is not efficient for +practical applications. In this paper, a single-point prompt network is +proposed for nuclei image segmentation, called SPPNet. We replace the original +image encoder with a lightweight vision transformer. Also, an effective +convolutional block is added in parallel to extract the low-level semantic +information from the image and compensate for the performance degradation due +to the small image encoder. We propose a new point-sampling method based on the +Gaussian kernel. The proposed model is evaluated on the MoNuSeg-2018 dataset. +The result demonstrated that SPPNet outperforms existing U-shape architectures +and shows faster convergence in training. Compared to the segment anything +model, SPPNet shows roughly 20 times faster inference, with 1/70 parameters and +computational cost. Particularly, only one set of points is required in both +the training and inference phases, which is more reasonable for clinical +applications. The code for our work and more technical details can be found at +https://github.com/xq141839/SPPNet. + +
+
+
+
+
+ + ☆ CIParsing: Unifying Causality Properties into Multiple Human Parsing + + +
+ Existing methods of multiple human parsing (MHP) apply statistical models to +acquire underlying associations between images and labeled body parts. However, +acquired associations often contain many spurious correlations that degrade +model generalization, leading statistical models to be vulnerable to visually +contextual variations in images (e.g., unseen image styles/external +interventions). To tackle this, we present a causality inspired parsing +paradigm termed CIParsing, which follows fundamental causal principles +involving two causal properties for human parsing (i.e., the causal diversity +and the causal invariance). Specifically, we assume that an input image is +constructed by a mix of causal factors (the characteristics of body parts) and +non-causal factors (external contexts), where only the former ones cause the +generation process of human parsing.Since causal/non-causal factors are +unobservable, a human parser in proposed CIParsing is required to construct +latent representations of causal factors and learns to enforce representations +to satisfy the causal properties. In this way, the human parser is able to rely +on causal factors w.r.t relevant evidence rather than non-causal factors w.r.t +spurious correlations, thus alleviating model degradation and yielding improved +parsing ability. Notably, the CIParsing is designed in a plug-and-play fashion +and can be integrated into any existing MHP models. Extensive experiments +conducted on two widely used benchmarks demonstrate the effectiveness and +generalizability of our method. + +
+
+
+
+
+ + ☆ SG-Former: Self-guided Transformer with Evolving Token Reallocation ICCV 2023 + + +
+ Vision Transformer has demonstrated impressive success across various vision +tasks. However, its heavy computation cost, which grows quadratically with +respect to the token sequence length, largely limits its power in handling +large feature maps. To alleviate the computation cost, previous works rely on +either fine-grained self-attentions restricted to local small regions, or +global self-attentions but to shorten the sequence length resulting in coarse +granularity. In this paper, we propose a novel model, termed as Self-guided +Transformer~(SG-Former), towards effective global self-attention with adaptive +fine granularity. At the heart of our approach is to utilize a significance +map, which is estimated through hybrid-scale self-attention and evolves itself +during training, to reallocate tokens based on the significance of each region. +Intuitively, we assign more tokens to the salient regions for achieving +fine-grained attention, while allocating fewer tokens to the minor regions in +exchange for efficiency and global receptive fields. The proposed SG-Former +achieves performance superior to state of the art: our base size model achieves +\textbf{84.7\%} Top-1 accuracy on ImageNet-1K, \textbf{51.2mAP} bbAP on CoCo, +\textbf{52.7mIoU} on ADE20K surpassing the Swin Transformer by \textbf{+1.3\% / ++2.7 mAP/ +3 mIoU}, with lower computation costs and fewer parameters. The code +is available at +\href{https://github.com/OliverRensu/SG-Former}{https://github.com/OliverRensu/SG-Former} + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No ICCV 2023 + + +
+ Out-of-distribution (OOD) detection refers to training the model on an +in-distribution (ID) dataset to classify whether the input images come from +unknown classes. Considerable effort has been invested in designing various OOD +detection methods based on either convolutional neural networks or +transformers. However, zero-shot OOD detection methods driven by CLIP, which +only require class names for ID, have received less attention. This paper +presents a novel method, namely CLIP saying "no" (\textbf{CLIPN}), which +empowers the logic of saying "no" within CLIP. Our key motivation is to equip +CLIP with the capability of distinguishing OOD and ID samples using +positive-semantic prompts and negation-semantic prompts. Specifically, we +design a novel learnable "no" prompt and a "no" text encoder to capture +negation semantics within images. Subsequently, we introduce two loss +functions: the image-text binary-opposite loss and the text semantic-opposite +loss, which we use to teach CLIPN to associate images with "no" prompts, +thereby enabling it to identify unknown samples. Furthermore, we propose two +threshold-free inference algorithms to perform OOD detection by utilizing +negation semantics from "no" prompts and the text encoder. Experimental results +on 9 benchmark datasets (3 ID datasets and 6 OOD datasets) for the OOD +detection task demonstrate that CLIPN, based on ViT-B-16, outperforms 7 +well-used algorithms by at least 2.34\% and 11.64\% in terms of AUROC and FPR95 +for zero-shot OOD detection on ImageNet-1K. Our CLIPN can serve as a solid +foundation for effectively leveraging CLIP in downstream OOD tasks. The code is +available on +https://github.com/xmed-lab/CLIPN}{https://github.com/xmed-lab/CLIPN. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Towards Real-Time Analysis of Broadcast Badminton Videos + + +
+ Analysis of player movements is a crucial subset of sports analysis. Existing +player movement analysis methods use recorded videos after the match is over. +In this work, we propose an end-to-end framework for player movement analysis +for badminton matches on live broadcast match videos. We only use the visual +inputs from the match and, unlike other approaches which use multi-modal sensor +data, our approach uses only visual cues. We propose a method to calculate the +on-court distance covered by both the players from the video feed of a live +broadcast badminton match. To perform this analysis, we focus on the gameplay +by removing replays and other redundant parts of the broadcast match. We then +perform player tracking to identify and track the movements of both players in +each frame. Finally, we calculate the distance covered by each player and the +average speed with which they move on the court. We further show a heatmap of +the areas covered by the player on the court which is useful for analyzing the +gameplay of the player. Our proposed framework was successfully used to analyze +live broadcast matches in real-time during the Premier Badminton League 2019 +(PBL 2019), with commentators and broadcasters appreciating the utility. + +
+
+
+
+
+ + ☆ Sign Language Translation with Iterative Prototype ICCV 2023 + + +
+ This paper presents IP-SLT, a simple yet effective framework for sign +language translation (SLT). Our IP-SLT adopts a recurrent structure and +enhances the semantic representation (prototype) of the input sign language +video via an iterative refinement manner. Our idea mimics the behavior of human +reading, where a sentence can be digested repeatedly, till reaching accurate +understanding. Technically, IP-SLT consists of feature extraction, prototype +initialization, and iterative prototype refinement. The initialization module +generates the initial prototype based on the visual feature extracted by the +feature extraction module. Then, the iterative refinement module leverages the +cross-attention mechanism to polish the previous prototype by aggregating it +with the original video feature. Through repeated refinement, the prototype +finally converges to a more stable and accurate state, leading to a fluent and +appropriate translation. In addition, to leverage the sequential dependence of +prototypes, we further propose an iterative distillation loss to compress the +knowledge of the final iteration into previous ones. As the autoregressive +decoding process is executed only once in inference, our IP-SLT is ready to +improve various SLT systems with acceptable overhead. Extensive experiments are +conducted on public benchmarks to demonstrate the effectiveness of the IP-SLT. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Tumor-Centered Patching for Enhanced Medical Image Segmentation + + +
+ The realm of medical image diagnosis has advanced significantly with the +integration of computer-aided diagnosis and surgical systems. However, +challenges persist, particularly in achieving precise image segmentation. While +deep learning techniques show potential, obstacles like limited resources, slow +convergence, and class imbalance impede their effectiveness. Traditional +patch-based methods, though common, struggle to capture intricate tumor +boundaries and often lead to redundant samples, compromising computational +efficiency and feature quality. To tackle these issues, this research +introduces an innovative approach centered on the tumor itself for patch-based +image analysis. This novel tumor-centered patching method aims to address the +class imbalance and boundary deficiencies, enabling focused and accurate tumor +segmentation. By aligning patches with the tumor's anatomical context, this +technique enhances feature extraction accuracy and reduces computational load. +Experimental results demonstrate improved class imbalance, with segmentation +scores of 0.78, 0.76, and 0.71 for whole, core, and enhancing tumors, +respectively using a lightweight simple U-Net. This approach shows potential +for enhancing medical image segmentation and improving computer-aided diagnosis +systems. + +
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ☆ NPF-200: A Multi-Modal Eye Fixation Dataset and Method for + Non-Photorealistic Videos ACM MM 2023 + + +
+ Non-photorealistic videos are in demand with the wave of the metaverse, but +lack of sufficient research studies. This work aims to take a step forward to +understand how humans perceive non-photorealistic videos with eye fixation +(\ie, saliency detection), which is critical for enhancing media production, +artistic design, and game user experience. To fill in the gap of missing a +suitable dataset for this research line, we present NPF-200, the first +large-scale multi-modal dataset of purely non-photorealistic videos with eye +fixations. Our dataset has three characteristics: 1) it contains soundtracks +that are essential according to vision and psychological studies; 2) it +includes diverse semantic content and videos are of high-quality; 3) it has +rich motions across and within videos. We conduct a series of analyses to gain +deeper insights into this task and compare several state-of-the-art methods to +explore the gap between natural images and non-photorealistic data. +Additionally, as the human attention system tends to extract visual and audio +features with different frequencies, we propose a universal frequency-aware +multi-modal non-photorealistic saliency detection model called NPSNet, +demonstrating the state-of-the-art performance of our task. The results uncover +strengths and weaknesses of multi-modal network design and multi-domain +training, opening up promising directions for future works. {Our dataset and +code can be found at \url{https://github.com/Yangziyu/NPF200}}. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Multimodal Latent Emotion Recognition from Micro-expression and + Physiological Signals + + +
+ This paper discusses the benefits of incorporating multimodal data for +improving latent emotion recognition accuracy, focusing on micro-expression +(ME) and physiological signals (PS). The proposed approach presents a novel +multimodal learning framework that combines ME and PS, including a 1D separable +and mixable depthwise inception network, a standardised normal distribution +weighted feature fusion method, and depth/physiology guided attention modules +for multimodal learning. Experimental results show that the proposed approach +outperforms the benchmark method, with the weighted fusion method and guided +attention modules both contributing to enhanced performance. + +
+
+
+
+
+ + ☆ A Probabilistic Fluctuation based Membership Inference Attack for + Generative Models + + +
+ Membership Inference Attack (MIA) identifies whether a record exists in a +machine learning model's training set by querying the model. MIAs on the +classic classification models have been well-studied, and recent works have +started to explore how to transplant MIA onto generative models. Our +investigation indicates that existing MIAs designed for generative models +mainly depend on the overfitting in target models. However, overfitting can be +avoided by employing various regularization techniques, whereas existing MIAs +demonstrate poor performance in practice. Unlike overfitting, memorization is +essential for deep learning models to attain optimal performance, making it a +more prevalent phenomenon. Memorization in generative models leads to an +increasing trend in the probability distribution of generating records around +the member record. Therefore, we propose a Probabilistic Fluctuation Assessing +Membership Inference Attack (PFAMI), a black-box MIA that infers memberships by +detecting these trends via analyzing the overall probabilistic fluctuations +around given records. We conduct extensive experiments across multiple +generative models and datasets, which demonstrate PFAMI can improve the attack +success rate (ASR) by about 27.9% when compared with the best baseline. + +
+
+
+
+
+ + ☆ Mesh Conflation of Oblique Photogrammetric Models using Virtual Cameras + and Truncated Signed Distance Field + + +
+ Conflating/stitching 2.5D raster digital surface models (DSM) into a large +one has been a running practice in geoscience applications, however, conflating +full-3D mesh models, such as those from oblique photogrammetry, is extremely +challenging. In this letter, we propose a novel approach to address this +challenge by conflating multiple full-3D oblique photogrammetric models into a +single, and seamless mesh for high-resolution site modeling. Given two or more +individually collected and created photogrammetric meshes, we first propose to +create a virtual camera field (with a panoramic field of view) to incubate +virtual spaces represented by Truncated Signed Distance Field (TSDF), an +implicit volumetric field friendly for linear 3D fusion; then we adaptively +leverage the truncated bound of meshes in TSDF to conflate them into a single +and accurate full 3D site model. With drone-based 3D meshes, we show that our +approach significantly improves upon traditional methods for model conflations, +to drive new potentials to create excessively large and accurate full 3D mesh +models in support of geoscience and environmental applications. + +
+
+ comment: 5 Figures +
+
+
+
+
+ + ☆ Select-and-Combine (SAC): A Novel Multi-Stereo Depth Fusion Algorithm + for Point Cloud Generation via Efficient Local Markov Netlets + + +
+ Many practical systems for image-based surface reconstruction employ a +stereo/multi-stereo paradigm, due to its ability to scale for large scenes and +its ease of implementation for out-of-core operations. In this process, +multiple and abundant depth maps from stereo matching must be combined and +fused into a single, consistent, and clean point cloud. However, the noises and +outliers caused by stereo matching and the heterogenous geometric errors of the +poses present a challenge for existing fusion algorithms, since they mostly +assume Gaussian errors and predict fused results based on data from local +spatial neighborhoods, which may inherit uncertainties from multiple depths +resulting in lowered accuracy. In this paper, we propose a novel depth fusion +paradigm, that instead of numerically fusing points from multiple depth maps, +selects the best depth map per point, and combines them into a single and clean +point cloud. This paradigm, called select-and-combine (SAC), is achieved +through modeling the point level fusion using local Markov Netlets, a +micro-network over point across neighboring views for depth/view selection, +followed by a Netlets collapse process for point combination. The Markov +Netlets are optimized such that they can inherently leverage spatial +consistencies among depth maps of neighboring views, thus they can address +errors beyond Gaussian ones. Our experiment results show that our approach +outperforms existing depth fusion approaches by increasing the F1 score that +considers both accuracy and completeness by 2.07% compared to the best existing +method. Finally, our approach generates clearer point clouds that are 18% less +redundant while with a higher accuracy before fusion + +
+
+ comment: 6 Figures +
+
+
+
+
+ + ☆ Lite-HRNet Plus: Fast and Accurate Facial Landmark Detection ICIP2023 + + +
+ Facial landmark detection is an essential technology for driver status +tracking and has been in demand for real-time estimations. As a landmark +coordinate prediction, heatmap-based methods are known to achieve a high +accuracy, and Lite-HRNet can achieve a fast estimation. However, with +Lite-HRNet, the problem of a heavy computational cost of the fusion block, +which connects feature maps with different resolutions, has yet to be solved. +In addition, the strong output module used in HRNetV2 is not applied to +Lite-HRNet. Given these problems, we propose a novel architecture called +Lite-HRNet Plus. Lite-HRNet Plus achieves two improvements: a novel fusion +block based on a channel attention and a novel output module with less +computational intensity using multi-resolution feature maps. Through +experiments conducted on two facial landmark datasets, we confirmed that +Lite-HRNet Plus further improved the accuracy in comparison with conventional +methods, and achieved a state-of-the-art accuracy with a computational +complexity with the range of 10M FLOPs. + +
+
+ comment: Accepted at ICIP2023 +
+
+
+
+
+ + ☆ Masking Strategies for Background Bias Removal in Computer Vision Models ICCV + + +
+ Models for fine-grained image classification tasks, where the difference +between some classes can be extremely subtle and the number of samples per +class tends to be low, are particularly prone to picking up background-related +biases and demand robust methods to handle potential examples with +out-of-distribution (OOD) backgrounds. To gain deeper insights into this +critical problem, our research investigates the impact of background-induced +bias on fine-grained image classification, evaluating standard backbone models +such as Convolutional Neural Network (CNN) and Vision Transformers (ViT). We +explore two masking strategies to mitigate background-induced bias: Early +masking, which removes background information at the (input) image level, and +late masking, which selectively masks high-level spatial features corresponding +to the background. Extensive experiments assess the behavior of CNN and ViT +models under different masking strategies, with a focus on their generalization +to OOD backgrounds. The obtained findings demonstrate that both proposed +strategies enhance OOD performance compared to the baseline models, with early +masking consistently exhibiting the best OOD performance. Notably, a ViT +variant employing GAP-Pooled Patch token-based classification combined with +early masking achieves the highest OOD robustness. + +
+
+ comment: Accepted at the 2023 IEEE/CVF International Conference on Computer + Vision Workshop (ICCVW) on Out Of Distribution Generalization in Computer + Vision (OOD-CV) +
+
+
+
+
+ + ☆ The TYC Dataset for Understanding Instance-Level Semantics and Motions + of Cells in Microstructures ICCV 2023 + + +
+ Segmenting cells and tracking their motion over time is a common task in +biomedical applications. However, predicting accurate instance-wise +segmentation and cell motions from microscopy imagery remains a challenging +task. Using microstructured environments for analyzing single cells in a +constant flow of media adds additional complexity. While large-scale labeled +microscopy datasets are available, we are not aware of any large-scale dataset, +including both cells and microstructures. In this paper, we introduce the +trapped yeast cell (TYC) dataset, a novel dataset for understanding +instance-level semantics and motions of cells in microstructures. We release +$105$ dense annotated high-resolution brightfield microscopy images, including +about $19$k instance masks. We also release $261$ curated video clips composed +of $1293$ high-resolution microscopy images to facilitate unsupervised +understanding of cell motions and morphology. TYC offers ten times more +instance annotations than the previously largest dataset, including cells and +microstructures. Our effort also exceeds previous attempts in terms of +microstructure variability, resolution, complexity, and capturing device +(microscopy) variability. We facilitate a unified comparison on our novel +dataset by introducing a standardized evaluation strategy. TYC and evaluation +code are publicly available under CC BY 4.0 license. + +
+
+ comment: Accepted at ICCV 2023 Workshop on BioImage Computing. Project page + (with links to the dataset and code): + https://christophreich1996.github.io/tyc_dataset/ +
+
+
+
+
+ + ☆ Less is More -- Towards parsimonious multi-task models using structured + sparsity + + +
+ Group sparsity in Machine Learning (ML) encourages simpler, more +interpretable models with fewer active parameter groups. This work aims to +incorporate structured group sparsity into the shared parameters of a +Multi-Task Learning (MTL) framework, to develop parsimonious models that can +effectively address multiple tasks with fewer parameters while maintaining +comparable or superior performance to a dense model. Sparsifying the model +during training helps decrease the model's memory footprint, computation +requirements, and prediction time during inference. We use channel-wise l1/l2 +group sparsity in the shared layers of the Convolutional Neural Network (CNN). +This approach not only facilitates the elimination of extraneous groups +(channels) but also imposes a penalty on the weights, thereby enhancing the +learning of all tasks. We compare the outcomes of single-task and multi-task +experiments under group sparsity on two publicly available MTL datasets, NYU-v2 +and CelebAMask-HQ. We also investigate how changing the sparsification degree +impacts both the performance of the model and the sparsity of groups. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Advancements in Point Cloud Data Augmentation for Deep Learning: A + Survey + + +
+ Point cloud has a wide range of applications in areas such as autonomous +driving, mapping, navigation, scene reconstruction, and medical imaging. Due to +its great potentials in these applications, point cloud processing has gained +great attention in the field of computer vision. Among various point cloud +processing techniques, deep learning (DL) has become one of the mainstream and +effective methods for tasks such as detection, segmentation and classification. +To reduce overfitting during training DL models and improve model performance +especially when the amount and/or diversity of training data are limited, +augmentation is often crucial. Although various point cloud data augmentation +methods have been widely used in different point cloud processing tasks, there +are currently no published systematic surveys or reviews of these methods. +Therefore, this article surveys and discusses these methods and categorizes +them into a taxonomy framework. Through the comprehensive evaluation and +comparison of the augmentation methods, this article identifies their +potentials and limitations and suggests possible future research directions. +This work helps researchers gain a holistic understanding of the current status +of point cloud data augmentation and promotes its wider application and +development. + +
+
+
+
+
+ + ☆ Generalized Continual Category Discovery + + +
+ Most of Continual Learning (CL) methods push the limit of supervised learning +settings, where an agent is expected to learn new labeled tasks and not forget +previous knowledge. However, these settings are not well aligned with real-life +scenarios, where a learning agent has access to a vast amount of unlabeled data +encompassing both novel (entirely unlabeled) classes and examples from known +classes. Drawing inspiration from Generalized Category Discovery (GCD), we +introduce a novel framework that relaxes this assumption. Precisely, in any +task, we allow for the existence of novel and known classes, and one must use +continual version of unsupervised learning methods to discover them. We call +this setting Generalized Continual Category Discovery (GCCD). It unifies CL and +GCD, bridging the gap between synthetic benchmarks and real-life scenarios. +With a series of experiments, we present that existing methods fail to +accumulate knowledge from subsequent tasks in which unlabeled samples of novel +classes are present. In light of these limitations, we propose a method that +incorporates both supervised and unsupervised signals and mitigates the +forgetting through the use of centroid adaptation. Our method surpasses strong +CL methods adopted for GCD techniques and presents a superior representation +learning performance. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Cross-Modality Proposal-guided Feature Mining for Unregistered + RGB-Thermal Pedestrian Detection + + +
+ RGB-Thermal (RGB-T) pedestrian detection aims to locate the pedestrians in +RGB-T image pairs to exploit the complementation between the two modalities for +improving detection robustness in extreme conditions. Most existing algorithms +assume that the RGB-T image pairs are well registered, while in the real world +they are not aligned ideally due to parallax or different field-of-view of the +cameras. The pedestrians in misaligned image pairs may locate at different +positions in two images, which results in two challenges: 1) how to achieve +inter-modality complementation using spatially misaligned RGB-T pedestrian +patches, and 2) how to recognize the unpaired pedestrians at the boundary. To +deal with these issues, we propose a new paradigm for unregistered RGB-T +pedestrian detection, which predicts two separate pedestrian locations in the +RGB and thermal images, respectively. Specifically, we propose a cross-modality +proposal-guided feature mining (CPFM) mechanism to extract the two precise +fusion features for representing the pedestrian in the two modalities, even if +the RGB-T image pair is unaligned. It enables us to effectively exploit the +complementation between the two modalities. With the CPFM mechanism, we build a +two-stream dense detector; it predicts the two pedestrian locations in the two +modalities based on the corresponding fusion feature mined by the CPFM +mechanism. Besides, we design a data augmentation method, named Homography, to +simulate the discrepancy in scales and views between images. We also +investigate two non-maximum suppression (NMS) methods for post-processing. +Favorable experimental results demonstrate the effectiveness and robustness of +our method in dealing with unregistered pedestrians with different shifts. + +
+
+
+
+
+ + ☆ DISGAN: Wavelet-informed Discriminator Guides GAN to MRI + Super-resolution with Noise Cleaning + + +
+ MRI super-resolution (SR) and denoising tasks are fundamental challenges in +the field of deep learning, which have traditionally been treated as distinct +tasks with separate paired training data. In this paper, we propose an +innovative method that addresses both tasks simultaneously using a single deep +learning model, eliminating the need for explicitly paired noisy and clean +images during training. Our proposed model is primarily trained for SR, but +also exhibits remarkable noise-cleaning capabilities in the super-resolved +images. Instead of conventional approaches that introduce frequency-related +operations into the generative process, our novel approach involves the use of +a GAN model guided by a frequency-informed discriminator. To achieve this, we +harness the power of the 3D Discrete Wavelet Transform (DWT) operation as a +frequency constraint within the GAN framework for the SR task on magnetic +resonance imaging (MRI) data. Specifically, our contributions include: 1) a 3D +generator based on residual-in-residual connected blocks; 2) the integration of +the 3D DWT with $1\times 1$ convolution into a DWT+conv unit within a 3D Unet +for the discriminator; 3) the use of the trained model for high-quality image +SR, accompanied by an intrinsic denoising process. We dub the model "Denoising +Induced Super-resolution GAN (DISGAN)" due to its dual effects of SR image +generation and simultaneous denoising. Departing from the traditional approach +of training SR and denoising tasks as separate models, our proposed DISGAN is +trained only on the SR task, but also achieves exceptional performance in +denoising. The model is trained on 3D MRI data from dozens of subjects from the +Human Connectome Project (HCP) and further evaluated on previously unseen MRI +data from subjects with brain tumours and epilepsy to assess its denoising and +SR performance. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4 + + +
+ Multimodal large language models acquire their instruction-following +capabilities through a two-stage training process: pre-training on image-text +pairs and fine-tuning on supervised vision-language instruction data. Recent +studies have shown that large language models can achieve satisfactory results +even with a limited amount of high-quality instruction-following data. In this +paper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset +comprising only 200 examples, amounting to approximately 6% of the +instruction-following data used in the alignment dataset for MiniGPT-4. We +first propose several metrics to access the quality of multimodal instruction +data. Based on these metrics, we present a simple and effective data selector +to automatically identify and filter low-quality vision-language data. By +employing this method, InstructionGPT-4 outperforms the original MiniGPT-4 on +various evaluations (e.g., visual question answering, GPT-4 preference). +Overall, our findings demonstrate that less but high-quality instruction tuning +data is efficient to enable multimodal large language models to generate better +output. + +
+
+
+
+
+ + ☆ SILT: Shadow-aware Iterative Label Tuning for Learning to Detect Shadows + from Noisy Labels ICCV 2023 + + +
+ Existing shadow detection datasets often contain missing or mislabeled +shadows, which can hinder the performance of deep learning models trained +directly on such data. To address this issue, we propose SILT, the Shadow-aware +Iterative Label Tuning framework, which explicitly considers noise in shadow +labels and trains the deep model in a self-training manner. Specifically, we +incorporate strong data augmentations with shadow counterfeiting to help the +network better recognize non-shadow regions and alleviate overfitting. We also +devise a simple yet effective label tuning strategy with global-local fusion +and shadow-aware filtering to encourage the network to make significant +refinements on the noisy labels. We evaluate the performance of SILT by +relabeling the test set of the SBU dataset and conducting various experiments. +Our results show that even a simple U-Net trained with SILT can outperform all +state-of-the-art methods by a large margin. When trained on SBU / UCF / ISTD, +our network can successfully reduce the Balanced Error Rate by 25.2% / 36.9% / +21.3% over the best state-of-the-art method. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ HarvestNet: A Dataset for Detecting Smallholder Farming Activity Using + Harvest Piles and Remote Sensing + + +
+ Small farms contribute to a large share of the productive land in developing +countries. In regions such as sub-Saharan Africa, where 80% of farms are small +(under 2 ha in size), the task of mapping smallholder cropland is an important +part of tracking sustainability measures such as crop productivity. However, +the visually diverse and nuanced appearance of small farms has limited the +effectiveness of traditional approaches to cropland mapping. Here we introduce +a new approach based on the detection of harvest piles characteristic of many +smallholder systems throughout the world. We present HarvestNet, a dataset for +mapping the presence of farms in the Ethiopian regions of Tigray and Amhara +during 2020-2023, collected using expert knowledge and satellite images, +totaling 7k hand-labeled images and 2k ground collected labels. We also +benchmark a set of baselines including SOTA models in remote sensing with our +best models having around 80% classification performance on hand labelled data +and 90%, 98% accuracy on ground truth data for Tigray, Amhara respectively. We +also perform a visual comparison with a widely used pre-existing coverage map +and show that our model detects an extra 56,621 hectares of cropland in Tigray. +We conclude that remote sensing of harvest piles can contribute to more timely +and accurate cropland assessments in food insecure region. + +
+
+ comment: 18 pages, 22 figures +
+
+
+
+
+ + ☆ Manipulating Embeddings of Stable Diffusion Prompts + + +
+ Generative text-to-image models such as Stable Diffusion allow users to +generate images based on a textual description, the prompt. Changing the prompt +is still the primary means for the user to change a generated image as desired. +However, changing the image by reformulating the prompt remains a difficult +process of trial and error, which has led to the emergence of prompt +engineering as a new field of research. We propose and analyze methods to +change the embedding of a prompt directly instead of the prompt text. It allows +for more fine-grained and targeted control that takes into account user +intentions. Our approach treats the generative text-to-image model as a +continuous function and passes gradients between the image space and the prompt +embedding space. By addressing different user interaction problems, we can +apply this idea in three scenarios: (1) Optimization of a metric defined in +image space that could measure, for example, image style. (2) Assistance of +users in creative tasks by enabling them to navigate the image space along a +selection of directions of "near" prompt embeddings. (3) Changing the embedding +of the prompt to include information that the user has seen in a particular +seed but finds difficult to describe in the prompt. Our experiments demonstrate +the feasibility of the described methods. + +
+
+
+
+
+ + ☆ DR-Tune: Improving Fine-tuning of Pretrained Visual Models by + Distribution Regularization with Semantic Calibration ICCV'2023 + + +
+ The visual models pretrained on large-scale benchmarks encode general +knowledge and prove effective in building more powerful representations for +downstream tasks. Most existing approaches follow the fine-tuning paradigm, +either by initializing or regularizing the downstream model based on the +pretrained one. The former fails to retain the knowledge in the successive +fine-tuning phase, thereby prone to be over-fitting, and the latter imposes +strong constraints to the weights or feature maps of the downstream model +without considering semantic drift, often incurring insufficient optimization. +To deal with these issues, we propose a novel fine-tuning framework, namely +distribution regularization with semantic calibration (DR-Tune). It employs +distribution regularization by enforcing the downstream task head to decrease +its classification error on the pretrained feature distribution, which prevents +it from over-fitting while enabling sufficient training of downstream encoders. +Furthermore, to alleviate the interference by semantic drift, we develop the +semantic calibration (SC) module to align the global shape and class centers of +the pretrained and downstream feature distributions. Extensive experiments on +widely used image classification datasets show that DR-Tune consistently +improves the performance when combing with various backbones under different +pretraining strategies. Code is available at: +https://github.com/weeknan/DR-Tune. + +
+
+ comment: Accepted by ICCV'2023 +
+
+
+
+
+ + ☆ Towards Privacy-Supporting Fall Detection via Deep Unsupervised + RGB2Depth Adaptation + + +
+ Fall detection is a vital task in health monitoring, as it allows the system +to trigger an alert and therefore enabling faster interventions when a person +experiences a fall. Although most previous approaches rely on standard RGB +video data, such detailed appearance-aware monitoring poses significant privacy +concerns. Depth sensors, on the other hand, are better at preserving privacy as +they merely capture the distance of objects from the sensor or camera, omitting +color and texture information. In this paper, we introduce a privacy-supporting +solution that makes the RGB-trained model applicable in depth domain and +utilizes depth data at test time for fall detection. To achieve cross-modal +fall detection, we present an unsupervised RGB to Depth (RGB2Depth) cross-modal +domain adaptation approach that leverages labelled RGB data and unlabelled +depth data during training. Our proposed pipeline incorporates an intermediate +domain module for feature bridging, modality adversarial loss for modality +discrimination, classification loss for pseudo-labeled depth data and labeled +source data, triplet loss that considers both source and target domains, and a +novel adaptive loss weight adjustment method for improved coordination among +various losses. Our approach achieves state-of-the-art results in the +unsupervised RGB2Depth domain adaptation task for fall detection. Code is +available at https://github.com/1015206533/privacy_supporting_fall_detection. + +
+
+
+
+
+ + ☆ Head-Tail Cooperative Learning Network for Unbiased Scene Graph + Generation + + +
+ Scene Graph Generation (SGG) as a critical task in image understanding, +facing the challenge of head-biased prediction caused by the long-tail +distribution of predicates. However, current unbiased SGG methods can easily +prioritize improving the prediction of tail predicates while ignoring the +substantial sacrifice in the prediction of head predicates, leading to a shift +from head bias to tail bias. To address this issue, we propose a model-agnostic +Head-Tail Collaborative Learning (HTCL) network that includes head-prefer and +tail-prefer feature representation branches that collaborate to achieve +accurate recognition of both head and tail predicates. We also propose a +self-supervised learning approach to enhance the prediction ability of the +tail-prefer feature representation branch by constraining tail-prefer predicate +features. Specifically, self-supervised learning converges head predicate +features to their class centers while dispersing tail predicate features as +much as possible through contrast learning and head center loss. We demonstrate +the effectiveness of our HTCL by applying it to various SGG models on VG150, +Open Images V6 and GQA200 datasets. The results show that our method achieves +higher mean Recall with a minimal sacrifice in Recall and achieves a new +state-of-the-art overall performance. Our code is available at +https://github.com/wanglei0618/HTCL. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ CgT-GAN: CLIP-guided Text GAN for Image Captioning ACM MM 2023 + + +
+ The large-scale visual-language pre-trained model, Contrastive Language-Image +Pre-training (CLIP), has significantly improved image captioning for scenarios +without human-annotated image-caption pairs. Recent advanced CLIP-based image +captioning without human annotations follows a text-only training paradigm, +i.e., reconstructing text from shared embedding space. Nevertheless, these +approaches are limited by the training/inference gap or huge storage +requirements for text embeddings. Given that it is trivial to obtain images in +the real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates +images into the training process to enable the model to "see" real visual +modality. Particularly, we use adversarial training to teach CgT-GAN to mimic +the phrases of an external text corpus and CLIP-based reward to provide +semantic guidance. The caption generator is jointly rewarded based on the +caption naturalness to human language calculated from the GAN's discriminator +and the semantic guidance reward computed by the CLIP-based reward module. In +addition to the cosine similarity as the semantic guidance reward (i.e., +CLIP-cos), we further introduce a novel semantic guidance reward called +CLIP-agg, which aligns the generated caption with a weighted text embedding by +attentively aggregating the entire corpus. Experimental results on three +subtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms +state-of-the-art methods significantly across all metrics. Code is available at +https://github.com/Lihr747/CgtGAN. + +
+
+ comment: Accepted at ACM MM 2023 +
+
+
+
+
+ + ☆ Large Multilingual Models Pivot Zero-Shot Multimodal Learning across + Languages + + +
+ Recently there has been a significant surge in multimodal learning in terms +of both image-to-text and text-to-image generation. However, the success is +typically limited to English, leaving other languages largely behind. Building +a competitive counterpart in other languages is highly challenging due to the +low-resource nature of non-English multimodal data (i.e., lack of large-scale, +high-quality image-text data). In this work, we propose MPM, an effective +training paradigm for training large multimodal models in low-resource +languages. MPM demonstrates that Multilingual language models can Pivot +zero-shot Multimodal learning across languages. Specifically, based on a strong +multilingual large language model, multimodal models pretrained on English-only +image-text data can well generalize to other languages in a zero-shot manner +for both image-to-text and text-to-image generation, even surpassing models +trained on image-text data in native languages. Taking Chinese as a practice of +MPM, we build large multimodal models VisCPM in image-to-text and text-to-image +generation, which achieve state-of-the-art (open-source) performance in +Chinese. To facilitate future research, we open-source codes and model weights +at https://github.com/OpenBMB/VisCPM.git. + +
+
+ comment: https://github.com/OpenBMB/VisCPM.git +
+
+
+
+
+ + ☆ RefEgo: Referring Expression Comprehension Dataset from First-Person + Perception of Ego4D ICCV2023 + + +
+ Grounding textual expressions on scene objects from first-person views is a +truly demanding capability in developing agents that are aware of their +surroundings and behave following intuitive text instructions. Such capability +is of necessity for glass-devices or autonomous robots to localize referred +objects in the real-world. In the conventional referring expression +comprehension tasks of images, however, datasets are mostly constructed based +on the web-crawled data and don't reflect diverse real-world structures on the +task of grounding textual expressions in diverse objects in the real world. +Recently, a massive-scale egocentric video dataset of Ego4D was proposed. Ego4D +covers around the world diverse real-world scenes including numerous indoor and +outdoor situations such as shopping, cooking, walking, talking, manufacturing, +etc. Based on egocentric videos of Ego4D, we constructed a broad coverage of +the video-based referring expression comprehension dataset: RefEgo. Our dataset +includes more than 12k video clips and 41 hours for video-based referring +expression comprehension annotation. In experiments, we combine the +state-of-the-art 2D referring expression comprehension models with the object +tracking algorithm, achieving the video-wise referred object tracking even in +difficult conditions: the referred object becomes out-of-frame in the middle of +the video or multiple similar objects are presented in the video. + +
+
+ comment: 15 pages, 11 figures. ICCV2023 +
+
+
+
+
+ + ☆ Distribution-Aware Calibration for Object Detection with Noisy Bounding + Boxes + + +
+ Large-scale well-annotated datasets are of great importance for training an +effective object detector. However, obtaining accurate bounding box annotations +is laborious and demanding. Unfortunately, the resultant noisy bounding boxes +could cause corrupt supervision signals and thus diminish detection +performance. Motivated by the observation that the real ground-truth is usually +situated in the aggregation region of the proposals assigned to a noisy +ground-truth, we propose DIStribution-aware CalibratiOn (DISCO) to model the +spatial distribution of proposals for calibrating supervision signals. In +DISCO, spatial distribution modeling is performed to statistically extract the +potential locations of objects. Based on the modeled distribution, three +distribution-aware techniques, i.e., distribution-aware proposal augmentation +(DA-Aug), distribution-aware box refinement (DA-Ref), and distribution-aware +confidence estimation (DA-Est), are developed to improve classification, +localization, and interpretability, respectively. Extensive experiments on +large-scale noisy image datasets (i.e., Pascal VOC and MS-COCO) demonstrate +that DISCO can achieve state-of-the-art detection performance, especially at +high noise levels. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ StofNet: Super-resolution Time of Flight Network + + +
+ Time of Flight (ToF) is a prevalent depth sensing technology in the fields of +robotics, medical imaging, and non-destructive testing. Yet, ToF sensing faces +challenges from complex ambient conditions making an inverse modelling from the +sparse temporal information intractable. This paper highlights the potential of +modern super-resolution techniques to learn varying surroundings for a reliable +and accurate ToF detection. Unlike existing models, we tailor an architecture +for sub-sample precise semi-global signal localization by combining +super-resolution with an efficient residual contraction block to balance +between fine signal details and large scale contextual information. We +consolidate research on ToF by conducting a benchmark comparison against six +state-of-the-art methods for which we employ two publicly available datasets. +This includes the release of our SToF-Chirp dataset captured by an airborne +ultrasound transducer. Results showcase the superior performance of our +proposed StofNet in terms of precision, reliability and model complexity. Our +code is available at https://github.com/hahnec/stofnet. + +
+
+ comment: pre-print +
+
+
+
+
+ + ☆ Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action + and Gesture Recognition ACM MM'23 + + +
+ RGB-D action and gesture recognition remain an interesting topic in +human-centered scene understanding, primarily due to the multiple granularities +and large variation in human motion. Although many RGB-D based action and +gesture recognition approaches have demonstrated remarkable results by +utilizing highly integrated spatio-temporal representations across multiple +modalities (i.e., RGB and depth data), they still encounter several challenges. +Firstly, vanilla 3D convolution makes it hard to capture fine-grained motion +differences between local clips under different modalities. Secondly, the +intricate nature of highly integrated spatio-temporal modeling can lead to +optimization difficulties. Thirdly, duplicate and unnecessary information can +add complexity and complicate entangled spatio-temporal modeling. To address +the above issues, we propose an innovative heuristic architecture called +Multi-stage Factorized Spatio-Temporal (MFST) for RGB-D action and gesture +recognition. The proposed MFST model comprises a 3D Central Difference +Convolution Stem (CDC-Stem) module and multiple factorized spatio-temporal +stages. The CDC-Stem enriches fine-grained temporal perception, and the +multiple hierarchical spatio-temporal stages construct dimension-independent +higher-order semantic primitives. Specifically, the CDC-Stem module captures +bottom-level spatio-temporal features and passes them successively to the +following spatio-temporal factored stages to capture the hierarchical spatial +and temporal features through the Multi- Scale Convolution and Transformer +(MSC-Trans) hybrid block and Weight-shared Multi-Scale Transformer (WMS-Trans) +block. The seamless integration of these innovative designs results in a robust +spatio-temporal representation that outperforms state-of-the-art approaches on +RGB-D action and gesture recognition datasets. + +
+
+ comment: ACM MM'23 has accepted this paper +
+
+
+
+
+ + ☆ Local Distortion Aware Efficient Transformer Adaptation for Image + Quality Assessment + + +
+ Image Quality Assessment (IQA) constitutes a fundamental task within the +field of computer vision, yet it remains an unresolved challenge, owing to the +intricate distortion conditions, diverse image contents, and limited +availability of data. Recently, the community has witnessed the emergence of +numerous large-scale pretrained foundation models, which greatly benefit from +dramatically increased data and parameter capacities. However, it remains an +open problem whether the scaling law in high-level tasks is also applicable to +IQA task which is closely related to low-level clues. In this paper, we +demonstrate that with proper injection of local distortion features, a larger +pretrained and fixed foundation model performs better in IQA tasks. +Specifically, for the lack of local distortion structure and inductive bias of +vision transformer (ViT), alongside the large-scale pretrained ViT, we use +another pretrained convolution neural network (CNN), which is well known for +capturing the local structure, to extract multi-scale image features. Further, +we propose a local distortion extractor to obtain local distortion features +from the pretrained CNN and a local distortion injector to inject the local +distortion features into ViT. By only training the extractor and injector, our +method can benefit from the rich knowledge in the powerful foundation models +and achieve state-of-the-art performance on popular IQA datasets, indicating +that IQA is not only a low-level problem but also benefits from stronger +high-level features drawn from large-scale pretrained models. + +
+
+
+
+
+ + ☆ Progressive Feature Mining and External Knowledge-Assisted + Text-Pedestrian Image Retrieval + + +
+ Text-Pedestrian Image Retrieval aims to use the text describing pedestrian +appearance to retrieve the corresponding pedestrian image. This task involves +not only modality discrepancy, but also the challenge of the textual diversity +of pedestrians with the same identity. At present, although existing research +progress has been made in text-pedestrian image retrieval, these methods do not +comprehensively consider the above-mentioned problems. Considering these, this +paper proposes a progressive feature mining and external knowledge-assisted +feature purification method. Specifically, we use a progressive mining mode to +enable the model to mine discriminative features from neglected information, +thereby avoiding the loss of discriminative information and improving the +expression ability of features. In addition, to further reduce the negative +impact of modal discrepancy and text diversity on cross-modal matching, we +propose to use other sample knowledge of the same modality, i.e., external +knowledge to enhance identity-consistent features and weaken +identity-inconsistent features. This process purifies features and alleviates +the interference caused by textual diversity and negative sample correlation +features of the same modal. Extensive experiments on three challenging datasets +demonstrate the effectiveness and superiority of the proposed method, and the +retrieval performance even surpasses that of the large-scale model-based method +on large-scale datasets. + +
+
+
+
+
+ + ☆ RankMixup: Ranking-Based Mixup Training for Network Calibration ICCV 2023 + + +
+ Network calibration aims to accurately estimate the level of confidences, +which is particularly important for employing deep neural networks in +real-world systems. Recent approaches leverage mixup to calibrate the network's +predictions during training. However, they do not consider the problem that +mixtures of labels in mixup may not accurately represent the actual +distribution of augmented samples. In this paper, we present RankMixup, a novel +mixup-based framework alleviating the problem of the mixture of labels for +network calibration. To this end, we propose to use an ordinal ranking +relationship between raw and mixup-augmented samples as an alternative +supervisory signal to the label mixtures for network calibration. We +hypothesize that the network should estimate a higher level of confidence for +the raw samples than the augmented ones (Fig.1). To implement this idea, we +introduce a mixup-based ranking loss (MRL) that encourages lower confidences +for augmented samples compared to raw ones, maintaining the ranking +relationship. We also propose to leverage the ranking relationship among +multiple mixup-augmented samples to further improve the calibration capability. +Augmented samples with larger mixing coefficients are expected to have higher +confidences and vice versa (Fig.1). That is, the order of confidences should be +aligned with that of mixing coefficients. To this end, we introduce a novel +loss, M-NDCG, in order to reduce the number of misaligned pairs of the +coefficients and confidences. Extensive experimental results on standard +benchmarks for network calibration demonstrate the effectiveness of RankMixup. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Multi-Modal Multi-Task (3MT) Road Segmentation + + +
+ Multi-modal systems have the capacity of producing more reliable results than +systems with a single modality in road detection due to perceiving different +aspects of the scene. We focus on using raw sensor inputs instead of, as it is +typically done in many SOTA works, leveraging architectures that require high +pre-processing costs such as surface normals or dense depth predictions. By +using raw sensor inputs, we aim to utilize a low-cost model thatminimizes both +the pre-processing andmodel computation costs. This study presents a +cost-effective and highly accurate solution for road segmentation by +integrating data from multiple sensorswithin a multi-task learning +architecture.Afusion architecture is proposed in which RGB and LiDAR depth +images constitute the inputs of the network. Another contribution of this study +is to use IMU/GNSS (inertial measurement unit/global navigation satellite +system) inertial navigation system whose data is collected synchronously and +calibrated with a LiDAR-camera to compute aggregated dense LiDAR depth images. +It has been demonstrated by experiments on the KITTI dataset that the proposed +method offers fast and high-performance solutions. We have also shown the +performance of our method on Cityscapes where raw LiDAR data is not available. +The segmentation results obtained for both full and half resolution images are +competitive with existing methods. Therefore, we conclude that our method is +not dependent only on raw LiDAR data; rather, it can be used with different +sensor modalities. The inference times obtained in all experiments are very +promising for real-time experiments. + +
+
+
+
+
+ + ☆ Rotation-Invariant Completion Network + + +
+ Real-world point clouds usually suffer from incompleteness and display +different poses. While current point cloud completion methods excel in +reproducing complete point clouds with consistent poses as seen in the training +set, their performance tends to be unsatisfactory when handling point clouds +with diverse poses. We propose a network named Rotation-Invariant Completion +Network (RICNet), which consists of two parts: a Dual Pipeline Completion +Network (DPCNet) and an enhancing module. Firstly, DPCNet generates a coarse +complete point cloud. The feature extraction module of DPCNet can extract +consistent features, no matter if the input point cloud has undergone rotation +or translation. Subsequently, the enhancing module refines the fine-grained +details of the final generated point cloud. RICNet achieves better rotation +invariance in feature extraction and incorporates structural relationships in +man-made objects. To assess the performance of RICNet and existing methods on +point clouds with various poses, we applied random transformations to the point +clouds in the MVP dataset and conducted experiments on them. Our experiments +demonstrate that RICNet exhibits superior completion performance compared to +existing methods. + +
+
+ comment: 12 pages, accepted to PRCV 2023 (The 6th Chinese Conference on + Pattern Recognition and Computer Vision) +
+
+
+
+
+ + ☆ Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields ICCV 2023 + + +
+ Text-driven localized editing of 3D objects is particularly difficult as +locally mixing the original 3D object with the intended new object and style +effects without distorting the object's form is not a straightforward process. +To address this issue, we propose a novel NeRF-based model, Blending-NeRF, +which consists of two NeRF networks: pretrained NeRF and editable NeRF. +Additionally, we introduce new blending operations that allow Blending-NeRF to +properly edit target regions which are localized by text. By using a pretrained +vision-language aligned model, CLIP, we guide Blending-NeRF to add new objects +with varying colors and densities, modify textures, and remove parts of the +original object. Our extensive experiments demonstrate that Blending-NeRF +produces naturally and locally edited 3D objects from various text prompts. + +
+
+ comment: Accepted to ICCV 2023. The first two authors contributed equally to + this work +
+
+
+
+
+ + ☆ EVE: Efficient Vision-Language Pre-training with Masked Prediction and + Modality-Aware MoE + + +
+ Building scalable vision-language models to learn from diverse, multimodal +data remains an open challenge. In this paper, we introduce an Efficient +Vision-languagE foundation model, namely EVE, which is one unified multimodal +Transformer pre-trained solely by one unified pre-training task. Specifically, +EVE encodes both vision and language within a shared Transformer network +integrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which +capture modality-specific information by selectively switching to different +experts. To unify pre-training tasks of vision and language, EVE performs +masked signal modeling on image-text pairs to reconstruct masked signals, i.e., +image pixels and text tokens, given visible signals. This simple yet effective +pre-training objective accelerates training by 3.5x compared to the model +pre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing +to the combination of the unified architecture and pre-training task, EVE is +easy to scale up, enabling better downstream performance with fewer resources +and faster training speed. Despite its simplicity, EVE achieves +state-of-the-art performance on various vision-language downstream tasks, +including visual question answering, visual reasoning, and image-text +retrieval. + +
+
+
+
+
+ + ☆ Anisotropic Hybrid Networks for liver tumor segmentation with + uncertainty quantification MICCAI + + +
+ The burden of liver tumors is important, ranking as the fourth leading cause +of cancer mortality. In case of hepatocellular carcinoma (HCC), the delineation +of liver and tumor on contrast-enhanced magnetic resonance imaging (CE-MRI) is +performed to guide the treatment strategy. As this task is time-consuming, +needs high expertise and could be subject to inter-observer variability there +is a strong need for automatic tools. However, challenges arise from the lack +of available training data, as well as the high variability in terms of image +resolution and MRI sequence. In this work we propose to compare two different +pipelines based on anisotropic models to obtain the segmentation of the liver +and tumors. The first pipeline corresponds to a baseline multi-class model that +performs the simultaneous segmentation of the liver and tumor classes. In the +second approach, we train two distinct binary models, one segmenting the liver +only and the other the tumors. Our results show that both pipelines exhibit +different strengths and weaknesses. Moreover we propose an uncertainty +quantification strategy allowing the identification of potential false positive +tumor lesions. Both solutions were submitted to the MICCAI 2023 Atlas challenge +regarding liver and tumor segmentation. + +
+
+ comment: Accepted for presentation at MICCAI Workshop on 2nd + Resource-Efficient Medical Image Analysis (REMIA) +
+
+
+
+
+ + ☆ Pose Modulated Avatars from Video + + +
+ It is now possible to reconstruct dynamic human motion and shape from a +sparse set of cameras using Neural Radiance Fields (NeRF) driven by an +underlying skeleton. However, a challenge remains to model the deformation of +cloth and skin in relation to skeleton pose. Unlike existing avatar models that +are learned implicitly or rely on a proxy surface, our approach is motivated by +the observation that different poses necessitate unique frequency assignments. +Neglecting this distinction yields noisy artifacts in smooth areas or blurs +fine-grained texture and shape details in sharp regions. We develop a +two-branch neural network that is adaptive and explicit in the frequency +domain. The first branch is a graph neural network that models correlations +among body parts locally, taking skeleton pose as input. The second branch +combines these correlation features to a set of global frequencies and then +modulates the feature encoding. Our experiments demonstrate that our network +outperforms state-of-the-art methods in terms of preserving details and +generalization capabilities. + +
+
+
+
+
+ + ☆ High-quality Image Dehazing with Diffusion Model + + +
+ Image dehazing is quite challenging in dense-haze scenarios, where quite less +original information remains in the hazy image. Though previous methods have +made marvelous progress, they still suffer from information loss in content and +color in dense-haze scenarios. The recently emerged Denoising Diffusion +Probabilistic Model (DDPM) exhibits strong generation ability, showing +potential for solving this problem. However, DDPM fails to consider the physics +property of dehazing task, limiting its information completion capacity. In +this work, we propose DehazeDDPM: A DDPM-based and physics-aware image dehazing +framework that applies to complex hazy scenarios. Specifically, DehazeDDPM +works in two stages. The former stage physically models the dehazing task with +the Atmospheric Scattering Model (ASM), pulling the distribution closer to the +clear data and endowing DehazeDDPM with fog-aware ability. The latter stage +exploits the strong generation ability of DDPM to compensate for the +haze-induced huge information loss, by working in conjunction with the physical +modelling. Extensive experiments demonstrate that our method attains +state-of-the-art performance on both synthetic and real-world hazy datasets. + +
+
+
+
+
+ + ☆ Efficient Transfer Learning in Diffusion Models via Adversarial Noise + + +
+ Diffusion Probabilistic Models (DPMs) have demonstrated substantial promise +in image generation tasks but heavily rely on the availability of large amounts +of training data. Previous works, like GANs, have tackled the limited data +problem by transferring pre-trained models learned with sufficient data. +However, those methods are hard to be utilized in DPMs since the distinct +differences between DPM-based and GAN-based methods, showing in the unique +iterative denoising process integral and the need for many timesteps with +no-targeted noise in DPMs. In this paper, we propose a novel DPMs-based +transfer learning method, TAN, to address the limited data problem. It includes +two strategies: similarity-guided training, which boosts transfer with a +classifier, and adversarial noise selection which adaptive chooses targeted +noise based on the input image. Extensive experiments in the context of +few-shot image generation tasks demonstrate that our method is not only +efficient but also excels in terms of image quality and diversity when compared +to existing GAN-based and DDPM-based methods. + +
+
+
+
+
+ + ☆ LongDanceDiff: Long-term Dance Generation with Conditional Diffusion + Model + + +
+ Dancing with music is always an essential human art form to express emotion. +Due to the high temporal-spacial complexity, long-term 3D realist dance +generation synchronized with music is challenging. Existing methods suffer from +the freezing problem when generating long-term dances due to error accumulation +and training-inference discrepancy. To address this, we design a conditional +diffusion model, LongDanceDiff, for this sequence-to-sequence long-term dance +generation, addressing the challenges of temporal coherency and spatial +constraint. LongDanceDiff contains a transformer-based diffusion model, where +the input is a concatenation of music, past motions, and noised future motions. +This partial noising strategy leverages the full-attention mechanism and learns +the dependencies among music and past motions. To enhance the diversity of +generated dance motions and mitigate the freezing problem, we introduce a +mutual information minimization objective that regularizes the dependency +between past and future motions. We also address common visual quality issues +in dance generation, such as foot sliding and unsmooth motion, by incorporating +spatial constraints through a Global-Trajectory Modulation (GTM) layer and +motion perceptual losses, thereby improving the smoothness and naturalness of +motion generation. Extensive experiments demonstrate a significant improvement +in our approach over the existing state-of-the-art methods. We plan to release +our codes and models soon. + +
+
+
+
+
+ + ☆ Boosting Diffusion Models with an Adaptive Momentum Sampler + + +
+ Diffusion probabilistic models (DPMs) have been shown to generate +high-quality images without the need for delicate adversarial training. +However, the current sampling process in DPMs is prone to violent shaking. In +this paper, we present a novel reverse sampler for DPMs inspired by the +widely-used Adam optimizer. Our proposed sampler can be readily applied to a +pre-trained diffusion model, utilizing momentum mechanisms and adaptive +updating to smooth the reverse sampling process and ensure stable generation, +resulting in outputs of enhanced quality. By implicitly reusing update +directions from early steps, our proposed sampler achieves a better balance +between high-level semantics and low-level details. Additionally, this sampler +is flexible and can be easily integrated into pre-trained DPMs regardless of +the sampler used during training. Our experimental results on multiple +benchmarks demonstrate that our proposed reverse sampler yields remarkable +improvements over different baselines. We will make the source code available. + +
+
+
+
+
+ + ☆ Learning Bottleneck Transformer for Event Image-Voxel Feature Fusion + based Classification + + +
+ Recognizing target objects using an event-based camera draws more and more +attention in recent years. Existing works usually represent the event streams +into point-cloud, voxel, image, etc, and learn the feature representations +using various deep neural networks. Their final results may be limited by the +following factors: monotonous modal expressions and the design of the network +structure. To address the aforementioned challenges, this paper proposes a +novel dual-stream framework for event representation, extraction, and fusion. +This framework simultaneously models two common representations: event images +and event voxels. By utilizing Transformer and Structured Graph Neural Network +(GNN) architectures, spatial information and three-dimensional stereo +information can be learned separately. Additionally, a bottleneck Transformer +is introduced to facilitate the fusion of the dual-stream information. +Extensive experiments demonstrate that our proposed framework achieves +state-of-the-art performance on two widely used event-based classification +datasets. The source code of this work is available at: +\url{https://github.com/Event-AHU/EFV_event_classification} + +
+
+ comment: Accepted by PRCV-2023 +
+
+
+
+
+ + ☆ Synergistic Multiscale Detail Refinement via Intrinsic Supervision for + Underwater Image Enhancement + + +
+ Visual restoration of underwater scenes is crucial for visual tasks, and +avoiding interference from underwater media has become a prominent concern. In +this work, we present a synergistic multiscale detail refinement via intrinsic +supervision (SMDR-IS) to recover underwater scene details. The low-degradation +stage provides multiscale detail for original stage, which achieves synergistic +multiscale detail refinement through feature propagation via the adaptive +selective intrinsic supervised feature module (ASISF), which achieves +synergistic multiscale detail refinement. ASISF is developed using intrinsic +supervision to precisely control and guide feature transmission in the +multi-degradation stages. ASISF improves the multiscale detail refinement while +reducing interference from irrelevant scene information from the +low-degradation stage. Additionally, within the multi-degradation +encoder-decoder of SMDR-IS, we introduce a bifocal intrinsic-context attention +module (BICA). This module is designed to effectively leverage multi-scale +scene information found in images, using intrinsic supervision principles as +its foundation. BICA facilitates the guidance of higher-resolution spaces by +leveraging lower-resolution spaces, considering the significant dependency of +underwater image restoration on spatial contextual relationships. During the +training process, the network gains advantages from the integration of a +multi-degradation loss function. This function serves as a constraint, enabling +the network to effectively exploit information across various scales. When +compared with state-of-the-art methods, SMDR-IS demonstrates its outstanding +performance. Code will be made publicly available. + +
+
+
+
+
+ + ☆ OFVL-MS: Once for Visual Localization across Multiple Indoor Scenes + + +
+ In this work, we seek to predict camera poses across scenes with a multi-task +learning manner, where we view the localization of each scene as a new task. We +propose OFVL-MS, a unified framework that dispenses with the traditional +practice of training a model for each individual scene and relieves gradient +conflict induced by optimizing multiple scenes collectively, enabling efficient +storage yet precise visual localization for all scenes. Technically, in the +forward pass of OFVL-MS, we design a layer-adaptive sharing policy with a +learnable score for each layer to automatically determine whether the layer is +shared or not. Such sharing policy empowers us to acquire task-shared +parameters for a reduction of storage cost and task-specific parameters for +learning scene-related features to alleviate gradient conflict. In the backward +pass of OFVL-MS, we introduce a gradient normalization algorithm that +homogenizes the gradient magnitude of the task-shared parameters so that all +tasks converge at the same pace. Furthermore, a sparse penalty loss is applied +on the learnable scores to facilitate parameter sharing for all tasks without +performance degradation. We conduct comprehensive experiments on multiple +benchmarks and our new released indoor dataset LIVL, showing that OFVL-MS +families significantly outperform the state-of-the-arts with fewer parameters. +We also verify that OFVL-MS can generalize to a new scene with much few +parameters while gaining superior localization performance. + +
+
+
+
+
+ + ☆ Recovering a Molecule's 3D Dynamics from Liquid-phase Electron + Microscopy Movies + + +
+ The dynamics of biomolecules are crucial for our understanding of their +functioning in living systems. However, current 3D imaging techniques, such as +cryogenic electron microscopy (cryo-EM), require freezing the sample, which +limits the observation of their conformational changes in real time. The +innovative liquid-phase electron microscopy (liquid-phase EM) technique allows +molecules to be placed in the native liquid environment, providing a unique +opportunity to observe their dynamics. In this paper, we propose TEMPOR, a +Temporal Electron MicroscoPy Object Reconstruction algorithm for liquid-phase +EM that leverages an implicit neural representation (INR) and a dynamical +variational auto-encoder (DVAE) to recover time series of molecular structures. +We demonstrate its advantages in recovering different motion dynamics from two +simulated datasets, 7bcq and Cas9. To our knowledge, our work is the first +attempt to directly recover 3D structures of a temporally-varying particle from +liquid-phase EM movies. It provides a promising new approach for studying +molecules' 3D dynamics in structural biology. + +
+
+
+
+
+ + ☆ Concept Bottleneck with Visual Concept Filtering for Explainable Medical + Image Classification MICCAI 2023 + + +
+ Interpretability is a crucial factor in building reliable models for various +medical applications. Concept Bottleneck Models (CBMs) enable interpretable +image classification by utilizing human-understandable concepts as intermediate +targets. Unlike conventional methods that require extensive human labor to +construct the concept set, recent works leveraging Large Language Models (LLMs) +for generating concepts made automatic concept generation possible. However, +those methods do not consider whether a concept is visually relevant or not, +which is an important factor in computing meaningful concept scores. Therefore, +we propose a visual activation score that measures whether the concept contains +visual cues or not, which can be easily computed with unlabeled image data. +Computed visual activation scores are then used to filter out the less visible +concepts, thus resulting in a final concept set with visually meaningful +concepts. Our experimental results show that adopting the proposed visual +activation score for concept filtering consistently boosts performance compared +to the baseline. Moreover, qualitative analyses also validate that visually +relevant concepts are successfully selected with the visual activation score. + +
+
+ comment: Accepted to MedAGI Workshop at MICCAI 2023 (Oral Presentation) +
+
+
+
+
+ + ☆ AMSP-UOD: When Vortex Convolution and Stochastic Perturbation Meet + Underwater Object Detection + + +
+ In this paper, we present a novel Amplitude-Modulated Stochastic Perturbation +and Vortex Convolutional Network, AMSP-UOD, designed for underwater object +detection. AMSP-UOD specifically addresses the impact of non-ideal imaging +factors on detection accuracy in complex underwater environments. To mitigate +the influence of noise on object detection performance, we propose AMSP Vortex +Convolution (AMSP-VConv) to disrupt the noise distribution, enhance feature +extraction capabilities, effectively reduce parameters, and improve network +robustness. We design the Feature Association Decoupling Cross Stage Partial +(FAD-CSP) module, which strengthens the association of long and short-range +features, improving the network performance in complex underwater environments. +Additionally, our sophisticated post-processing method, based on non-maximum +suppression with aspect-ratio similarity thresholds, optimizes detection in +dense scenes, such as waterweed and schools of fish, improving object detection +accuracy. Extensive experiments on the URPC and RUOD datasets demonstrate that +our method outperforms existing state-of-the-art methods in terms of accuracy +and noise immunity. AMSP-UOD proposes an innovative solution with the potential +for real-world applications. Code will be made publicly available. + +
+
+
+
+
+ + ☆ LFS-GAN: Lifelong Few-Shot Image Generation ICCV 2023 + + +
+ We address a challenging lifelong few-shot image generation task for the +first time. In this situation, a generative model learns a sequence of tasks +using only a few samples per task. Consequently, the learned model encounters +both catastrophic forgetting and overfitting problems at a time. Existing +studies on lifelong GANs have proposed modulation-based methods to prevent +catastrophic forgetting. However, they require considerable additional +parameters and cannot generate high-fidelity and diverse images from limited +data. On the other hand, the existing few-shot GANs suffer from severe +catastrophic forgetting when learning multiple tasks. To alleviate these +issues, we propose a framework called Lifelong Few-Shot GAN (LFS-GAN) that can +generate high-quality and diverse images in lifelong few-shot image generation +task. Our proposed framework learns each task using an efficient task-specific +modulator - Learnable Factorized Tensor (LeFT). LeFT is rank-constrained and +has a rich representation ability due to its unique reconstruction technique. +Furthermore, we propose a novel mode seeking loss to improve the diversity of +our model in low-data circumstances. Extensive experiments demonstrate that the +proposed LFS-GAN can generate high-fidelity and diverse images without any +forgetting and mode collapse in various domains, achieving state-of-the-art in +lifelong few-shot image generation task. Surprisingly, we find that our LFS-GAN +even outperforms the existing few-shot GANs in the few-shot image generation +task. The code is available at Github. + +
+
+ comment: 20 pages, 19 figures, 14 tables, ICCV 2023 Poster +
+
+
+
+
+ + ☆ Semantic-Aware Implicit Template Learning via Part Deformation + Consistency ICCV + + +
+ Learning implicit templates as neural fields has recently shown impressive +performance in unsupervised shape correspondence. Despite the success, we +observe current approaches, which solely rely on geometric information, often +learn suboptimal deformation across generic object shapes, which have high +structural variability. In this paper, we highlight the importance of part +deformation consistency and propose a semantic-aware implicit template learning +framework to enable semantically plausible deformation. By leveraging semantic +prior from a self-supervised feature extractor, we suggest local conditioning +with novel semantic-aware deformation code and deformation consistency +regularizations regarding part deformation, global deformation, and global +scaling. Our extensive experiments demonstrate the superiority of the proposed +method over baselines in various tasks: keypoint transfer, part label transfer, +and texture transfer. More interestingly, our framework shows a larger +performance gain under more challenging settings. We also provide qualitative +analyses to validate the effectiveness of semantic-aware deformation. The code +is available at https://github.com/mlvlab/PDC. + +
+
+ comment: ICCV camera-ready version +
+
+
+
+
+ + ☆ ACLS: Adaptive and Conditional Label Smoothing for Network Calibration ICCV 2023 + + +
+ We address the problem of network calibration adjusting miscalibrated +confidences of deep neural networks. Many approaches to network calibration +adopt a regularization-based method that exploits a regularization term to +smooth the miscalibrated confidences. Although these approaches have shown the +effectiveness on calibrating the networks, there is still a lack of +understanding on the underlying principles of regularization in terms of +network calibration. We present in this paper an in-depth analysis of existing +regularization-based methods, providing a better understanding on how they +affect to network calibration. Specifically, we have observed that 1) the +regularization-based methods can be interpreted as variants of label smoothing, +and 2) they do not always behave desirably. Based on the analysis, we introduce +a novel loss function, dubbed ACLS, that unifies the merits of existing +regularization methods, while avoiding the limitations. We show extensive +experimental results for image classification and semantic segmentation on +standard benchmarks, including CIFAR10, Tiny-ImageNet, ImageNet, and PASCAL +VOC, demonstrating the effectiveness of our loss function. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Edge-aware Hard Clustering Graph Pooling for Brain Imaging Data + + +
+ Graph Convolutional Networks (GCNs) can capture non-Euclidean spatial +dependence between different brain regions, and the graph pooling operator in +GCNs is key to enhancing the representation learning capability and acquiring +abnormal brain maps. However, the majority of existing research designs graph +pooling operators only from the perspective of nodes while disregarding the +original edge features, in a way that not only confines graph pooling +application scenarios, but also diminishes its ability to capture critical +substructures. In this study, a clustering graph pooling method that first +supports multidimensional edge features, called Edge-aware hard clustering +graph pooling (EHCPool), is developed. EHCPool proposes the first +'Edge-to-node' score evaluation criterion based on edge features to assess node +feature significance. To more effectively capture the critical subgraphs, a +novel Iteration n-top strategy is further designed to adaptively learn sparse +hard clustering assignments for graphs. Subsequently, an innovative N-E +Aggregation strategy is presented to aggregate node and edge feature +information in each independent subgraph. The proposed model was evaluated on +multi-site brain imaging public datasets and yielded state-of-the-art +performance. We believe this method is the first deep learning tool with the +potential to probe different types of abnormal functional brain networks from +data-driven perspective. + +
+
+
+
+
+ + ☆ Rethinking Data Perturbation and Model Stabilization for Semi-supervised + Medical Image Segmentation + + +
+ Studies on semi-supervised medical image segmentation (SSMIS) have seen fast +progress recently. Due to the limited labelled data, SSMIS methods mainly focus +on effectively leveraging unlabeled data to enhance the segmentation +performance. However, despite their promising performance, current +state-of-the-art methods often prioritize integrating complex techniques and +loss terms rather than addressing the core challenges of semi-supervised +scenarios directly. We argue that the key to SSMIS lies in generating +substantial and appropriate prediction disagreement on unlabeled data. To this +end, we emphasize the crutiality of data perturbation and model stabilization +in semi-supervised segmentation, and propose a simple yet effective approach to +boost SSMIS performance significantly, dubbed DPMS. Specifically, we first +revisit SSMIS from three distinct perspectives: the data, the model, and the +loss, and conduct a comprehensive study of corresponding strategies to examine +their effectiveness. Based on these examinations, we then propose DPMS, which +adopts a plain teacher-student framework with a standard supervised loss and +unsupervised consistency loss. To produce appropriate prediction disagreements, +DPMS perturbs the unlabeled data via strong augmentations to enlarge prediction +disagreements considerably. On the other hand, using EMA teacher when strong +augmentation is applied does not necessarily improve performance. DPMS further +utilizes a forwarding-twice and momentum updating strategies for normalization +statistics to stabilize the training on unlabeled data effectively. Despite its +simplicity, DPMS can obtain new state-of-the-art performance on the public 2D +ACDC and 3D LA datasets across various semi-supervised settings, e.g. obtaining +a remarkable 22.62% improvement against previous SOTA on ACDC with 5% labels. + +
+
+ comment: Code and logs are available at https://github.com/ZhenZHAO/DPMS +
+
+
+
+
+ + ☆ Camera-Driven Representation Learning for Unsupervised Domain Adaptive + Person Re-identification ICCV 2023 + + +
+ We present a novel unsupervised domain adaption method for person +re-identification (reID) that generalizes a model trained on a labeled source +domain to an unlabeled target domain. We introduce a camera-driven curriculum +learning (CaCL) framework that leverages camera labels of person images to +transfer knowledge from source to target domains progressively. To this end, we +divide target domain dataset into multiple subsets based on the camera labels, +and initially train our model with a single subset (i.e., images captured by a +single camera). We then gradually exploit more subsets for training, according +to a curriculum sequence obtained with a camera-driven scheduling rule. The +scheduler considers maximum mean discrepancies (MMD) between each subset and +the source domain dataset, such that the subset closer to the source domain is +exploited earlier within the curriculum. For each curriculum sequence, we +generate pseudo labels of person images in a target domain to train a reID +model in a supervised way. We have observed that the pseudo labels are highly +biased toward cameras, suggesting that person images obtained from the same +camera are likely to have the same pseudo labels, even for different IDs. To +address the camera bias problem, we also introduce a camera-diversity (CD) loss +encouraging person images of the same pseudo label, but captured across various +cameras, to involve more for discriminative feature learning, providing person +representations robust to inter-camera variations. Experimental results on +standard benchmarks, including real-to-real and synthetic-to-real scenarios, +demonstrate the effectiveness of our framework. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ HashReID: Dynamic Network with Binary Codes for Efficient Person + Re-identification WACV 2024 + + +
+ Biometric applications, such as person re-identification (ReID), are often +deployed on energy constrained devices. While recent ReID methods prioritize +high retrieval performance, they often come with large computational costs and +high search time, rendering them less practical in real-world settings. In this +work, we propose an input-adaptive network with multiple exit blocks, that can +terminate computation early if the retrieval is straightforward or noisy, +saving a lot of computation. To assess the complexity of the input, we +introduce a temporal-based classifier driven by a new training strategy. +Furthermore, we adopt a binary hash code generation approach instead of relying +on continuous-valued features, which significantly improves the search process +by a factor of 20. To ensure similarity preservation, we utilize a new ranking +regularizer that bridges the gap between continuous and binary features. +Extensive analysis of our proposed method is conducted on three datasets: +Market1501, MSMT17 (Multi-Scene Multi-Time), and the BGC1 (BRIAR Government +Collection). Using our approach, more than 70% of the samples with compact hash +codes exit early on the Market1501 dataset, saving 80% of the networks +computational cost and improving over other hash-based methods by 60%. These +results demonstrate a significant improvement over dynamic networks and +showcase comparable accuracy performance to conventional ReID methods. Code +will be made available. + +
+
+ comment: WACV 2024 +
+
+
+
+
+ + ☆ Exploring the Optimization Objective of One-Class Classification for + Anomaly Detection + + +
+ One-class classification (OCC) is a longstanding method for anomaly +detection. With the powerful representation capability of the pre-trained +backbone, OCC methods have witnessed significant performance improvements. +Typically, most of these OCC methods employ transfer learning to enhance the +discriminative nature of the pre-trained backbone's features, thus achieving +remarkable efficacy. While most current approaches emphasize feature transfer +strategies, we argue that the optimization objective space within OCC methods +could also be an underlying critical factor influencing performance. In this +work, we conducted a thorough investigation into the optimization objective of +OCC. Through rigorous theoretical analysis and derivation, we unveil a key +insights: any space with the suitable norm can serve as an equivalent +substitute for the hypersphere center, without relying on the distribution +assumption of training samples. Further, we provide guidelines for determining +the feasible domain of norms for the OCC optimization objective. This novel +insight sparks a simple and data-agnostic deep one-class classification method. +Our method is straightforward, with a single 1x1 convolutional layer as a +trainable projector and any space with suitable norm as the optimization +objective. Extensive experiments validate the reliability and efficacy of our +findings and the corresponding methodology, resulting in state-of-the-art +performance in both one-class classification and industrial vision anomaly +detection and segmentation tasks. + +
+
+ comment: 15 paegs, 10 figures +
+
+
+
+
+ + ☆ Age Prediction From Face Images Via Contrastive Learning + + +
+ This paper presents a novel approach for accurately estimating age from face +images, which overcomes the challenge of collecting a large dataset of +individuals with the same identity at different ages. Instead, we leverage +readily available face datasets of different people at different ages and aim +to extract age-related features using contrastive learning. Our method +emphasizes these relevant features while suppressing identity-related features +using a combination of cosine similarity and triplet margin losses. We +demonstrate the effectiveness of our proposed approach by achieving +state-of-the-art performance on two public datasets, FG-NET and MORPH-II. + +
+
+ comment: MVA2023 +
+
+
+
+
+ + ☆ Does Physical Adversarial Example Really Matter to Autonomous Driving? + Towards System-Level Effect of Adversarial Object Evasion Attack ICCV 2023 + + +
+ In autonomous driving (AD), accurate perception is indispensable to achieving +safe and secure driving. Due to its safety-criticality, the security of AD +perception has been widely studied. Among different attacks on AD perception, +the physical adversarial object evasion attacks are especially severe. However, +we find that all existing literature only evaluates their attack effect at the +targeted AI component level but not at the system level, i.e., with the entire +system semantics and context such as the full AD pipeline. Thereby, this raises +a critical research question: can these existing researches effectively achieve +system-level attack effects (e.g., traffic rule violations) in the real-world +AD context? In this work, we conduct the first measurement study on whether and +how effectively the existing designs can lead to system-level effects, +especially for the STOP sign-evasion attacks due to their popularity and +severity. Our evaluation results show that all the representative prior works +cannot achieve any system-level effects. We observe two design limitations in +the prior works: 1) physical model-inconsistent object size distribution in +pixel sampling and 2) lack of vehicle plant model and AD system model +consideration. Then, we propose SysAdv, a novel system-driven attack design in +the AD context and our evaluation results show that the system-level effects +can be significantly improved, i.e., the violation rate increases by around +70%. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ A Unified Framework for 3D Point Cloud Visual Grounding + + +
+ 3D point cloud visual grounding plays a critical role in 3D scene +comprehension, encompassing 3D referring expression comprehension (3DREC) and +segmentation (3DRES). We argue that 3DREC and 3DRES should be unified in one +framework, which is also a natural progression in the community. To explain, +3DREC can help 3DRES locate the referent, while 3DRES can also facilitate 3DREC +via more finegrained language-visual alignment. To achieve this, this paper +takes the initiative step to integrate 3DREC and 3DRES into a unified +framework, termed 3D Referring Transformer (3DRefTR). Its key idea is to build +upon a mature 3DREC model and leverage ready query embeddings and visual tokens +from the 3DREC model to construct a dedicated mask branch. Specially, we +propose Superpoint Mask Branch, which serves a dual purpose: i) By leveraging +the heterogeneous CPU-GPU parallelism, while the GPU is occupied generating +visual tokens, the CPU concurrently produces superpoints, equivalently +accomplishing the upsampling computation; ii) By harnessing on the inherent +association between the superpoints and point cloud, it eliminates the heavy +computational overhead on the high-resolution visual features for upsampling. +This elegant design enables 3DRefTR to achieve both well-performing 3DRES and +3DREC capacities with only a 6% additional latency compared to the original +3DREC model. Empirical evaluations affirm the superiority of 3DRefTR. +Specifically, on the ScanRefer dataset, 3DRefTR surpasses the state-of-the-art +3DRES method by 12.43% in mIoU and improves upon the SOTA 3DREC method by 0.6% +Acc@0.25IoU. + +
+
+
+
+
+ + ☆ SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal + Targets ICCV 2023 + + +
+ Scene understanding using multi-modal data is necessary in many applications, +e.g., autonomous navigation. To achieve this in a variety of situations, +existing models must be able to adapt to shifting data distributions without +arduous data annotation. Current approaches assume that the source data is +available during adaptation and that the source consists of paired multi-modal +data. Both these assumptions may be problematic for many applications. Source +data may not be available due to privacy, security, or economic concerns. +Assuming the existence of paired multi-modal data for training also entails +significant data collection costs and fails to take advantage of widely +available freely distributed pre-trained uni-modal models. In this work, we +relax both of these assumptions by addressing the problem of adapting a set of +models trained independently on uni-modal data to a target domain consisting of +unlabeled multi-modal data, without having access to the original source +dataset. Our proposed approach solves this problem through a switching +framework which automatically chooses between two complementary methods of +cross-modal pseudo-label fusion -- agreement filtering and entropy weighting -- +based on the estimated domain gap. We demonstrate our work on the semantic +segmentation problem. Experiments across seven challenging adaptation scenarios +verify the efficacy of our approach, achieving results comparable to, and in +some cases outperforming, methods which assume access to source data. Our +method achieves an improvement in mIoU of up to 12% over competing baselines. +Our code is publicly available at https://github.com/csimo005/SUMMIT. + +
+
+ comment: 12 pages, 5 figures, 9 tables, ICCV 2023 +
+
+
+
+
+ + ☆ Integrated Image and Location Analysis for Wound Classification: A Deep + Learning Approach + + +
+ The global burden of acute and chronic wounds presents a compelling case for +enhancing wound classification methods, a vital step in diagnosing and +determining optimal treatments. Recognizing this need, we introduce an +innovative multi-modal network based on a deep convolutional neural network for +categorizing wounds into four categories: diabetic, pressure, surgical, and +venous ulcers. Our multi-modal network uses wound images and their +corresponding body locations for more precise classification. A unique aspect +of our methodology is incorporating a body map system that facilitates accurate +wound location tagging, improving upon traditional wound image classification +techniques. A distinctive feature of our approach is the integration of models +such as VGG16, ResNet152, and EfficientNet within a novel architecture. This +architecture includes elements like spatial and channel-wise +Squeeze-and-Excitation modules, Axial Attention, and an Adaptive Gated +Multi-Layer Perceptron, providing a robust foundation for classification. Our +multi-modal network was trained and evaluated on two distinct datasets +comprising relevant images and corresponding location information. Notably, our +proposed network outperformed traditional methods, reaching an accuracy range +of 74.79% to 100% for Region of Interest (ROI) without location +classifications, 73.98% to 100% for ROI with location classifications, and +78.10% to 100% for whole image classifications. This marks a significant +enhancement over previously reported performance metrics in the literature. Our +results indicate the potential of our multi-modal network as an effective +decision-support tool for wound image classification, paving the way for its +application in various clinical contexts. + +
+
+
+
+
+ + ☆ Motion-to-Matching: A Mixed Paradigm for 3D Single Object Tracking + + +
+ 3D single object tracking with LiDAR points is an important task in the +computer vision field. Previous methods usually adopt the matching-based or +motion-centric paradigms to estimate the current target status. However, the +former is sensitive to the similar distractors and the sparseness of point +cloud due to relying on appearance matching, while the latter usually focuses +on short-term motion clues (eg. two frames) and ignores the long-term motion +pattern of target. To address these issues, we propose a mixed paradigm with +two stages, named MTM-Tracker, which combines motion modeling with feature +matching into a single network. Specifically, in the first stage, we exploit +the continuous historical boxes as motion prior and propose an encoder-decoder +structure to locate target coarsely. Then, in the second stage, we introduce a +feature interaction module to extract motion-aware features from consecutive +point clouds and match them to refine target movement as well as regress other +target states. Extensive experiments validate that our paradigm achieves +competitive performance on large-scale datasets (70.9% in KITTI and 51.70% in +NuScenes). The code will be open soon at +https://github.com/LeoZhiheng/MTM-Tracker.git. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Semi-Supervised Learning via Weight-aware Distillation under Class + Distribution Mismatch ICCV 2023 + + +
+ Semi-Supervised Learning (SSL) under class distribution mismatch aims to +tackle a challenging problem wherein unlabeled data contain lots of unknown +categories unseen in the labeled ones. In such mismatch scenarios, traditional +SSL suffers severe performance damage due to the harmful invasion of the +instances with unknown categories into the target classifier. In this study, by +strict mathematical reasoning, we reveal that the SSL error under class +distribution mismatch is composed of pseudo-labeling error and invasion error, +both of which jointly bound the SSL population risk. To alleviate the SSL +error, we propose a robust SSL framework called Weight-Aware Distillation (WAD) +that, by weights, selectively transfers knowledge beneficial to the target task +from unsupervised contrastive representation to the target classifier. +Specifically, WAD captures adaptive weights and high-quality pseudo labels to +target instances by exploring point mutual information (PMI) in representation +space to maximize the role of unlabeled data and filter unknown categories. +Theoretically, we prove that WAD has a tight upper bound of population risk +under class distribution mismatch. Experimentally, extensive results +demonstrate that WAD outperforms five state-of-the-art SSL approaches and one +standard baseline on two benchmark datasets, CIFAR10 and CIFAR100, and an +artificial cross-dataset. The code is available at +https://github.com/RUC-DWBI-ML/research/tree/main/WAD-master. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ CoC-GAN: Employing Context Cluster for Unveiling a New Pathway in Image + Generation + + +
+ Image generation tasks are traditionally undertaken using Convolutional +Neural Networks (CNN) or Transformer architectures for feature aggregating and +dispatching. Despite the frequent application of convolution and attention +structures, these structures are not fundamentally required to solve the +problem of instability and the lack of interpretability in image generation. In +this paper, we propose a unique image generation process premised on the +perspective of converting images into a set of point clouds. In other words, we +interpret an image as a set of points. As such, our methodology leverages +simple clustering methods named Context Clustering (CoC) to generate images +from unordered point sets, which defies the convention of using convolution or +attention mechanisms. Hence, we exclusively depend on this clustering +technique, combined with the multi-layer perceptron (MLP) in a generative +model. Furthermore, we implement the integration of a module termed the 'Point +Increaser' for the model. This module is just an MLP tasked with generating +additional points for clustering, which are subsequently integrated within the +paradigm of the Generative Adversarial Network (GAN). We introduce this model +with the novel structure as the Context Clustering Generative Adversarial +Network (CoC-GAN), which offers a distinctive viewpoint in the domain of +feature aggregating and dispatching. Empirical evaluations affirm that our +CoC-GAN, devoid of convolution and attention mechanisms, exhibits outstanding +performance. Its interpretability, endowed by the CoC module, also allows for +visualization in our experiments. The promising results underscore the +feasibility of our method and thus warrant future investigations of applying +Context Clustering to more novel and interpretable image generation. + +
+
+
+
+
+ + ☆ Compressed Models Decompress Race Biases: What Quantized Models Forget + for Fair Face Recognition + + +
+ With the ever-growing complexity of deep learning models for face +recognition, it becomes hard to deploy these systems in real life. Researchers +have two options: 1) use smaller models; 2) compress their current models. +Since the usage of smaller models might lead to concerning biases, compression +gains relevance. However, compressing might be also responsible for an increase +in the bias of the final model. We investigate the overall performance, the +performance on each ethnicity subgroup and the racial bias of a +State-of-the-Art quantization approach when used with synthetic and real data. +This analysis provides a few more details on potential benefits of performing +quantization with synthetic data, for instance, the reduction of biases on the +majority of test scenarios. We tested five distinct architectures and three +different training datasets. The models were evaluated on a fourth dataset +which was collected to infer and compare the performance of face recognition +models on different ethnicity. + +
+
+ comment: Accepted for Oral at BIOSIG 2023 +
+
+
+
+
+ + ☆ Diffuse, Attend, and Segment: Unsupervised Zero-Shot Segmentation using + Stable Diffusion + + +
+ Producing quality segmentation masks for images is a fundamental problem in +computer vision. Recent research has explored large-scale supervised training +to enable zero-shot segmentation on virtually any image style and unsupervised +training to enable segmentation without dense annotations. However, +constructing a model capable of segmenting anything in a zero-shot manner +without any annotations is still challenging. In this paper, we propose to +utilize the self-attention layers in stable diffusion models to achieve this +goal because the pre-trained stable diffusion model has learned inherent +concepts of objects within its attention layers. Specifically, we introduce a +simple yet effective iterative merging process based on measuring KL divergence +among attention maps to merge them into valid segmentation masks. The proposed +method does not require any training or language dependency to extract quality +segmentation for any images. On COCO-Stuff-27, our method surpasses the prior +unsupervised zero-shot SOTA method by an absolute 26% in pixel accuracy and 17% +in mean IoU. + +
+
+
+
+
+ + ☆ InverseSR: 3D Brain MRI Super-Resolution Using a Latent Diffusion Model MICCAI 2023 + + +
+ High-resolution (HR) MRI scans obtained from research-grade medical centers +provide precise information about imaged tissues. However, routine clinical MRI +scans are typically in low-resolution (LR) and vary greatly in contrast and +spatial resolution due to the adjustments of the scanning parameters to the +local needs of the medical center. End-to-end deep learning methods for MRI +super-resolution (SR) have been proposed, but they require re-training each +time there is a shift in the input distribution. To address this issue, we +propose a novel approach that leverages a state-of-the-art 3D brain generative +model, the latent diffusion model (LDM) trained on UK BioBank, to increase the +resolution of clinical MRI scans. The LDM acts as a generative prior, which has +the ability to capture the prior distribution of 3D T1-weighted brain MRI. +Based on the architecture of the brain LDM, we find that different methods are +suitable for different settings of MRI SR, and thus propose two novel +strategies: 1) for SR with more sparsity, we invert through both the decoder of +the LDM and also through a deterministic Denoising Diffusion Implicit Models +(DDIM), an approach we will call InverseSR(LDM); 2) for SR with less sparsity, +we invert only through the LDM decoder, an approach we will call +InverseSR(Decoder). These two approaches search different latent spaces in the +LDM model to find the optimal latent code to map the given LR MRI into HR. The +training process of the generative model is independent of the MRI +under-sampling process, ensuring the generalization of our method to many MRI +SR problems with different input measurements. We validate our method on over +100 brain T1w MRIs from the IXI dataset. Our method can demonstrate that +powerful priors given by LDM can be used for MRI reconstruction. + +
+
+ comment: Early Accepted to MICCAI 2023 [top 14% of Submissions] +
+
+
+
+
+ + ☆ Overcoming General Knowledge Loss with Selective Parameter Finetuning + + +
+ Foundation models encompass an extensive knowledge base and offer remarkable +transferability. However, this knowledge becomes outdated or insufficient over +time. The challenge lies in updating foundation models to accommodate novel +information while retaining their original ability. In this paper, we present a +novel approach to achieving continual model updates by effecting localized +modifications to a small subset of parameters. Guided by insights gleaned from +prior analyses of foundational models, we first localize a specific layer for +model refinement and then introduce an importance scoring mechanism designed to +update only the most crucial weights. Our method is exhaustively evaluated on +foundational vision-language models, measuring its efficacy in both learning +new information and preserving pre-established knowledge across a diverse +spectrum of continual learning tasks, including Aircraft, Birdsnap CIFAR-100, +CUB, Cars, and GTSRB. The results show that our method improves the existing +continual learning methods by 0.5\% - 10\% on average, and reduces the loss of +pre-trained knowledge from around 5\% to 0.97\%. Comprehensive ablation studies +substantiate our method design, shedding light on the contributions of each +component to controllably learning new knowledge and mitigating the forgetting +of pre-trained knowledge. + +
+
+
+
+
+ + ☆ Augmenting medical image classifiers with synthetic data from latent + diffusion models + + +
+ While hundreds of artificial intelligence (AI) algorithms are now approved or +cleared by the US Food and Drugs Administration (FDA), many studies have shown +inconsistent generalization or latent bias, particularly for underrepresented +populations. Some have proposed that generative AI could reduce the need for +real data, but its utility in model development remains unclear. Skin disease +serves as a useful case study in synthetic image generation due to the +diversity of disease appearance, particularly across the protected attribute of +skin tone. Here we show that latent diffusion models can scalably generate +images of skin disease and that augmenting model training with these data +improves performance in data-limited settings. These performance gains saturate +at synthetic-to-real image ratios above 10:1 and are substantially smaller than +the gains obtained from adding real images. As part of our analysis, we +generate and analyze a new dataset of 458,920 synthetic images produced using +several generation strategies. Our results suggest that synthetic data could +serve as a force-multiplier for model development, but the collection of +diverse real-world data remains the most important step to improve medical AI +algorithms. + +
+
+
+
+
+ + ☆ ARF-Plus: Controlling Perceptual Factors in Artistic Radiance Fields for + 3D Scene Stylization + + +
+ The radiance fields style transfer is an emerging field that has recently +gained popularity as a means of 3D scene stylization, thanks to the outstanding +performance of neural radiance fields in 3D reconstruction and view synthesis. +We highlight a research gap in radiance fields style transfer, the lack of +sufficient perceptual controllability, motivated by the existing concept in the +2D image style transfer. In this paper, we present ARF-Plus, a 3D neural style +transfer framework offering manageable control over perceptual factors, to +systematically explore the perceptual controllability in 3D scene stylization. +Four distinct types of controls - color preservation control, (style pattern) +scale control, spatial (selective stylization area) control, and depth +enhancement control - are proposed and integrated into this framework. Results +from real-world datasets, both quantitative and qualitative, show that the four +types of controls in our ARF-Plus framework successfully accomplish their +corresponding perceptual controls when stylizing 3D scenes. These techniques +work well for individual style inputs as well as for the simultaneous +application of multiple styles within a scene. This unlocks a realm of +limitless possibilities, allowing customized modifications of stylization +effects and flexible merging of the strengths of different styles, ultimately +enabling the creation of novel and eye-catching stylistic effects on 3D scenes. + +
+
+
+
+
+ + ☆ MOFO: MOtion FOcused Self-Supervision for Video Understanding + + +
+ Self-supervised learning (SSL) techniques have recently produced outstanding +results in learning visual representations from unlabeled videos. Despite the +importance of motion in supervised learning techniques for action recognition, +SSL methods often do not explicitly consider motion information in videos. To +address this issue, we propose MOFO (MOtion FOcused), a novel SSL method for +focusing representation learning on the motion area of a video, for action +recognition. MOFO automatically detects motion areas in videos and uses these +to guide the self-supervision task. We use a masked autoencoder which randomly +masks out a high proportion of the input sequence; we force a specified +percentage of the inside of the motion area to be masked and the remainder from +outside. We further incorporate motion information into the finetuning step to +emphasise motion in the downstream task. We demonstrate that our motion-focused +innovations can significantly boost the performance of the currently leading +SSL method (VideoMAE) for action recognition. Our method improves the recent +self-supervised Vision Transformer (ViT), VideoMAE, by achieving +2.6%, +2.1%, ++1.3% accuracy on Epic-Kitchens verb, noun and action classification, +respectively, and +4.7% accuracy on Something-Something V2 action +classification. Our proposed approach significantly improves the performance of +the current SSL method for action recognition, indicating the importance of +explicitly encoding motion in SSL. + +
+
+
+
+
+ + ☆ TAI-GAN: Temporally and Anatomically Informed GAN for early-to-late + frame conversion in dynamic cardiac PET motion correction MICCAI + + +
+ The rapid tracer kinetics of rubidium-82 ($^{82}$Rb) and high variation of +cross-frame distribution in dynamic cardiac positron emission tomography (PET) +raise significant challenges for inter-frame motion correction, particularly +for the early frames where conventional intensity-based image registration +techniques are not applicable. Alternatively, a promising approach utilizes +generative methods to handle the tracer distribution changes to assist existing +registration methods. To improve frame-wise registration and parametric +quantification, we propose a Temporally and Anatomically Informed Generative +Adversarial Network (TAI-GAN) to transform the early frames into the late +reference frame using an all-to-one mapping. Specifically, a feature-wise +linear modulation layer encodes channel-wise parameters generated from temporal +tracer kinetics information, and rough cardiac segmentations with local shifts +serve as the anatomical information. We validated our proposed method on a +clinical $^{82}$Rb PET dataset and found that our TAI-GAN can produce converted +early frames with high image quality, comparable to the real reference frames. +After TAI-GAN conversion, motion estimation accuracy and clinical myocardial +blood flow (MBF) quantification were improved compared to using the original +frames. Our code is published at https://github.com/gxq1998/TAI-GAN. + +
+
+ comment: Accepted by Simulation and Synthesis in Medical Imaging (SASHIMI + 2023, MICCAI workshop), preprint version +
+
+
+
+
+ + ☆ HNAS-reg: hierarchical neural architecture search for deformable medical + image registration + + +
+ Convolutional neural networks (CNNs) have been widely used to build deep +learning models for medical image registration, but manually designed network +architectures are not necessarily optimal. This paper presents a hierarchical +NAS framework (HNAS-Reg), consisting of both convolutional operation search and +network topology search, to identify the optimal network architecture for +deformable medical image registration. To mitigate the computational overhead +and memory constraints, a partial channel strategy is utilized without losing +optimization quality. Experiments on three datasets, consisting of 636 +T1-weighted magnetic resonance images (MRIs), have demonstrated that the +proposal method can build a deep learning model with improved image +registration accuracy and reduced model size, compared with state-of-the-art +image registration approaches, including one representative traditional +approach and two unsupervised learning-based approaches. + +
+
+
+
+
+ + ☆ BaDExpert: Extracting Backdoor Functionality for Accurate Backdoor Input + Detection + + +
+ We present a novel defense, against backdoor attacks on Deep Neural Networks +(DNNs), wherein adversaries covertly implant malicious behaviors (backdoors) +into DNNs. Our defense falls within the category of post-development defenses +that operate independently of how the model was generated. The proposed defense +is built upon a novel reverse engineering approach that can directly extract +backdoor functionality of a given backdoored model to a backdoor expert model. +The approach is straightforward -- finetuning the backdoored model over a small +set of intentionally mislabeled clean samples, such that it unlearns the normal +functionality while still preserving the backdoor functionality, and thus +resulting in a model (dubbed a backdoor expert model) that can only recognize +backdoor inputs. Based on the extracted backdoor expert model, we show the +feasibility of devising highly accurate backdoor input detectors that filter +out the backdoor inputs during model inference. Further augmented by an +ensemble strategy with a finetuned auxiliary model, our defense, BaDExpert +(Backdoor Input Detection with Backdoor Expert), effectively mitigates 16 SOTA +backdoor attacks while minimally impacting clean utility. The effectiveness of +BaDExpert has been verified on multiple datasets (CIFAR10, GTSRB and ImageNet) +across various model architectures (ResNet, VGG, MobileNetV2 and Vision +Transformer). + +
+
+
+
+
+ + ☆ Characterising representation dynamics in recurrent neural networks for + object recognition + + +
+ Recurrent neural networks (RNNs) have yielded promising results for both +recognizing objects in challenging conditions and modeling aspects of primate +vision. However, the representational dynamics of recurrent computations remain +poorly understood, especially in large-scale visual models. Here, we studied +such dynamics in RNNs trained for object classification on MiniEcoset, a novel +subset of ecoset. We report two main insights. First, upon inference, +representations continued to evolve after correct classification, suggesting a +lack of the notion of being ``done with classification''. Second, focusing on +``readout zones'' as a way to characterize the activation trajectories, we +observe that misclassified representations exhibit activation patterns with +lower L2 norm, and are positioned more peripherally in the readout zones. Such +arrangements help the misclassified representations move into the correct zones +as time progresses. Our findings generalize to networks with lateral and +top-down connections, and include both additive and multiplicative interactions +with the bottom-up sweep. The results therefore contribute to a general +understanding of RNN dynamics in naturalistic tasks. We hope that the analysis +framework will aid future investigations of other types of RNNs, including +understanding of representational dynamics in primate vision. + +
+
+ comment: 8 pages, 6 figures; revision of our Conference on Cognitive + Computational Neuroscience (CCN) 2023 paper +
+
+
+
+
+ + ☆ A Spatiotemporal Correspondence Approach to Unsupervised LiDAR + Segmentation with Traffic Applications SC 2023 + + +
+ We address the problem of unsupervised semantic segmentation of outdoor LiDAR +point clouds in diverse traffic scenarios. The key idea is to leverage the +spatiotemporal nature of a dynamic point cloud sequence and introduce +drastically stronger augmentation by establishing spatiotemporal +correspondences across multiple frames. We dovetail clustering and pseudo-label +learning in this work. Essentially, we alternate between clustering points into +semantic groups and optimizing models using point-wise pseudo-spatiotemporal +labels with a simple learning objective. Therefore, our method can learn +discriminative features in an unsupervised learning fashion. We show promising +segmentation performance on Semantic-KITTI, SemanticPOSS, and FLORIDA benchmark +datasets covering scenarios in autonomous vehicle and intersection +infrastructure, which is competitive when compared against many existing fully +supervised learning methods. This general framework can lead to a unified +representation learning approach for LiDAR point clouds incorporating domain +knowledge. + +
+
+ comment: Accepted for publication in IEEE International Conference on + Intelligent Transportation Systems (ITSC 2023) +
+
+
+
+
+ + ☆ Toward American Sign Language Processing in the Real World: Data, Tasks, + and Methods + + +
+ Sign language, which conveys meaning through gestures, is the chief means of +communication among deaf people. Recognizing sign language in natural settings +presents significant challenges due to factors such as lighting, background +clutter, and variations in signer characteristics. In this thesis, I study +automatic sign language processing in the wild, using signing videos collected +from the Internet. This thesis contributes new datasets, tasks, and methods. +Most chapters of this thesis address tasks related to fingerspelling, an +important component of sign language and yet has not been studied widely by +prior work. I present three new large-scale ASL datasets in the wild: +ChicagoFSWild, ChicagoFSWild+, and OpenASL. Using ChicagoFSWild and +ChicagoFSWild+, I address fingerspelling recognition, which consists of +transcribing fingerspelling sequences into text. I propose an end-to-end +approach based on iterative attention that allows recognition from a raw video +without explicit hand detection. I further show that using a Conformer-based +network jointly modeling handshape and mouthing can bring performance close to +that of humans. Next, I propose two tasks for building real-world +fingerspelling-based applications: fingerspelling detection and search. For +fingerspelling detection, I introduce a suite of evaluation metrics and a new +detection model via multi-task training. To address the problem of searching +for fingerspelled keywords in raw sign language videos, we propose a novel +method that jointly localizes and matches fingerspelling segments to text. +Finally, I will describe a benchmark for large-vocabulary open-domain sign +language translation based on OpenASL. To address the challenges of sign +language translation in realistic settings, we propose a set of techniques +including sign search as a pretext task for pre-training and fusion of mouthing +and handshape features. + +
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ Reframing the Brain Age Prediction Problem to a More Interpretable and + Quantitative Approach + + +
+ Deep learning models have achieved state-of-the-art results in estimating +brain age, which is an important brain health biomarker, from magnetic +resonance (MR) images. However, most of these models only provide a global age +prediction, and rely on techniques, such as saliency maps to interpret their +results. These saliency maps highlight regions in the input image that were +significant for the model's predictions, but they are hard to be interpreted, +and saliency map values are not directly comparable across different samples. +In this work, we reframe the age prediction problem from MR images to an +image-to-image regression problem where we estimate the brain age for each +brain voxel in MR images. We compare voxel-wise age prediction models against +global age prediction models and their corresponding saliency maps. The results +indicate that voxel-wise age prediction models are more interpretable, since +they provide spatial information about the brain aging process, and they +benefit from being quantitative. + +
+
+
+
+
+ + ♻ ☆ Randomized Quantization: A Generic Augmentation for Data Agnostic + Self-supervised Learning ICCV 2023 + + +
+ Self-supervised representation learning follows a paradigm of withholding +some part of the data and tasking the network to predict it from the remaining +part. Among many techniques, data augmentation lies at the core for creating +the information gap. Towards this end, masking has emerged as a generic and +powerful tool where content is withheld along the sequential dimension, e.g., +spatial in images, temporal in audio, and syntactic in language. In this paper, +we explore the orthogonal channel dimension for generic data augmentation by +exploiting precision redundancy. The data for each channel is quantized through +a non-uniform quantizer, with the quantized value sampled randomly within +randomly sampled quantization bins. From another perspective, quantization is +analogous to channel-wise masking, as it removes the information within each +bin, but preserves the information across bins. Our approach significantly +surpasses existing generic data augmentation methods, while showing on par +performance against modality-specific augmentations. We comprehensively +evaluate our approach on vision, audio, 3D point clouds, as well as the DABS +benchmark which is comprised of various data modalities. The code is available +at https: //github.com/microsoft/random_quantize. + +
+
+ comment: Accepted by ICCV 2023. The code is available at https: + //github.com/microsoft/random_quantize +
+
+
+
+
+ + ♻ ☆ Back to Optimization: Diffusion-based Zero-Shot 3D Human Pose Estimation + + +
+ Learning-based methods have dominated the 3D human pose estimation (HPE) +tasks with significantly better performance in most benchmarks than traditional +optimization-based methods. Nonetheless, 3D HPE in the wild is still the +biggest challenge of learning-based models, whether with 2D-3D lifting, +image-to-3D, or diffusion-based methods, since the trained networks implicitly +learn camera intrinsic parameters and domain-based 3D human pose distributions +and estimate poses by statistical average. On the other hand, the +optimization-based methods estimate results case-by-case, which can predict +more diverse and sophisticated human poses in the wild. By combining the +advantages of optimization-based and learning-based methods, we propose the +Zero-shot Diffusion-based Optimization (ZeDO) pipeline for 3D HPE to solve the +problem of cross-domain and in-the-wild 3D HPE. Our multi-hypothesis ZeDO +achieves state-of-the-art (SOTA) performance on Human3.6M as minMPJPE $51.4$mm +without training with any 2D-3D or image-3D pairs. Moreover, our +single-hypothesis ZeDO achieves SOTA performance on 3DPW dataset with PA-MPJPE +$42.6$mm on cross-dataset evaluation, which even outperforms learning-based +methods trained on 3DPW. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Selective Labeling for More Effective Semi-Supervised + Learning ECCV 2022 + + +
+ Given an unlabeled dataset and an annotation budget, we study how to +selectively label a fixed number of instances so that semi-supervised learning +(SSL) on such a partially labeled dataset is most effective. We focus on +selecting the right data to label, in addition to usual SSL's propagating +labels from labeled data to the rest unlabeled data. This instance selection +task is challenging, as without any labeled data we do not know what the +objective of learning should be. Intuitively, no matter what the downstream +task is, instances to be labeled must be representative and diverse: The former +would facilitate label propagation to unlabeled data, whereas the latter would +ensure coverage of the entire dataset. We capture this idea by selecting +cluster prototypes, either in a pretrained feature space, or along with feature +optimization, both without labels. Our unsupervised selective labeling +consistently improves SSL methods over state-of-the-art active learning given +labeled data, by 8 to 25 times in label efficiency. For example, it boosts +FixMatch by 10% (14%) in accuracy on CIFAR-10 (ImageNet-1K) with 0.08% (0.2%) +labeled data, demonstrating that small computation spent on selecting what data +to label brings significant gain especially under a low annotation budget. Our +work sets a new standard for practical and efficient SSL. + +
+
+ comment: Accepted by ECCV 2022; Fixed a few typos +
+
+
+
+
+ + ♻ ☆ Methods and datasets for segmentation of minimally invasive surgical + instruments in endoscopic images and videos: A review of the state of the art + + +
+ In the field of computer- and robot-assisted minimally invasive surgery, +enormous progress has been made in recent years based on the recognition of +surgical instruments in endoscopic images and videos. In particular, the +determination of the position and type of instruments is of great interest. +Current work involves both spatial and temporal information, with the idea that +predicting the movement of surgical tools over time may improve the quality of +the final segmentations. The provision of publicly available datasets has +recently encouraged the development of new methods, mainly based on deep +learning. In this review, we identify and characterize datasets used for method +development and evaluation and quantify their frequency of use in the +literature. We further present an overview of the current state of research +regarding the segmentation and tracking of minimally invasive surgical +instruments in endoscopic images and videos. The paper focuses on methods that +work purely visually, without markers of any kind attached to the instruments, +considering both single-frame semantic and instance segmentation approaches, as +well as those that incorporate temporal information. The publications analyzed +were identified through the platforms Google Scholar, Web of Science, and +PubMed. The search terms used were "instrument segmentation", "instrument +tracking", "surgical tool segmentation", and "surgical tool tracking", +resulting in a total of 741 articles published between 01/2015 and 07/2023, of +which 123 were included using systematic selection criteria. A discussion of +the reviewed literature is provided, highlighting existing shortcomings and +emphasizing the available potential for future developments. + +
+
+ comment: 29 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Learning from Semantic Alignment between Unpaired Multiviews for + Egocentric Video Recognition ICCV + + +
+ We are concerned with a challenging scenario in unpaired multiview video +learning. In this case, the model aims to learn comprehensive multiview +representations while the cross-view semantic information exhibits variations. +We propose Semantics-based Unpaired Multiview Learning (SUM-L) to tackle this +unpaired multiview learning problem. The key idea is to build cross-view +pseudo-pairs and do view-invariant alignment by leveraging the semantic +information of videos. To facilitate the data efficiency of multiview learning, +we further perform video-text alignment for first-person and third-person +videos, to fully leverage the semantic knowledge to improve video +representations. Extensive experiments on multiple benchmark datasets verify +the effectiveness of our framework. Our method also outperforms multiple +existing view-alignment methods, under the more challenging scenario than +typical paired or unpaired multimodal or multiview learning. Our code is +available at https://github.com/wqtwjt1996/SUM-L. + +
+
+ comment: Proceedings of IEEE International Conference on Computer Vision + (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ Advancing Volumetric Medical Image Segmentation via Global-Local Masked + Autoencoder + + +
+ Masked autoencoder (MAE) is a promising self-supervised pre-training +technique that can improve the representation learning of a neural network +without human intervention. However, applying MAE directly to volumetric +medical images poses two challenges: (i) a lack of global information that is +crucial for understanding the clinical context of the holistic data, (ii) no +guarantee of stabilizing the representations learned from randomly masked +inputs. To address these limitations, we propose the +\textbf{G}lobal-\textbf{L}ocal \textbf{M}asked \textbf{A}uto\textbf{E}ncoder +(GL-MAE), a simple yet effective self-supervised pre-training strategy. In +addition to reconstructing masked local views, as in previous methods, GL-MAE +incorporates global context learning by reconstructing masked global views. +Furthermore, a complete global view is integrated as an anchor to guide the +reconstruction and stabilize the learning process through global-to-global +consistency learning and global-to-local consistency learning. Finetuning +results on multiple datasets demonstrate the superiority of our method over +other state-of-the-art self-supervised algorithms, highlighting its +effectiveness on versatile volumetric medical image segmentation tasks, even +when annotations are scarce. Our codes and models will be released upon +acceptance. + +
+
+
+
+
+ + ♻ ☆ Label-Efficient Online Continual Object Detection in Streaming Video ICCV 2023 + + +
+ Humans can watch a continuous video stream and effortlessly perform continual +acquisition and transfer of new knowledge with minimal supervision yet +retaining previously learnt experiences. In contrast, existing continual +learning (CL) methods require fully annotated labels to effectively learn from +individual frames in a video stream. Here, we examine a more realistic and +challenging problem$\unicode{x2014}$Label-Efficient Online Continual Object +Detection (LEOCOD) in streaming video. We propose a plug-and-play module, +Efficient-CLS, that can be easily inserted into and improve existing continual +learners for object detection in video streams with reduced data annotation +costs and model retraining time. We show that our method has achieved +significant improvement with minimal forgetting across all supervision levels +on two challenging CL benchmarks for streaming real-world videos. Remarkably, +with only 25% annotated video frames, our method still outperforms the base CL +learners, which are trained with 100% annotations on all video frames. The data +and source code will be publicly available at +https://github.com/showlab/Efficient-CLS. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Non-Exemplar Online Class-incremental Continual Learning via + Dual-prototype Self-augment and Refinement + + +
+ This paper investigates a new, practical, but challenging problem named +Non-exemplar Online Class-incremental continual Learning (NO-CL), which aims to +preserve the discernibility of base classes without buffering data examples and +efficiently learn novel classes continuously in a single-pass (i.e., online) +data stream. The challenges of this task are mainly two-fold: (1) Both base and +novel classes suffer from severe catastrophic forgetting as no previous samples +are available for replay. (2) As the online data can only be observed once, +there is no way to fully re-train the whole model, e.g., re-calibrate the +decision boundaries via prototype alignment or feature distillation. In this +paper, we propose a novel Dual-prototype Self-augment and Refinement method +(DSR) for NO-CL problem, which consists of two strategies: 1) Dual class +prototypes: vanilla and high-dimensional prototypes are exploited to utilize +the pre-trained information and obtain robust quasi-orthogonal representations +rather than example buffers for both privacy preservation and memory reduction. +2) Self-augment and refinement: Instead of updating the whole network, we +optimize high-dimensional prototypes alternatively with the extra projection +module based on self-augment vanilla prototypes, through a bi-level +optimization problem. Extensive experiments demonstrate the effectiveness and +superiority of the proposed DSR in NO-CL. + +
+
+
+
+
+ + ♻ ☆ Radar-Camera Fusion for Object Detection and Semantic Segmentation in + Autonomous Driving: A Comprehensive Review + + +
+ Driven by deep learning techniques, perception technology in autonomous +driving has developed rapidly in recent years, enabling vehicles to accurately +detect and interpret surrounding environment for safe and efficient navigation. +To achieve accurate and robust perception capabilities, autonomous vehicles are +often equipped with multiple sensors, making sensor fusion a crucial part of +the perception system. Among these fused sensors, radars and cameras enable a +complementary and cost-effective perception of the surrounding environment +regardless of lighting and weather conditions. This review aims to provide a +comprehensive guideline for radar-camera fusion, particularly concentrating on +perception tasks related to object detection and semantic segmentation.Based on +the principles of the radar and camera sensors, we delve into the data +processing process and representations, followed by an in-depth analysis and +summary of radar-camera fusion datasets. In the review of methodologies in +radar-camera fusion, we address interrogative questions, including "why to +fuse", "what to fuse", "where to fuse", "when to fuse", and "how to fuse", +subsequently discussing various challenges and potential research directions +within this domain. To ease the retrieval and comparison of datasets and fusion +methods, we also provide an interactive website: +https://radar-camera-fusion.github.io. + +
+
+ comment: Accepted by IEEE Transactions on Intelligent Vehicles (T-IV) +
+
+
+
+
+ + ♻ ☆ Black-box Source-free Domain Adaptation via Two-stage Knowledge + Distillation IJCAI 1 + + +
+ Source-free domain adaptation aims to adapt deep neural networks using only +pre-trained source models and target data. However, accessing the source model +still has a potential concern about leaking the source data, which reveals the +patient's privacy. In this paper, we study the challenging but practical +problem: black-box source-free domain adaptation where only the outputs of the +source model and target data are available. We propose a simple but effective +two-stage knowledge distillation method. In Stage +\uppercase\expandafter{\romannumeral1}, we train the target model from scratch +with soft pseudo-labels generated by the source model in a knowledge +distillation manner. In Stage \uppercase\expandafter{\romannumeral2}, we +initialize another model as the new student model to avoid the error +accumulation caused by noisy pseudo-labels. We feed the images with weak +augmentation to the teacher model to guide the learning of the student model. +Our method is simple and flexible, and achieves surprising results on three +cross-domain segmentation tasks. + +
+
+ comment: The short version is accepted by IJCAI 1st International Workshop on + Generalizing from Limited Resources in the Open World. (This version is long + version) +
+
+
+
+
+ + ♻ ☆ Learning Interpretable Dynamics from Images of a Freely Rotating 3D + Rigid Body + + +
+ In many real-world settings, image observations of freely rotating 3D rigid +bodies, such as satellites, may be available when low-dimensional measurements +are not. However, the high-dimensionality of image data precludes the use of +classical estimation techniques to learn the dynamics and a lack of +interpretability reduces the usefulness of standard deep learning methods. In +this work, we present a physics-informed neural network model to estimate and +predict 3D rotational dynamics from image sequences. We achieve this using a +multi-stage prediction pipeline that maps individual images to a latent +representation homeomorphic to $\mathbf{SO}(3)$, computes angular velocities +from latent pairs, and predicts future latent states using the Hamiltonian +equations of motion with a learned representation of the Hamiltonian. We +demonstrate the efficacy of our approach on a new rotating rigid-body dataset +with sequences of rotating cubes and rectangular prisms with uniform and +non-uniform density. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ On the link between generative semi-supervised learning and generative + open-set recognition + + +
+ This study investigates the relationship between semi-supervised learning +(SSL, which is training off partially labelled datasets) and open-set +recognition (OSR, which is classification with simultaneous novelty detection) +under the context of generative adversarial networks (GANs). Although no +previous study has formally linked SSL and OSR, their respective methods share +striking similarities. Specifically, SSL-GANs and OSR-GANs require their +generators to produce 'bad-looking' samples which are used to regularise their +classifier networks. We hypothesise that the definitions of bad-looking samples +in SSL and OSR represents the same concept and realises the same goal. More +formally, bad-looking samples lie in the complementary space, which is the area +between and around the boundaries of the labelled categories within the +classifier's embedding space. By regularising a classifier with samples in the +complementary space, classifiers achieve improved generalisation for SSL and +also generalise the open space for OSR. To test this hypothesis, we compare a +foundational SSL-GAN with the state-of-the-art OSR-GAN under the same SSL-OSR +experimental conditions. Our results find that SSL-GANs achieve near identical +results to OSR-GANs, proving the SSL-OSR link. Subsequently, to further this +new research path, we compare several SSL-GANs various SSL-OSR setups which +this first benchmark results. A combined framework of SSL-OSR certainly +improves the practicality and cost-efficiency of classifier training, and so +further theoretical and application studies are also discussed. + +
+
+
+
+
+ + ♻ ☆ Backdooring Textual Inversion for Concept Censorship + + +
+ Recent years have witnessed success in AIGC (AI Generated Content). People +can make use of a pre-trained diffusion model to generate images of high +quality or freely modify existing pictures with only prompts in nature +language. More excitingly, the emerging personalization techniques make it +feasible to create specific-desired images with only a few images as +references. However, this induces severe threats if such advanced techniques +are misused by malicious users, such as spreading fake news or defaming +individual reputations. Thus, it is necessary to regulate personalization +models (i.e., concept censorship) for their development and advancement. + In this paper, we focus on the personalization technique dubbed Textual +Inversion (TI), which is becoming prevailing for its lightweight nature and +excellent performance. TI crafts the word embedding that contains detailed +information about a specific object. Users can easily download the word +embedding from public websites like Civitai and add it to their own stable +diffusion model without fine-tuning for personalization. To achieve the concept +censorship of a TI model, we propose leveraging the backdoor technique for good +by injecting backdoors into the Textual Inversion embeddings. Briefly, we +select some sensitive words as triggers during the training of TI, which will +be censored for normal use. In the subsequent generation stage, if the triggers +are combined with personalized embeddings as final prompts, the model will +output a pre-defined target image rather than images including the desired +malicious concept. + To demonstrate the effectiveness of our approach, we conduct extensive +experiments on Stable Diffusion, a prevailing open-sourced text-to-image model. +Our code, data, and results are available at +https://concept-censorship.github.io. + +
+
+
+
+
+ + ♻ ☆ Knowledge-Aware Federated Active Learning with Non-IID Data ICCV23 + + +
+ Federated learning enables multiple decentralized clients to learn +collaboratively without sharing the local training data. However, the expensive +annotation cost to acquire data labels on local clients remains an obstacle in +utilizing local data. In this paper, we propose a federated active learning +paradigm to efficiently learn a global model with limited annotation budget +while protecting data privacy in a decentralized learning way. The main +challenge faced by federated active learning is the mismatch between the active +sampling goal of the global model on the server and that of the asynchronous +local clients. This becomes even more significant when data is distributed +non-IID across local clients. To address the aforementioned challenge, we +propose Knowledge-Aware Federated Active Learning (KAFAL), which consists of +Knowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory +Federated Update (KCFU). KSAS is a novel active sampling method tailored for +the federated active learning problem. It deals with the mismatch challenge by +sampling actively based on the discrepancies between local and global models. +KSAS intensifies specialized knowledge in local clients, ensuring the sampled +data to be informative for both the local clients and the global model. KCFU, +in the meantime, deals with the client heterogeneity caused by limited data and +non-IID data distributions. It compensates for each client's ability in weak +classes by the assistance of the global model. Extensive experiments and +analyses are conducted to show the superiority of KSAS over the +state-of-the-art active learning methods and the efficiency of KCFU under the +federated active learning framework. + +
+
+ comment: 14 pages, 12 figures, ICCV23 +
+
+
+
+
+ + ♻ ☆ Zero-Shot In-Distribution Detection in Multi-Object Settings Using + Vision-Language Foundation Models + + +
+ Extracting in-distribution (ID) images from noisy images scraped from the +Internet is an important preprocessing for constructing datasets, which has +traditionally been done manually. Automating this preprocessing with deep +learning techniques presents two key challenges. First, images should be +collected using only the name of the ID class without training on the ID data. +Second, as we can see why COCO was created, it is crucial to identify images +containing not only ID objects but also both ID and out-of-distribution (OOD) +objects as ID images to create robust recognizers. In this paper, we propose a +novel problem setting called zero-shot in-distribution (ID) detection, where we +identify images containing ID objects as ID images (even if they contain OOD +objects), and images lacking ID objects as OOD images without any training. To +solve this problem, we leverage the powerful zero-shot capability of CLIP and +present a simple and effective approach, Global-Local Maximum Concept Matching +(GL-MCM), based on both global and local visual-text alignments of CLIP +features. Extensive experiments demonstrate that GL-MCM outperforms comparison +methods on both multi-object datasets and single-object ImageNet benchmarks. +The code will be available via https://github.com/AtsuMiyai/GL-MCM. + +
+
+ comment: v3: I fixed some typos from v2 +
+
+
+
+
+ + ♻ ☆ Multimodal Garment Designer: Human-Centric Latent Diffusion Models for + Fashion Image Editing ICCV 2023 + + +
+ Fashion illustration is used by designers to communicate their vision and to +bring the design idea from conceptualization to realization, showing how +clothes interact with the human body. In this context, computer vision can thus +be used to improve the fashion design process. Differently from previous works +that mainly focused on the virtual try-on of garments, we propose the task of +multimodal-conditioned fashion image editing, guiding the generation of +human-centric fashion images by following multimodal prompts, such as text, +human body poses, and garment sketches. We tackle this problem by proposing a +new architecture based on latent diffusion models, an approach that has not +been used before in the fashion domain. Given the lack of existing datasets +suitable for the task, we also extend two existing fashion datasets, namely +Dress Code and VITON-HD, with multimodal annotations collected in a +semi-automatic manner. Experimental results on these new datasets demonstrate +the effectiveness of our proposal, both in terms of realism and coherence with +the given multimodal inputs. Source code and collected multimodal annotations +are publicly available at: +https://github.com/aimagelab/multimodal-garment-designer. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Learnable Differencing Center for Nighttime Depth Perception + + +
+ Depth completion is the task of recovering dense depth maps from sparse ones, +usually with the help of color images. Existing image-guided methods perform +well on daytime depth perception self-driving benchmarks, but struggle in +nighttime scenarios with poor visibility and complex illumination. To address +these challenges, we propose a simple yet effective framework called LDCNet. +Our key idea is to use Recurrent Inter-Convolution Differencing (RICD) and +Illumination-Affinitive Intra-Convolution Differencing (IAICD) to enhance the +nighttime color images and reduce the negative effects of the varying +illumination, respectively. RICD explicitly estimates global illumination by +differencing two convolutions with different kernels, treating the +small-kernel-convolution feature as the center of the large-kernel-convolution +feature in a new perspective. IAICD softly alleviates local relative light +intensity by differencing a single convolution, where the center is dynamically +aggregated based on neighboring pixels and the estimated illumination map in +RICD. On both nighttime depth completion and depth estimation tasks, extensive +experiments demonstrate the effectiveness of our LDCNet, reaching the state of +the art. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ HiFace: High-Fidelity 3D Face Reconstruction by Learning Static and + Dynamic Details ICCV 2023 + + +
+ 3D Morphable Models (3DMMs) demonstrate great potential for reconstructing +faithful and animatable 3D facial surfaces from a single image. The facial +surface is influenced by the coarse shape, as well as the static detail (e,g., +person-specific appearance) and dynamic detail (e.g., expression-driven +wrinkles). Previous work struggles to decouple the static and dynamic details +through image-level supervision, leading to reconstructions that are not +realistic. In this paper, we aim at high-fidelity 3D face reconstruction and +propose HiFace to explicitly model the static and dynamic details. +Specifically, the static detail is modeled as the linear combination of a +displacement basis, while the dynamic detail is modeled as the linear +interpolation of two displacement maps with polarized expressions. We exploit +several loss functions to jointly learn the coarse shape and fine details with +both synthetic and real-world datasets, which enable HiFace to reconstruct +high-fidelity 3D shapes with animatable details. Extensive quantitative and +qualitative experiments demonstrate that HiFace presents state-of-the-art +reconstruction quality and faithfully recovers both the static and dynamic +details. Our project page can be found at https://project-hiface.github.io. + +
+
+ comment: Accepted to ICCV 2023, camera-ready version; Project page: + https://project-hiface.github.io/ +
+
+
+
+
+ + ♻ ☆ Deep Image Fingerprint: Towards Low Budget Synthetic Image Detection and + Model Lineage Analysis + + +
+ The generation of high-quality images has become widely accessible and is a +rapidly evolving process. As a result, anyone can generate images that are +indistinguishable from real ones. This leads to a wide range of applications, +including malicious usage with deceptive intentions. Despite advances in +detection techniques for generated images, a robust detection method still +eludes us. Furthermore, model personalization techniques might affect the +detection capabilities of existing methods. In this work, we utilize the +architectural properties of convolutional neural networks (CNNs) to develop a +new detection method. Our method can detect images from a known generative +model and enable us to establish relationships between fine-tuned generative +models. We tested the method on images produced by both Generative Adversarial +Networks (GANs) and recent large text-to-image models (LTIMs) that rely on +Diffusion Models. Our approach outperforms others trained under identical +conditions and achieves comparable performance to state-of-the-art pre-trained +detection methods on images generated by Stable Diffusion and MidJourney, with +significantly fewer required train samples. + +
+
+
+
+
+ + ♻ ☆ GridMM: Grid Memory Map for Vision-and-Language Navigation ICCV 2023 + + +
+ Vision-and-language navigation (VLN) enables the agent to navigate to a +remote location following the natural language instruction in 3D environments. +To represent the previously visited environment, most approaches for VLN +implement memory using recurrent states, topological maps, or top-down semantic +maps. In contrast to these approaches, we build the top-down egocentric and +dynamically growing Grid Memory Map (i.e., GridMM) to structure the visited +environment. From a global perspective, historical observations are projected +into a unified grid map in a top-down view, which can better represent the +spatial relations of the environment. From a local perspective, we further +propose an instruction relevance aggregation method to capture fine-grained +visual clues in each grid region. Extensive experiments are conducted on both +the REVERIE, R2R, SOON datasets in the discrete environments, and the R2R-CE +dataset in the continuous environments, showing the superiority of our proposed +method. + +
+
+ comment: Accepted by ICCV 2023. The code is available at + https://github.com/MrZihan/GridMM +
+
+
+
+
+ + ♻ ☆ UTRNet: High-Resolution Urdu Text Recognition In Printed Documents ICDAR 2023 + + +
+ In this paper, we propose a novel approach to address the challenges of +printed Urdu text recognition using high-resolution, multi-scale semantic +feature extraction. Our proposed UTRNet architecture, a hybrid CNN-RNN model, +demonstrates state-of-the-art performance on benchmark datasets. To address the +limitations of previous works, which struggle to generalize to the intricacies +of the Urdu script and the lack of sufficient annotated real-world data, we +have introduced the UTRSet-Real, a large-scale annotated real-world dataset +comprising over 11,000 lines and UTRSet-Synth, a synthetic dataset with 20,000 +lines closely resembling real-world and made corrections to the ground truth of +the existing IIITH dataset, making it a more reliable resource for future +research. We also provide UrduDoc, a benchmark dataset for Urdu text line +detection in scanned documents. Additionally, we have developed an online tool +for end-to-end Urdu OCR from printed documents by integrating UTRNet with a +text detection model. Our work not only addresses the current limitations of +Urdu OCR but also paves the way for future research in this area and +facilitates the continued advancement of Urdu OCR technology. The project page +with source code, datasets, annotations, trained models, and online tool is +available at abdur75648.github.io/UTRNet. + +
+
+ comment: Accepted at The 17th International Conference on Document Analysis + and Recognition (ICDAR 2023) +
+
+
+
+
+ + ♻ ☆ EDO-Net: Learning Elastic Properties of Deformable Objects from Graph + Dynamics + + +
+ We study the problem of learning graph dynamics of deformable objects that +generalizes to unknown physical properties. Our key insight is to leverage a +latent representation of elastic physical properties of cloth-like deformable +objects that can be extracted, for example, from a pulling interaction. In this +paper we propose EDO-Net (Elastic Deformable Object - Net), a model of graph +dynamics trained on a large variety of samples with different elastic +properties that does not rely on ground-truth labels of the properties. EDO-Net +jointly learns an adaptation module, and a forward-dynamics module. The former +is responsible for extracting a latent representation of the physical +properties of the object, while the latter leverages the latent representation +to predict future states of cloth-like objects represented as graphs. We +evaluate EDO-Net both in simulation and real world, assessing its capabilities +of: 1) generalizing to unknown physical properties, 2) transferring the learned +representation to new downstream tasks. + +
+
+
+
+
+ + ♻ ☆ Learning to Generalize towards Unseen Domains via a Content-Aware Style + Invariant Model for Disease Detection from Chest X-rays + + +
+ Performance degradation due to source domain mismatch is a longstanding +challenge in deep learning-based medical image analysis, particularly for chest +X-rays (CXRs). Several methods (e.g., adversarial training, multi-domain +mixups) have been proposed to extract domain-invariant high-level features to +address this domain shift. However, these methods do not explicitly regularize +the content and style characteristics of the extracted domain-invariant +features. Recent studies have demonstrated that CNN models exhibit a strong +bias toward styles (e.g., uninformative textures) rather than content (e.g., +shape), in stark contrast to the human-vision system. Radiologists tend to +learn visual cues from CXRs and thus perform well across multiple domains. +Therefore, in medical imaging for pathology diagnosis from CXR images, models +should extract domain-invariant features that are style-invariant and +content-biased. Motivated by this, we employ the novel style randomization +modules (SRMs) at both image and feature levels that work together +hierarchically to create rich style perturbed features on the fly while keeping +the content intact. In addition, we leverage consistency regularizations +between global semantic features and predicted probability distributions, +respectively, for with and without style perturbed versions of the same CXR +image to tweak the model's sensitivity toward content markers for accurate +predictions. Extensive experiments with three large-scale thoracic disease +datasets, i.e., CheXpert, MIMIC-CXR, and BRAX, demonstrate that our proposed +framework is more robust in the presence of domain shift and achieves +state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ Neural Spherical Harmonics for structurally coherent continuous + representation of diffusion MRI signal MICCAI 2023 + + +
+ We present a novel way to model diffusion magnetic resonance imaging (dMRI) +datasets, that benefits from the structural coherence of the human brain while +only using data from a single subject. Current methods model the dMRI signal in +individual voxels, disregarding the intervoxel coherence that is present. We +use a neural network to parameterize a spherical harmonics series (NeSH) to +represent the dMRI signal of a single subject from the Human Connectome Project +dataset, continuous in both the angular and spatial domain. The reconstructed +dMRI signal using this method shows a more structurally coherent representation +of the data. Noise in gradient images is removed and the fiber orientation +distribution functions show a smooth change in direction along a fiber tract. +We showcase how the reconstruction can be used to calculate mean diffusivity, +fractional anisotropy, and total apparent fiber density. These results can be +achieved with a single model architecture, tuning only one hyperparameter. In +this paper we also demonstrate how upsampling in both the angular and spatial +domain yields reconstructions that are on par or better than existing methods. + +
+
+ comment: 12 pages, 6 figures, accepted for cdMRI workshop at MICCAI 2023 + Updated to fix typo in author name (Villanova -> Vilanova) +
+
+
+
+
+ + ♻ ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning aims to capture comprehensive information +from multiple views of a shared context. Recent works intuitively apply +contrastive learning to different views in a pairwise manner, which is still +scalable: view-specific noise is not filtered in learning view-shared +representations; the fake negative pairs, where the negative terms are actually +within the same class as the positive, and the real negative pairs are +coequally treated; evenly measuring the similarities between terms might +interfere with optimization. Importantly, few works study the theoretical +framework of generalized self-supervised multi-view learning, especially for +more than two views. To this end, we rethink the existing multi-view learning +paradigm from the perspective of information theory and then propose a novel +information theoretical framework for generalized multi-view learning. Guided +by it, we build a multi-view coding method with a three-tier progressive +architecture, namely Information theory-guided hierarchical Progressive +Multi-view Coding (IPMC). In the distribution-tier, IPMC aligns the +distribution between views to reduce view-specific noise. In the set-tier, IPMC +constructs self-adjusted contrasting pools, which are adaptively modified by a +view filter. Lastly, in the instance-tier, we adopt a designed unified loss to +learn representations and reduce the gradient interference. Theoretically and +empirically, we demonstrate the superiority of IPMC over state-of-the-art +methods. + +
+
+ comment: This paper is accepted by the jourcal of Neural Networks (Elsevier) + by 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344 +
+
+
+
+
+ + ♻ ☆ BallGAN: 3D-aware Image Synthesis with a Spherical Background ICCV 2023 + + +
+ 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be +rendered in arbitrary perspectives to produce images. Although previous methods +produce realistic images, they suffer from unstable training or degenerate +solutions where the 3D geometry is unnatural. We hypothesize that the 3D +geometry is underdetermined due to the insufficient constraint, i.e., being +classified as real image to the discriminator is not enough. To solve this +problem, we propose to approximate the background as a spherical surface and +represent a scene as a union of the foreground placed in the sphere and the +thin spherical background. It reduces the degree of freedom in the background +field. Accordingly, we modify the volume rendering equation and incorporate +dedicated constraints to design a novel 3D-aware GAN framework named BallGAN. +BallGAN has multiple advantages as follows. 1) It produces more reasonable 3D +geometry; the images of a scene across different viewpoints have better +photometric consistency and fidelity than the state-of-the-art methods. 2) The +training becomes much more stable. 3) The foreground can be separately rendered +on top of different arbitrary backgrounds. + +
+
+ comment: ICCV 2023, Project Page: https://minjung-s.github.io/ballgan +
+
+
+
+
+ + ♻ ☆ Foundation Model-oriented Robustness: Robust Image Model Evaluation with + Pretrained Models + + +
+ Machine learning has demonstrated remarkable performance over finite +datasets, yet whether the scores over the fixed benchmarks can sufficiently +indicate the model's performance in the real world is still in discussion. In +reality, an ideal robust model will probably behave similarly to the oracle +(e.g., the human users), thus a good evaluation protocol is probably to +evaluate the models' behaviors in comparison to the oracle. In this paper, we +introduce a new robustness measurement that directly measures the image +classification model's performance compared with a surrogate oracle (i.e., a +foundation model). Besides, we design a simple method that can accomplish the +evaluation beyond the scope of the benchmarks. Our method extends the image +datasets with new samples that are sufficiently perturbed to be distinct from +the ones in the original sets, but are still bounded within the same +image-label structure the original test image represents, constrained by a +foundation model pretrained with a large amount of samples. As a result, our +new method will offer us a new way to evaluate the models' robustness +performance, free of limitations of fixed benchmarks or constrained +perturbations, although scoped by the power of the oracle. In addition to the +evaluation results, we also leverage our generated data to understand the +behaviors of the model and our new evaluation strategies. + +
+
+
+
+
+ + ♻ ☆ AutoPoster: A Highly Automatic and Content-aware Design System for + Advertising Poster Generation ACM MM 2023 + + +
+ Advertising posters, a form of information presentation, combine visual and +linguistic modalities. Creating a poster involves multiple steps and +necessitates design experience and creativity. This paper introduces +AutoPoster, a highly automatic and content-aware system for generating +advertising posters. With only product images and titles as inputs, AutoPoster +can automatically produce posters of varying sizes through four key stages: +image cleaning and retargeting, layout generation, tagline generation, and +style attribute prediction. To ensure visual harmony of posters, two +content-aware models are incorporated for layout and tagline generation. +Moreover, we propose a novel multi-task Style Attribute Predictor (SAP) to +jointly predict visual style attributes. Meanwhile, to our knowledge, we +propose the first poster generation dataset that includes visual attribute +annotations for over 76k posters. Qualitative and quantitative outcomes from +user studies and experiments substantiate the efficacy of our system and the +aesthetic superiority of the generated posters compared to other poster +generation methods. + +
+
+ comment: Accepted for ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Learning Multiscale Consistency for Self-supervised Electron Microscopy + Instance Segmentation + + +
+ Instance segmentation in electron microscopy (EM) volumes poses a significant +challenge due to the complex morphology of instances and insufficient +annotations. Self-supervised learning has recently emerged as a promising +solution, enabling the acquisition of prior knowledge of cellular tissue +structures that are essential for EM instance segmentation. However, existing +pretraining methods often lack the ability to capture complex visual patterns +and relationships between voxels, which results in the acquired prior knowledge +being insufficient for downstream EM analysis tasks. In this paper, we propose +a novel pretraining framework that leverages multiscale visual representations +to capture both voxel-level and feature-level consistency in EM volumes. +Specifically, our framework enforces voxel-level consistency between the +outputs of a Siamese network by a reconstruction function, and incorporates a +cross-attention mechanism for soft feature matching to achieve fine-grained +feature-level consistency. Moreover, we propose a contrastive learning scheme +on the feature pyramid to extract discriminative features across multiple +scales. We extensively pretrain our method on four large-scale EM datasets, +achieving promising performance improvements in representative tasks of neuron +and mitochondria instance segmentation. + +
+
+
+
+
+ + ♻ ☆ Iteratively Coupled Multiple Instance Learning from Instance to Bag + Classifier for Whole Slide Image Classification + + +
+ Whole Slide Image (WSI) classification remains a challenge due to their +extremely high resolution and the absence of fine-grained labels. Presently, +WSI classification is usually regarded as a Multiple Instance Learning (MIL) +problem when only slide-level labels are available. MIL methods involve a patch +embedding module and a bag-level classification module, but they are +prohibitively expensive to be trained in an end-to-end manner. Therefore, +existing methods usually train them separately, or directly skip the training +of the embedder. Such schemes hinder the patch embedder's access to slide-level +semantic labels, resulting in inconsistency within the entire MIL pipeline. To +overcome this issue, we propose a novel framework called Iteratively Coupled +MIL (ICMIL), which bridges the loss back-propagation process from the bag-level +classifier to the patch embedder. In ICMIL, we use category information in the +bag-level classifier to guide the patch-level fine-tuning of the patch feature +extractor. The refined embedder then generates better instance representations +for achieving a more accurate bag-level classifier. By coupling the patch +embedder and bag classifier at a low cost, our proposed framework enables +information exchange between the two modules, benefiting the entire MIL +classification model. We tested our framework on two datasets using three +different backbones, and our experimental results demonstrate consistent +performance improvements over state-of-the-art MIL methods. The code is +available at: https://github.com/Dootmaan/ICMIL. + +
+
+
+
+
+ + ♻ ☆ Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog + Navigation + + +
+ This report details the method of the winning entry of the AVDN Challenge in +ICCV 2023. The competition addresses the Aerial Navigation from Dialog History +(ANDH) task, which requires a drone agent to associate dialog history with +aerial observations to reach the destination. For better cross-modal grounding +abilities of the drone agent, we propose a Target-Grounded Graph-Aware +Transformer (TG-GAT) framework. Concretely, TG-GAT first leverages a +graph-aware transformer to capture spatiotemporal dependency, which benefits +navigation state tracking and robust action planning. In addition, an auxiliary +visual grounding task is devised to boost the agent's awareness of referred +landmarks. Moreover, a hybrid augmentation strategy based on large language +models is utilized to mitigate data scarcity limitations. Our TG-GAT framework +won the AVDN Challenge 2023, with 2.2% and 3.0% absolute improvements over the +baseline on SPL and SR metrics, respectively. The code is available at +https://github.com/yifeisu/avdn-challenge. + +
+
+
+
+
+ + ♻ ☆ BHSD: A 3D Multi-Class Brain Hemorrhage Segmentation Dataset + + +
+ Intracranial hemorrhage (ICH) is a pathological condition characterized by +bleeding inside the skull or brain, which can be attributed to various factors. +Identifying, localizing and quantifying ICH has important clinical +implications, in a bleed-dependent manner. While deep learning techniques are +widely used in medical image segmentation and have been applied to the ICH +segmentation task, existing public ICH datasets do not support the multi-class +segmentation problem. To address this, we develop the Brain Hemorrhage +Segmentation Dataset (BHSD), which provides a 3D multi-class ICH dataset +containing 192 volumes with pixel-level annotations and 2200 volumes with +slice-level annotations across five categories of ICH. To demonstrate the +utility of the dataset, we formulate a series of supervised and semi-supervised +ICH segmentation tasks. We provide experimental results with state-of-the-art +models as reference benchmarks for further model developments and evaluations +on this dataset. + +
+
+ comment: Accepted by MLMI 2023 +
+
+
+
+
+ + ♻ ☆ Positive Label Is All You Need for Multi-Label Classification + + +
+ Multi-label classification (MLC) suffers from the inevitable label noise in +training data due to the difficulty in annotating various semantic labels in +each image. To mitigate the influence of noisy labels, existing methods mainly +devote to identifying and correcting the label mistakes via a trained MLC +model. However, these methods still involve annoying noisy labels in training, +which can result in imprecise recognition of noisy labels and weaken the +performance. In this paper, considering that the negative labels are +substantially more than positive labels, and most noisy labels are from the +negative labels, we directly discard all the negative labels in the dataset, +and propose a new method dubbed positive and unlabeled multi-label +classification (PU-MLC). By extending positive-unlabeled learning into MLC +task, our method trains model with only positive labels and unlabeled data, and +introduces adaptive re-balance factor and adaptive temperature coefficient in +the loss function to alleviate the catastrophic imbalance in label distribution +and over-smoothing of probabilities in training. Furthermore, to capture both +local and global dependencies in the image, we also introduce a local-global +convolution module, which supplements global information into existing +convolution layers with no retraining of backbone required. Our PU-MLC is +simple and effective, and it is applicable to both MLC and MLC with partial +labels (MLC-PL) tasks. Extensive experiments on MS-COCO and PASCAL VOC datasets +demonstrate that our PU-MLC achieves significantly improvements on both MLC and +MLC-PL settings with even fewer annotations. Code will be released. + +
+
+
+
+
+ + ♻ ☆ AltDiffusion: A Multilingual Text-to-Image Diffusion Model + + +
+ Large Text-to-Image(T2I) diffusion models have shown a remarkable capability +to produce photorealistic and diverse images based on text inputs. However, +existing works only support limited language input, e.g., English, Chinese, and +Japanese, leaving users beyond these languages underserved and blocking the +global expansion of T2I models. Therefore, this paper presents AltDiffusion, a +novel multilingual T2I diffusion model that supports eighteen different +languages. Specifically, we first train a multilingual text encoder based on +the knowledge distillation. Then we plug it into a pretrained English-only +diffusion model and train the model with a two-stage schema to enhance the +multilingual capability, including concept alignment and quality improvement +stage on a large-scale multilingual dataset. Furthermore, we introduce a new +benchmark, which includes Multilingual-General-18(MG-18) and +Multilingual-Cultural-18(MC-18) datasets, to evaluate the capabilities of T2I +diffusion models for generating high-quality images and capturing +culture-specific concepts in different languages. Experimental results on both +MG-18 and MC-18 demonstrate that AltDiffusion outperforms current +state-of-the-art T2I models, e.g., Stable Diffusion in multilingual +understanding, especially with respect to culture-specific concepts, while +still having comparable capability for generating high-quality images. All +source code and checkpoints could be found in +https://github.com/superhero-7/AltDiffuson. + +
+
+ comment: 15 pages; 17 figures +
+
+
+
+
+ + ♻ ☆ Chain-of-Thought Prompt Distillation for Multimodal Named Entity + Recognition and Multimodal Relation Extraction + + +
+ Multimodal Named Entity Recognition (MNER) and Multimodal Relation Extraction +(MRE) necessitate the fundamental reasoning capacity for intricate linguistic +and multimodal comprehension. In this study, we explore distilling the +reasoning ability of large language models (LLMs) into a more compact student +model by generating a \textit{chain of thought} (CoT) -- a sequence of +intermediate reasoning steps. Specifically, we commence by exemplifying the +elicitation of such reasoning ability from LLMs through CoT prompts covering +multi-grain (noun, sentence, multimodality) and data-augmentation (style, +entity, image) dimensions. Subsequently, we present a novel conditional prompt +distillation method to assimilate the commonsense reasoning ability from LLMs, +thereby enhancing the utility of the student model in addressing text-only +inputs without the requisite addition of image and CoT knowledge. Extensive +experiments reveal that our approach attains state-of-the-art accuracy and +manifests a plethora of advantages concerning interpretability, data +efficiency, and cross-domain generalization on MNER and MRE datasets. + +
+
+ comment: modification +
+
+
+
+
+ + ♻ ☆ MSECNet: Accurate and Robust Normal Estimation for 3D Point Clouds by + Multi-Scale Edge Conditioning ACM MM 2023 + + +
+ Estimating surface normals from 3D point clouds is critical for various +applications, including surface reconstruction and rendering. While existing +methods for normal estimation perform well in regions where normals change +slowly, they tend to fail where normals vary rapidly. To address this issue, we +propose a novel approach called MSECNet, which improves estimation in normal +varying regions by treating normal variation modeling as an edge detection +problem. MSECNet consists of a backbone network and a multi-scale edge +conditioning (MSEC) stream. The MSEC stream achieves robust edge detection +through multi-scale feature fusion and adaptive edge detection. The detected +edges are then combined with the output of the backbone network using the edge +conditioning module to produce edge-aware representations. Extensive +experiments show that MSECNet outperforms existing methods on both synthetic +(PCPNet) and real-world (SceneNN) datasets while running significantly faster. +We also conduct various analyses to investigate the contribution of each +component in the MSEC stream. Finally, we demonstrate the effectiveness of our +approach in surface reconstruction. + +
+
+ comment: Accepted for ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Spherical Space Feature Decomposition for Guided Depth Map + Super-Resolution ICCV 2023 + + +
+ Guided depth map super-resolution (GDSR), as a hot topic in multi-modal image +processing, aims to upsample low-resolution (LR) depth maps with additional +information involved in high-resolution (HR) RGB images from the same scene. +The critical step of this task is to effectively extract domain-shared and +domain-private RGB/depth features. In addition, three detailed issues, namely +blurry edges, noisy surfaces, and over-transferred RGB texture, need to be +addressed. In this paper, we propose the Spherical Space feature Decomposition +Network (SSDNet) to solve the above issues. To better model cross-modality +features, Restormer block-based RGB/depth encoders are employed for extracting +local-global features. Then, the extracted features are mapped to the spherical +space to complete the separation of private features and the alignment of +shared features. Shared features of RGB are fused with the depth features to +complete the GDSR task. Subsequently, a spherical contrast refinement (SCR) +module is proposed to further address the detail issues. Patches that are +classified according to imperfect categories are input into the SCR module, +where the patch features are pulled closer to the ground truth and pushed away +from the corresponding imperfect samples in the spherical feature space via +contrastive learning. Extensive experiments demonstrate that our method can +achieve state-of-the-art results on four test datasets, as well as successfully +generalize to real-world scenes. The code is available at +\url{https://github.com/Zhaozixiang1228/GDSR-SSDNet}. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SEAM: Searching Transferable Mixed-Precision Quantization Policy through + Large Margin Regularization + + +
+ Mixed-precision quantization (MPQ) suffers from the time-consuming process of +searching the optimal bit-width allocation i.e., the policy) for each layer, +especially when using large-scale datasets such as ISLVRC-2012. This limits the +practicality of MPQ in real-world deployment scenarios. To address this issue, +this paper proposes a novel method for efficiently searching for effective MPQ +policies using a small proxy dataset instead of the large-scale dataset used +for training the model. Deviating from the established norm of employing a +consistent dataset for both model training and MPQ policy search stages, our +approach, therefore, yields a substantial enhancement in the efficiency of MPQ +exploration. Nonetheless, using discrepant datasets poses challenges in +searching for a transferable MPQ policy. Driven by the observation that +quantization noise of sub-optimal policy exerts a detrimental influence on the +discriminability of feature representations -- manifesting as diminished class +margins and ambiguous decision boundaries -- our method aims to identify +policies that uphold the discriminative nature of feature representations, +i.e., intra-class compactness and inter-class separation. This general and +dataset-independent property makes us search for the MPQ policy over a rather +small-scale proxy dataset and then the policy can be directly used to quantize +the model trained on a large-scale dataset. Our method offers several +advantages, including high proxy data utilization, no excessive hyper-parameter +tuning, and high searching efficiency. We search high-quality MPQ policies with +the proxy dataset that has only 4% of the data scale compared to the +large-scale target dataset, achieving the same accuracy as searching directly +on the latter, improving MPQ searching efficiency by up to 300 times. + +
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge +generative models and have sparked a revolutionary impact on various aspects of +human life. However, the practical implementation of these models has also +exposed inherent risks, highlighting their dual nature and raising concerns +regarding their trustworthiness. Despite the abundance of literature on this +subject, a comprehensive survey specifically delving into the intersection of +large-scale generative models and their trustworthiness remains largely absent. +To bridge this gap, This paper investigates both the long-standing and emerging +threats associated with these models across four fundamental dimensions: +privacy, security, fairness, and responsibility. In this way, we construct an +extensive map outlining the trustworthiness of these models, while also +providing practical recommendations and identifying future directions. These +efforts are crucial for promoting the trustworthy deployment of these models, +ultimately benefiting society as a whole. + +
+
+ comment: Draft Version +
+
+
+
+
+ + ♻ ☆ CLIP2Point: Transfer CLIP to Point Cloud Classification with Image-Depth + Pre-training ICCV2023 + + +
+ Pre-training across 3D vision and language remains under development because +of limited training data. Recent works attempt to transfer vision-language +pre-training models to 3D vision. PointCLIP converts point cloud data to +multi-view depth maps, adopting CLIP for shape classification. However, its +performance is restricted by the domain gap between rendered depth maps and +images, as well as the diversity of depth distributions. To address this issue, +we propose CLIP2Point, an image-depth pre-training method by contrastive +learning to transfer CLIP to the 3D domain, and adapt it to point cloud +classification. We introduce a new depth rendering setting that forms a better +visual effect, and then render 52,460 pairs of images and depth maps from +ShapeNet for pre-training. The pre-training scheme of CLIP2Point combines +cross-modality learning to enforce the depth features for capturing expressive +visual and textual features and intra-modality learning to enhance the +invariance of depth aggregation. Additionally, we propose a novel Dual-Path +Adapter (DPA) module, i.e., a dual-path structure with simplified adapters for +few-shot learning. The dual-path structure allows the joint use of CLIP and +CLIP2Point, and the simplified adapter can well fit few-shot tasks without +post-search. Experimental results show that CLIP2Point is effective in +transferring CLIP knowledge to 3D vision. Our CLIP2Point outperforms PointCLIP +and other self-supervised 3D networks, achieving state-of-the-art results on +zero-shot and few-shot classification. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Diffusion-Based 3D Human Pose Estimation with Multi-Hypothesis + Aggregation ICCV 2023 + + +
+ In this paper, a novel Diffusion-based 3D Pose estimation (D3DP) method with +Joint-wise reProjection-based Multi-hypothesis Aggregation (JPMA) is proposed +for probabilistic 3D human pose estimation. On the one hand, D3DP generates +multiple possible 3D pose hypotheses for a single 2D observation. It gradually +diffuses the ground truth 3D poses to a random distribution, and learns a +denoiser conditioned on 2D keypoints to recover the uncontaminated 3D poses. +The proposed D3DP is compatible with existing 3D pose estimators and supports +users to balance efficiency and accuracy during inference through two +customizable parameters. On the other hand, JPMA is proposed to assemble +multiple hypotheses generated by D3DP into a single 3D pose for practical use. +It reprojects 3D pose hypotheses to the 2D camera plane, selects the best +hypothesis joint-by-joint based on the reprojection errors, and combines the +selected joints into the final pose. The proposed JPMA conducts aggregation at +the joint level and makes use of the 2D prior information, both of which have +been overlooked by previous approaches. Extensive experiments on Human3.6M and +MPI-INF-3DHP datasets show that our method outperforms the state-of-the-art +deterministic and probabilistic approaches by 1.5% and 8.9%, respectively. Code +is available at https://github.com/paTRICK-swk/D3DP. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Novel Class Discovery for Long-tailed Recognition + + +
+ While the novel class discovery has recently made great progress, existing +methods typically focus on improving algorithms on class-balanced benchmarks. +However, in real-world recognition tasks, the class distributions of their +corresponding datasets are often imbalanced, which leads to serious performance +degeneration of those methods. In this paper, we consider a more realistic +setting for novel class discovery where the distributions of novel and known +classes are long-tailed. One main challenge of this new problem is to discover +imbalanced novel classes with the help of long-tailed known classes. To tackle +this problem, we propose an adaptive self-labeling strategy based on an +equiangular prototype representation of classes. Our method infers high-quality +pseudo-labels for the novel classes by solving a relaxed optimal transport +problem and effectively mitigates the class biases in learning the known and +novel classes. We perform extensive experiments on CIFAR100, ImageNet100, +Herbarium19 and large-scale iNaturalist18 datasets, and the results demonstrate +the superiority of our method. Our code is available at +https://github.com/kleinzcy/NCDLR. + +
+
+ comment: TMLR2023, Final version +
+
+
+
+
+ + ♻ ☆ On the Choice of Perception Loss Function for Learned Video Compression + + +
+ We study causal, low-latency, sequential video compression when the output is +subjected to both a mean squared-error (MSE) distortion loss as well as a +perception loss to target realism. Motivated by prior approaches, we consider +two different perception loss functions (PLFs). The first, PLF-JD, considers +the joint distribution (JD) of all the video frames up to the current one, +while the second metric, PLF-FMD, considers the framewise marginal +distributions (FMD) between the source and reconstruction. Using information +theoretic analysis and deep-learning based experiments, we demonstrate that the +choice of PLF can have a significant effect on the reconstruction, especially +at low-bit rates. In particular, while the reconstruction based on PLF-JD can +better preserve the temporal correlation across frames, it also imposes a +significant penalty in distortion compared to PLF-FMD and further makes it more +difficult to recover from errors made in the earlier output frames. Although +the choice of PLF decisively affects reconstruction quality, we also +demonstrate that it may not be essential to commit to a particular PLF during +encoding and the choice of PLF can be delegated to the decoder. In particular, +encoded representations generated by training a system to minimize the MSE +(without requiring either PLF) can be {\em near universal} and can generate +close to optimal reconstructions for either choice of PLF at the decoder. We +validate our results using (one-shot) information-theoretic analysis, detailed +study of the rate-distortion-perception tradeoff of the Gauss-Markov source +model as well as deep-learning based experiments on moving MNIST and KTH +datasets. + +
+
+
+
+
+ + ♻ ☆ SERE: Exploring Feature Self-relation for Self-supervised Transformer + + +
+ Learning representations with self-supervision for convolutional networks +(CNN) has been validated to be effective for vision tasks. As an alternative to +CNN, vision transformers (ViT) have strong representation ability with spatial +self-attention and channel-level feedforward networks. Recent works reveal that +self-supervised learning helps unleash the great potential of ViT. Still, most +works follow self-supervised strategies designed for CNN, e.g., instance-level +discrimination of samples, but they ignore the properties of ViT. We observe +that relational modeling on spatial and channel dimensions distinguishes ViT +from other networks. To enforce this property, we explore the feature +SElf-RElation (SERE) for training self-supervised ViT. Specifically, instead of +conducting self-supervised learning solely on feature embeddings from multiple +views, we utilize the feature self-relations, i.e., spatial/channel +self-relations, for self-supervised learning. Self-relation based learning +further enhances the relation modeling ability of ViT, resulting in stronger +representations that stably improve performance on multiple downstream tasks. +Our source code will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ GAEI-UNet: Global Attention and Elastic Interaction U-Net for Vessel + Image Segmentation + + +
+ Vessel image segmentation plays a pivotal role in medical diagnostics, aiding +in the early detection and treatment of vascular diseases. While segmentation +based on deep learning has shown promising results, effectively segmenting +small structures and maintaining connectivity between them remains challenging. +To address these limitations, we propose GAEI-UNet, a novel model that combines +global attention and elastic interaction-based techniques. GAEI-UNet leverages +global spatial and channel context information to enhance high-level semantic +understanding within the U-Net architecture, enabling precise segmentation of +small vessels. Additionally, we adopt an elastic interaction-based loss +function to improve connectivity among these fine structures. By capturing the +forces generated by misalignment between target and predicted shapes, our model +effectively learns to preserve the correct topology of vessel networks. +Evaluation on retinal vessel dataset -- DRIVE demonstrates the superior +performance of GAEI-UNet in terms of SE and connectivity of small structures, +without significantly increasing computational complexity. This research aims +to advance the field of vessel image segmentation, providing more accurate and +reliable diagnostic tools for the medical community. The implementation code is +available on Code. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2004.03696 by other authors +
+
+
+
+
+ + ♻ ☆ Street-View Image Generation from a Bird's-Eye View Layout + + +
+ Bird's-Eye View (BEV) Perception has received increasing attention in recent +years as it provides a concise and unified spatial representation across views +and benefits a diverse set of downstream driving applications. While the focus +has been placed on discriminative tasks such as BEV segmentation, the dual +generative task of creating street-view images from a BEV layout has rarely +been explored. The ability to generate realistic street-view images that align +with a given HD map and traffic layout is critical for visualizing complex +traffic scenarios and developing robust perception models for autonomous +driving. In this paper, we propose BEVGen, a conditional generative model that +synthesizes a set of realistic and spatially consistent surrounding images that +match the BEV layout of a traffic scenario. BEVGen incorporates a novel +cross-view transformation and spatial attention design which learn the +relationship between cameras and map views to ensure their consistency. Our +model can accurately render road and lane lines, as well as generate traffic +scenes under different weather conditions and times of day. The code will be +made publicly available. + +
+
+
+
+
+ + ♻ ☆ What Should Be Balanced in a "Balanced" Face Recognition Dataset? + + +
+ The issue of demographic disparities in face recognition accuracy has +attracted increasing attention in recent years. Various face image datasets +have been proposed as 'fair' or 'balanced' to assess the accuracy of face +recognition algorithms across demographics. These datasets typically balance +the number of identities and images across demographics. It is important to +note that the number of identities and images in an evaluation dataset are {\em +not} driving factors for 1-to-1 face matching accuracy. Moreover, balancing the +number of identities and images does not ensure balance in other factors known +to impact accuracy, such as head pose, brightness, and image quality. We +demonstrate these issues using several recently proposed datasets. To improve +the ability to perform less biased evaluations, we propose a bias-aware toolkit +that facilitates creation of cross-demographic evaluation datasets balanced on +factors mentioned in this paper. + +
+
+
+
+
+ + ♻ ☆ Test-Time Adaptation for Visual Document Understanding + + +
+ For visual document understanding (VDU), self-supervised pretraining has been +shown to successfully generate transferable representations, yet, effective +adaptation of such representations to distribution shifts at test-time remains +to be an unexplored area. We propose DocTTA, a novel test-time adaptation +method for documents, that does source-free domain adaptation using unlabeled +target document data. DocTTA leverages cross-modality self-supervised learning +via masked visual language modeling, as well as pseudo labeling to adapt models +learned on a \textit{source} domain to an unlabeled \textit{target} domain at +test time. We introduce new benchmarks using existing public datasets for +various VDU tasks, including entity recognition, key-value extraction, and +document visual question answering. DocTTA shows significant improvements on +these compared to the source model performance, up to 1.89\% in (F1 score), +3.43\% (F1 score), and 17.68\% (ANLS score), respectively. Our benchmark +datasets are available at \url{https://saynaebrahimi.github.io/DocTTA.html}. + +
+
+ comment: Accepted at TMLR 2023 +
+
+
+
+
+ + ♻ ☆ DH-PTAM: A Deep Hybrid Stereo Events-Frames Parallel Tracking And + Mapping System + + +
+ This paper presents a robust approach for a visual parallel tracking and +mapping (PTAM) system that excels in challenging environments. Our proposed +method combines the strengths of heterogeneous multi-modal visual sensors, +including stereo event-based and frame-based sensors, in a unified reference +frame through a novel spatio-temporal synchronization of stereo visual frames +and stereo event streams. We employ deep learning-based feature extraction and +description for estimation to enhance robustness further. We also introduce an +end-to-end parallel tracking and mapping optimization layer complemented by a +simple loop-closure algorithm for efficient SLAM behavior. Through +comprehensive experiments on both small-scale and large-scale real-world +sequences of VECtor and TUM-VIE benchmarks, our proposed method (DH-PTAM) +demonstrates superior performance in terms of robustness and accuracy in +adverse conditions, especially in large-scale HDR scenarios. Our +implementation's research-based Python API is publicly available on GitHub for +further research and development: https://github.com/AbanobSoliman/DH-PTAM. + +
+
+ comment: 9 pages, 9 figures and 4 tables +
+
+
+
+
+ + ♻ ☆ Factorized Inverse Path Tracing for Efficient and Accurate + Material-Lighting Estimation + + +
+ Inverse path tracing has recently been applied to joint material and lighting +estimation, given geometry and multi-view HDR observations of an indoor scene. +However, it has two major limitations: path tracing is expensive to compute, +and ambiguities exist between reflection and emission. Our Factorized Inverse +Path Tracing (FIPT) addresses these challenges by using a factored light +transport formulation and finds emitters driven by rendering errors. Our +algorithm enables accurate material and lighting optimization faster than +previous work, and is more effective at resolving ambiguities. The exhaustive +experiments on synthetic scenes show that our method (1) outperforms +state-of-the-art indoor inverse rendering and relighting methods particularly +in the presence of complex illumination effects; (2) speeds up inverse path +tracing optimization to less than an hour. We further demonstrate robustness to +noisy inputs through material and lighting estimates that allow plausible +relighting in a real scene. The source code is available at: +https://github.com/lwwu2/fipt + +
+
+ comment: Updated experiment results; modified real-world sections +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ Learning from Negative User Feedback and Measuring Responsiveness for + Sequential Recommenders RecSys 2023 + + +
+ Sequential recommenders have been widely used in industry due to their +strength in modeling user preferences. While these models excel at learning a +user's positive interests, less attention has been paid to learning from +negative user feedback. Negative user feedback is an important lever of user +control, and comes with an expectation that recommenders should respond quickly +and reduce similar recommendations to the user. However, negative feedback +signals are often ignored in the training objective of sequential retrieval +models, which primarily aim at predicting positive user interactions. In this +work, we incorporate explicit and implicit negative user feedback into the +training objective of sequential recommenders in the retrieval stage using a +"not-to-recommend" loss function that optimizes for the log-likelihood of not +recommending items with negative feedback. We demonstrate the effectiveness of +this approach using live experiments on a large-scale industrial recommender +system. Furthermore, we address a challenge in measuring recommender +responsiveness to negative feedback by developing a counterfactual simulation +framework to compare recommender responses between different user actions, +showing improved responsiveness from the modeling change. + +
+
+ comment: RecSys 2023 Industry Track +
+
+
+
+
+ + ☆ LLMRec: Benchmarking Large Language Models on Recommendation Task + + +
+ Recently, the fast development of Large Language Models (LLMs) such as +ChatGPT has significantly advanced NLP tasks by enhancing the capabilities of +conversational models. However, the application of LLMs in the recommendation +domain has not been thoroughly investigated. To bridge this gap, we propose +LLMRec, a LLM-based recommender system designed for benchmarking LLMs on +various recommendation tasks. Specifically, we benchmark several popular +off-the-shelf LLMs, such as ChatGPT, LLaMA, ChatGLM, on five recommendation +tasks, including rating prediction, sequential recommendation, direct +recommendation, explanation generation, and review summarization. Furthermore, +we investigate the effectiveness of supervised finetuning to improve LLMs' +instruction compliance ability. The benchmark results indicate that LLMs +displayed only moderate proficiency in accuracy-based tasks such as sequential +and direct recommendation. However, they demonstrated comparable performance to +state-of-the-art methods in explainability-based tasks. We also conduct +qualitative evaluations to further evaluate the quality of contents generated +by different models, and the results show that LLMs can truly understand the +provided information and generate clearer and more reasonable results. We +aspire that this benchmark will serve as an inspiration for researchers to +delve deeper into the potential of LLMs in enhancing recommendation +performance. Our codes, processed data and benchmark results are available at +https://github.com/williamliujl/LLMRec. + +
+
+
+
+
+ + ☆ Counterfactual Graph Augmentation for Consumer Unfairness Mitigation in + Recommender Systems CIKM 2023 + + +
+ In recommendation literature, explainability and fairness are becoming two +prominent perspectives to consider. However, prior works have mostly addressed +them separately, for instance by explaining to consumers why a certain item was +recommended or mitigating disparate impacts in recommendation utility. None of +them has leveraged explainability techniques to inform unfairness mitigation. +In this paper, we propose an approach that relies on counterfactual +explanations to augment the set of user-item interactions, such that using them +while inferring recommendations leads to fairer outcomes. Modeling user-item +interactions as a bipartite graph, our approach augments the latter by +identifying new user-item edges that not only can explain the original +unfairness by design, but can also mitigate it. Experiments on two public data +sets show that our approach effectively leads to a better trade-off between +fairness and recommendation utility compared with state-of-the-art mitigation +procedures. We further analyze the characteristics of added edges to highlight +key unfairness patterns. Source code available at +https://github.com/jackmedda/RS-BGExplainer/tree/cikm2023. + +
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Hybrid Retrieval and Multi-stage Text Ranking Solution at TREC 2022 Deep + Learning Track + + +
+ Large-scale text retrieval technology has been widely used in various +practical business scenarios. This paper presents our systems for the TREC 2022 +Deep Learning Track. We explain the hybrid text retrieval and multi-stage text +ranking method adopted in our solution. The retrieval stage combined the two +structures of traditional sparse retrieval and neural dense retrieval. In the +ranking stage, in addition to the full interaction-based ranking model built on +large pre-trained language model, we also proposes a lightweight sub-ranking +module to further enhance the final text ranking performance. Evaluation +results demonstrate the effectiveness of our proposed approach. Our models +achieve the 1st and 4th rank on the test set of passage ranking and document +ranking respectively. + +
+
+ comment: TREC 2022 Deep Learning Track +
+
+
+
+
+ + ☆ LKPNR: LLM and KG for Personalized News Recommendation Framework + + +
+ Accurately recommending candidate news articles to users is a basic challenge +faced by personalized news recommendation systems. Traditional methods are +usually difficult to grasp the complex semantic information in news texts, +resulting in unsatisfactory recommendation results. Besides, these traditional +methods are more friendly to active users with rich historical behaviors. +However, they can not effectively solve the "long tail problem" of inactive +users. To address these issues, this research presents a novel general +framework that combines Large Language Models (LLM) and Knowledge Graphs (KG) +into semantic representations of traditional methods. In order to improve +semantic understanding in complex news texts, we use LLMs' powerful text +understanding ability to generate news representations containing rich semantic +information. In addition, our method combines the information about news +entities and mines high-order structural information through multiple hops in +KG, thus alleviating the challenge of long tail distribution. Experimental +results demonstrate that compared with various traditional models, the +framework significantly improves the recommendation effect. The successful +integration of LLM and KG in our framework has established a feasible path for +achieving more accurate personalized recommendations in the news field. Our +code is available at https://github.com/Xuan-ZW/LKPNR. + +
+
+
+
+
+ + ☆ Economic Recommender Systems -- A Systematic Review + + +
+ Many of today's online services provide personalized recommendations to their +users. Such recommendations are typically designed to serve certain user needs, +e.g., to quickly find relevant content in situations of information overload. +Correspondingly, the academic literature in the field largely focuses on the +value of recommender systems for the end user. In this context, one underlying +assumption is that the improved service that is achieved through the +recommendations will in turn positively impact the organization's goals, e.g., +in the form of higher customer retention or loyalty. However, in reality, +recommender systems can be used to target organizational economic goals more +directly by incorporating monetary considerations such as price awareness and +profitability aspects into the underlying recommendation models. In this work, +we survey the existing literature on what we call Economic Recommender Systems +based on a systematic review approach that helped us identify 133 relevant +papers. We first categorize existing works along different dimensions and then +review the most important technical approaches from the literature. +Furthermore, we discuss common methodologies to evaluate such systems and +finally outline the limitations of today's research and future directions. + +
+
+
+
+
+ + ☆ Integrating the Wikidata Taxonomy into YAGO + + +
+ Wikidata is one of the largest public general-purpose Knowledge Bases (KBs). +Yet, due to its collaborative nature, its schema and taxonomy have become +convoluted. For the YAGO 4 KB, we combined Wikidata with the ontology from +Schema.org, which reduced and cleaned up the taxonomy and constraints and made +it possible to run automated reasoners on the data. However, it also cut away +large parts of the Wikidata taxonomy. In this paper, we present our effort to +merge the entire Wikidata taxonomy into the YAGO KB as much as possible. We pay +particular attention to logical constraints and a careful distinction of +classes and instances. Our work creates YAGO 4.5, which adds a rich layer of +informative classes to YAGO, while at the same time keeping the KB logically +consistent. + +
+
+
+
+
+ + ☆ Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature + + +
+ Distributed Ledger Technologies (DLTs) have rapidly evolved, necessitating +comprehensive insights into their diverse components. However, a systematic +literature review that emphasizes the Environmental, Sustainability, and +Governance (ESG) components of DLT remains lacking. To bridge this gap, we +selected 107 seed papers to build a citation network of 63,083 references and +refined it to a corpus of 24,539 publications for analysis. Then, we labeled +the named entities in 46 papers according to twelve top-level categories +derived from an established technology taxonomy and enhanced the taxonomy by +pinpointing DLT's ESG elements. Leveraging transformer-based language models, +we fine-tuned a pre-trained language model for a Named Entity Recognition (NER) +task using our labeled dataset. We used our fine-tuned language model to +distill the corpus to 505 key papers, facilitating a literature review via +named entities and temporal graph analysis on DLT evolution in the context of +ESG. Our contributions are a methodology to conduct a machine learning-driven +systematic literature review in the DLT field, placing a special emphasis on +ESG aspects. Furthermore, we present a first-of-its-kind NER dataset, composed +of 54,808 named entities, designed for DLT and ESG-related explorations. + +
+
+
+
+
+ + ♻ ☆ Task Relation-aware Continual User Representation Learning KDD 2023 + + +
+ User modeling, which learns to represent users into a low-dimensional +representation space based on their past behaviors, got a surge of interest +from the industry for providing personalized services to users. Previous +efforts in user modeling mainly focus on learning a task-specific user +representation that is designed for a single task. However, since learning +task-specific user representations for every task is infeasible, recent studies +introduce the concept of universal user representation, which is a more +generalized representation of a user that is relevant to a variety of tasks. +Despite their effectiveness, existing approaches for learning universal user +representations are impractical in real-world applications due to the data +requirement, catastrophic forgetting and the limited learning capability for +continually added tasks. In this paper, we propose a novel continual user +representation learning method, called TERACON, whose learning capability is +not limited as the number of learned tasks increases while capturing the +relationship between the tasks. The main idea is to introduce an embedding for +each task, i.e., task embedding, which is utilized to generate task-specific +soft masks that not only allow the entire model parameters to be updated until +the end of training sequence, but also facilitate the relationship between the +tasks to be captured. Moreover, we introduce a novel knowledge retention module +with pseudo-labeling strategy that successfully alleviates the long-standing +problem of continual learning, i.e., catastrophic forgetting. Extensive +experiments on public and proprietary real-world datasets demonstrate the +superiority and practicality of TERACON. Our code is available at +https://github.com/Sein-Kim/TERACON. + +
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ A Tale of Two Graphs: Freezing and Denoising Graph Structures for + Multimodal Recommendation + + +
+ Multimodal recommender systems utilizing multimodal features (e.g., images +and textual descriptions) typically show better recommendation accuracy than +general recommendation models based solely on user-item interactions. +Generally, prior work fuses multimodal features into item ID embeddings to +enrich item representations, thus failing to capture the latent semantic +item-item structures. In this context, LATTICE proposes to learn the latent +structure between items explicitly and achieves state-of-the-art performance +for multimodal recommendations. However, we argue the latent graph structure +learning of LATTICE is both inefficient and unnecessary. Experimentally, we +demonstrate that freezing its item-item structure before training can also +achieve competitive performance. Based on this finding, we propose a simple yet +effective model, dubbed as FREEDOM, that FREEzes the item-item graph and +DenOises the user-item interaction graph simultaneously for Multimodal +recommendation. Theoretically, we examine the design of FREEDOM through a graph +spectral perspective and demonstrate that it possesses a tighter upper bound on +the graph spectrum. In denoising the user-item interaction graph, we devise a +degree-sensitive edge pruning method, which rejects possibly noisy edges with a +high probability when sampling the graph. We evaluate the proposed model on +three real-world datasets and show that FREEDOM can significantly outperform +current strongest baselines. Compared with LATTICE, FREEDOM achieves an average +improvement of 19.07% in recommendation accuracy while reducing its memory cost +up to 6$\times$ on large graphs. The source code is available at: +https://github.com/enoche/FREEDOM. + +
+
+ comment: Accepted to ACM Multimedia (MM) 2023 +
+
+
+
+
+ + ♻ ☆ Pareto Invariant Representation Learning for Multimedia Recommendation ACM MM 2023 + + +
+ Multimedia recommendation involves personalized ranking tasks, where +multimedia content is usually represented using a generic encoder. However, +these generic representations introduce spurious correlations that fail to +reveal users' true preferences. Existing works attempt to alleviate this +problem by learning invariant representations, but overlook the balance between +independent and identically distributed (IID) and out-of-distribution (OOD) +generalization. In this paper, we propose a framework called Pareto Invariant +Representation Learning (PaInvRL) to mitigate the impact of spurious +correlations from an IID-OOD multi-objective optimization perspective, by +learning invariant representations (intrinsic factors that attract user +attention) and variant representations (other factors) simultaneously. +Specifically, PaInvRL includes three iteratively executed modules: (i) +heterogeneous identification module, which identifies the heterogeneous +environments to reflect distributional shifts for user-item interactions; (ii) +invariant mask generation module, which learns invariant masks based on the +Pareto-optimal solutions that minimize the adaptive weighted Invariant Risk +Minimization (IRM) and Empirical Risk (ERM) losses; (iii) convert module, which +generates both variant representations and item-invariant representations for +training a multi-modal recommendation model that mitigates spurious +correlations and balances the generalization performance within and cross the +environmental distributions. We compare the proposed PaInvRL with +state-of-the-art recommendation models on three public multimedia +recommendation datasets (Movielens, Tiktok, and Kwai), and the experimental +results validate the effectiveness of PaInvRL for both within- and +cross-environmental learning. + +
+
+ comment: ACM MM 2023 full paper +
+
+
+
+
+
+
+
+ + Machine Learning 153 + +
+
+
+ + ☆ D4: Improving LLM Pretraining via Document De-Duplication and + Diversification + + +
+ Over recent years, an increasing amount of compute and data has been poured +into training large language models (LLMs), usually by doing one-pass learning +on as many tokens as possible randomly selected from large-scale web corpora. +While training on ever-larger portions of the internet leads to consistent +performance improvements, the size of these improvements diminishes with scale, +and there has been little work exploring the effect of data selection on +pre-training and downstream performance beyond simple de-duplication methods +such as MinHash. Here, we show that careful data selection (on top of +de-duplicated data) via pre-trained model embeddings can speed up training (20% +efficiency gains) and improves average downstream accuracy on 16 NLP tasks (up +to 2%) at the 6.7B model scale. Furthermore, we show that repeating data +intelligently consistently outperforms baseline training (while repeating +random data performs worse than baseline training). Our results indicate that +clever data selection can significantly improve LLM pre-training, calls into +question the common practice of training for a single epoch on as much data as +possible, and demonstrates a path to keep improving our models past the limits +of randomly sampling web data. + +
+
+
+
+
+ + ☆ Extended Linear Regression: A Kalman Filter Approach for Minimizing Loss + via Area Under the Curve + + +
+ This research enhances linear regression models by integrating a Kalman +filter and analysing curve areas to minimize loss. The goal is to develop an +optimal linear regression equation using stochastic gradient descent (SGD) for +weight updating. Our approach involves a stepwise process, starting with +user-defined parameters. The linear regression model is trained using SGD, +tracking weights and loss separately and zipping them finally. A Kalman filter +is then trained based on weight and loss arrays to predict the next +consolidated weights. Predictions result from multiplying input averages with +weights, evaluated for loss to form a weight-versus-loss curve. The curve's +equation is derived using the two-point formula, and area under the curve is +calculated via integration. The linear regression equation with minimum area +becomes the optimal curve for prediction. Benefits include avoiding constant +weight updates via gradient descent and working with partial datasets, unlike +methods needing the entire set. However, computational complexity should be +considered. The Kalman filter's accuracy might diminish beyond a certain +prediction range. + +
+
+
+
+
+ + ☆ On-Manifold Projected Gradient Descent + + +
+ This work provides a computable, direct, and mathematically rigorous +approximation to the differential geometry of class manifolds for +high-dimensional data, along with nonlinear projections from input space onto +these class manifolds. The tools are applied to the setting of neural network +image classifiers, where we generate novel, on-manifold data samples, and +implement a projected gradient descent algorithm for on-manifold adversarial +training. The susceptibility of neural networks (NNs) to adversarial attack +highlights the brittle nature of NN decision boundaries in input space. +Introducing adversarial examples during training has been shown to reduce the +susceptibility of NNs to adversarial attack; however, it has also been shown to +reduce the accuracy of the classifier if the examples are not valid examples +for that class. Realistic "on-manifold" examples have been previously generated +from class manifolds in the latent of an autoencoder. Our work explores these +phenomena in a geometric and computational setting that is much closer to the +raw, high-dimensional input space than can be provided by VAE or other black +box dimensionality reductions. We employ conformally invariant diffusion maps +(CIDM) to approximate class manifolds in diffusion coordinates, and develop the +Nystr\"{o}m projection to project novel points onto class manifolds in this +setting. On top of the manifold approximation, we leverage the spectral +exterior calculus (SEC) to determine geometric quantities such as tangent +vectors of the manifold. We use these tools to obtain adversarial examples that +reside on a class manifold, yet fool a classifier. These misclassifications +then become explainable in terms of human-understandable manipulations within +the data, by expressing the on-manifold adversary in the semantic basis on the +manifold. + +
+
+
+
+
+ + ☆ Language Reward Modulation for Pretraining Reinforcement Learning + + +
+ Using learned reward functions (LRFs) as a means to solve sparse-reward +reinforcement learning (RL) tasks has yielded some steady progress in +task-complexity through the years. In this work, we question whether today's +LRFs are best-suited as a direct replacement for task rewards. Instead, we +propose leveraging the capabilities of LRFs as a pretraining signal for RL. +Concretely, we propose $\textbf{LA}$nguage Reward $\textbf{M}$odulated +$\textbf{P}$retraining (LAMP) which leverages the zero-shot capabilities of +Vision-Language Models (VLMs) as a $\textit{pretraining}$ utility for RL as +opposed to a downstream task reward. LAMP uses a frozen, pretrained VLM to +scalably generate noisy, albeit shaped exploration rewards by computing the +contrastive alignment between a highly diverse collection of language +instructions and the image observations of an agent in its pretraining +environment. LAMP optimizes these rewards in conjunction with standard +novelty-seeking exploration rewards with reinforcement learning to acquire a +language-conditioned, pretrained policy. Our VLM pretraining approach, which is +a departure from previous attempts to use LRFs, can warmstart sample-efficient +learning on robot manipulation tasks in RLBench. + +
+
+ comment: Code available at https://github.com/ademiadeniji/lamp +
+
+
+
+
+ + ☆ FECoM: A Step towards Fine-Grained Energy Measurement for Deep Learning + + +
+ With the increasing usage, scale, and complexity of Deep Learning (DL) +models, their rapidly growing energy consumption has become a critical concern. +Promoting green development and energy awareness at different granularities is +the need of the hour to limit carbon emissions of DL systems. However, the lack +of standard and repeatable tools to accurately measure and optimize energy +consumption at a fine granularity (e.g., at method level) hinders progress in +this area. In this paper, we introduce FECoM (Fine-grained Energy Consumption +Meter), a framework for fine-grained DL energy consumption measurement. +Specifically, FECoM provides researchers and developers a mechanism to profile +DL APIs. FECoM addresses the challenges of measuring energy consumption at +fine-grained level by using static instrumentation and considering various +factors, including computational load and temperature stability. We assess +FECoM's capability to measure fine-grained energy consumption for one of the +most popular open-source DL frameworks, namely TensorFlow. Using FECoM, we also +investigate the impact of parameter size and execution time on energy +consumption, enriching our understanding of TensorFlow APIs' energy profiles. +Furthermore, we elaborate on the considerations, issues, and challenges that +one needs to consider while designing and implementing a fine-grained energy +consumption measurement tool. We hope this work will facilitate further +advances in DL energy measurement and the development of energy-aware practices +for DL systems. + +
+
+
+
+
+ + ☆ Learning from Negative User Feedback and Measuring Responsiveness for + Sequential Recommenders RecSys 2023 + + +
+ Sequential recommenders have been widely used in industry due to their +strength in modeling user preferences. While these models excel at learning a +user's positive interests, less attention has been paid to learning from +negative user feedback. Negative user feedback is an important lever of user +control, and comes with an expectation that recommenders should respond quickly +and reduce similar recommendations to the user. However, negative feedback +signals are often ignored in the training objective of sequential retrieval +models, which primarily aim at predicting positive user interactions. In this +work, we incorporate explicit and implicit negative user feedback into the +training objective of sequential recommenders in the retrieval stage using a +"not-to-recommend" loss function that optimizes for the log-likelihood of not +recommending items with negative feedback. We demonstrate the effectiveness of +this approach using live experiments on a large-scale industrial recommender +system. Furthermore, we address a challenge in measuring recommender +responsiveness to negative feedback by developing a counterfactual simulation +framework to compare recommender responses between different user actions, +showing improved responsiveness from the modeling change. + +
+
+ comment: RecSys 2023 Industry Track +
+
+
+
+
+ + ☆ How Safe Am I Given What I See? Calibrated Prediction of Safety Chances + for Image-Controlled Autonomy + + +
+ End-to-end learning has emerged as a major paradigm for developing autonomous +systems. Unfortunately, with its performance and convenience comes an even +greater challenge of safety assurance. A key factor of this challenge is the +absence of the notion of a low-dimensional and interpretable dynamical state, +around which traditional assurance methods revolve. Focusing on the online +safety prediction problem, this paper proposes a configurable family of +learning pipelines based on generative world models, which do not require +low-dimensional states. To implement these pipelines, we overcome the +challenges of learning safety-informed latent representations and missing +safety labels under prediction-induced distribution shift. These pipelines come +with statistical calibration guarantees on their safety chance predictions +based on conformal prediction. We perform an extensive evaluation of the +proposed learning pipelines on two case studies of image-controlled systems: a +racing car and a cartpole. + +
+
+
+
+
+ + ☆ How to Protect Copyright Data in Optimization of Large Language Models? + + +
+ Large language models (LLMs) and generative AI have played a transformative +role in computer research and applications. Controversy has arisen as to +whether these models output copyrighted data, which can occur if the data the +models are trained on is copyrighted. LLMs are built on the transformer neural +network architecture, which in turn relies on a mathematical computation called +Attention that uses the softmax function. + In this paper, we show that large language model training and optimization +can be seen as a softmax regression problem. We then establish a method of +efficiently performing softmax regression, in a way that prevents the +regression function from generating copyright data. This establishes a +theoretical method of training large language models in a way that avoids +generating copyright data. + +
+
+
+
+
+ + ☆ Multi-Objective Optimization for Sparse Deep Neural Network Training + + +
+ Different conflicting optimization criteria arise naturally in various Deep +Learning scenarios. These can address different main tasks (i.e., in the +setting of Multi-Task Learning), but also main and secondary tasks such as loss +minimization versus sparsity. The usual approach is a simple weighting of the +criteria, which formally only works in the convex setting. In this paper, we +present a Multi-Objective Optimization algorithm using a modified Weighted +Chebyshev scalarization for training Deep Neural Networks (DNNs) with respect +to several tasks. By employing this scalarization technique, the algorithm can +identify all optimal solutions of the original problem while reducing its +complexity to a sequence of single-objective problems. The simplified problems +are then solved using an Augmented Lagrangian method, enabling the use of +popular optimization techniques such as Adam and Stochastic Gradient Descent, +while efficaciously handling constraints. Our work aims to address the +(economical and also ecological) sustainability issue of DNN models, with a +particular focus on Deep Multi-Task models, which are typically designed with a +very large number of weights to perform equally well on multiple tasks. Through +experiments conducted on two Machine Learning datasets, we demonstrate the +possibility of adaptively sparsifying the model during training without +significantly impacting its performance, if we are willing to apply +task-specific adaptations to the network weights. Code is available at +https://github.com/salomonhotegni/MDMTN. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Critical Learning Periods Emerge Even in Deep Linear Networks + + +
+ Critical learning periods are periods early in development where temporary +sensory deficits can have a permanent effect on behavior and learned +representations. Despite the radical differences between biological and +artificial networks, critical learning periods have been empirically observed +in both systems. This suggests that critical periods may be fundamental to +learning and not an accident of biology. Yet, why exactly critical periods +emerge in deep networks is still an open question, and in particular it is +unclear whether the critical periods observed in both systems depend on +particular architectural or optimization details. To isolate the key underlying +factors, we focus on deep linear network models, and show that, surprisingly, +such networks also display much of the behavior seen in biology and artificial +networks, while being amenable to analytical treatment. We show that critical +periods depend on the depth of the model and structure of the data +distribution. We also show analytically and in simulations that the learning of +features is tied to competition between sources. Finally, we extend our +analysis to multi-task learning to show that pre-training on certain tasks can +damage the transfer performance on new tasks, and show how this depends on the +relationship between tasks and the duration of the pre-training stage. To the +best of our knowledge, our work provides the first analytically tractable model +that sheds light into why critical learning periods emerge in biological and +artificial networks. + +
+
+
+
+
+ + ☆ Diffusion Language Models Can Perform Many Tasks with Scaling and + Instruction-Finetuning + + +
+ The recent surge of generative AI has been fueled by the generative power of +diffusion probabilistic models and the scalable capabilities of large language +models. Despite their potential, it remains elusive whether diffusion language +models can solve general language tasks comparable to their autoregressive +counterparts. This paper demonstrates that scaling diffusion models w.r.t. +data, sizes, and tasks can effectively make them strong language learners. We +build competent diffusion language models at scale by first acquiring knowledge +from massive data via masked language modeling pretraining thanks to their +intrinsic connections. We then reprogram pretrained masked language models into +diffusion language models via diffusive adaptation, wherein task-specific +finetuning and instruction finetuning are explored to unlock their versatility +in solving general language tasks. Experiments show that scaling diffusion +language models consistently improves performance across downstream language +tasks. We further discover that instruction finetuning can elicit zero-shot and +few-shot in-context learning abilities that help tackle many unseen tasks by +following natural language instructions, and show promise in advanced and +challenging abilities such as reasoning + +
+
+
+
+
+ + ☆ The Challenges of Machine Learning for Trust and Safety: A Case Study on + Misinformation Detection + + +
+ We examine the disconnect between scholarship and practice in applying +machine learning to trust and safety problems, using misinformation detection +as a case study. We systematize literature on automated detection of +misinformation across a corpus of 270 well-cited papers in the field. We then +examine subsets of papers for data and code availability, design missteps, +reproducibility, and generalizability. We find significant shortcomings in the +literature that call into question claimed performance and practicality. +Detection tasks are often meaningfully distinct from the challenges that online +services actually face. Datasets and model evaluation are often +non-representative of real-world contexts, and evaluation frequently is not +independent of model training. Data and code availability is poor. Models do +not generalize well to out-of-domain data. Based on these results, we offer +recommendations for evaluating machine learning applications to trust and +safety problems. Our aim is for future work to avoid the pitfalls that we +identify. + +
+
+
+
+
+ + ☆ Learning to Learn Financial Networks for Optimising Momentum Strategies + + +
+ Network momentum provides a novel type of risk premium, which exploits the +interconnections among assets in a financial network to predict future returns. +However, the current process of constructing financial networks relies heavily +on expensive databases and financial expertise, limiting accessibility for +small-sized and academic institutions. Furthermore, the traditional approach +treats network construction and portfolio optimisation as separate tasks, +potentially hindering optimal portfolio performance. To address these +challenges, we propose L2GMOM, an end-to-end machine learning framework that +simultaneously learns financial networks and optimises trading signals for +network momentum strategies. The model of L2GMOM is a neural network with a +highly interpretable forward propagation architecture, which is derived from +algorithm unrolling. The L2GMOM is flexible and can be trained with diverse +loss functions for portfolio performance, e.g. the negative Sharpe ratio. +Backtesting on 64 continuous future contracts demonstrates a significant +improvement in portfolio profitability and risk control, with a Sharpe ratio of +1.74 across a 20-year period. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ ULDP-FL: Federated Learning with Across Silo User-Level Differential + Privacy + + +
+ Differentially Private Federated Learning (DP-FL) has garnered attention as a +collaborative machine learning approach that ensures formal privacy. Most DP-FL +approaches ensure DP at the record-level within each silo for cross-silo FL. +However, a single user's data may extend across multiple silos, and the desired +user-level DP guarantee for such a setting remains unknown. In this study, we +present ULDP-FL, a novel FL framework designed to guarantee user-level DP in +cross-silo FL where a single user's data may belong to multiple silos. Our +proposed algorithm directly ensures user-level DP through per-user weighted +clipping, departing from group-privacy approaches. We provide a theoretical +analysis of the algorithm's privacy and utility. Additionally, we enhance the +algorithm's utility and showcase its private implementation using cryptographic +building blocks. Empirical experiments on real-world datasets show substantial +improvements in our methods in privacy-utility trade-offs under user-level DP +compared to baseline methods. To the best of our knowledge, our work is the +first FL framework that effectively provides user-level DP in the general +cross-silo FL setting. + +
+
+
+
+
+ + ☆ Curriculum Learning with Adam: The Devil Is in the Wrong Details + + +
+ Curriculum learning (CL) posits that machine learning models -- similar to +humans -- may learn more efficiently from data that match their current +learning progress. However, CL methods are still poorly understood and, in +particular for natural language processing (NLP), have achieved only limited +success. In this paper, we explore why. Starting from an attempt to replicate +and extend a number of recent curriculum methods, we find that their results +are surprisingly brittle when applied to NLP. A deep dive into the +(in)effectiveness of the curricula in some scenarios shows us why: when +curricula are employed in combination with the popular Adam optimisation +algorithm, they oftentimes learn to adapt to suboptimally chosen optimisation +parameters for this algorithm. We present a number of different case studies +with different common hand-crafted and automated CL approaches to illustrate +this phenomenon, and we find that none of them outperforms optimisation with +only Adam with well-chosen hyperparameters. As such, our results contribute to +understanding why CL methods work, but at the same time urge caution when +claiming positive results. + +
+
+
+
+
+ + ☆ Self-Supervised Knowledge-Driven Deep Learning for 3D Magnetic Inversion + + +
+ The magnetic inversion method is one of the non-destructive geophysical +methods, which aims to estimate the subsurface susceptibility distribution from +surface magnetic anomaly data. Recently, supervised deep learning methods have +been widely utilized in lots of geophysical fields including magnetic +inversion. However, these methods rely heavily on synthetic training data, +whose performance is limited since the synthetic data is not independently and +identically distributed with the field data. Thus, we proposed to realize +magnetic inversion by self-supervised deep learning. The proposed +self-supervised knowledge-driven 3D magnetic inversion method (SSKMI) learns on +the target field data by a closed loop of the inversion and forward models. +Given that the parameters of the forward model are preset, SSKMI can optimize +the inversion model by minimizing the mean absolute error between observed and +re-estimated surface magnetic anomalies. Besides, there is a knowledge-driven +module in the proposed inversion model, which makes the deep learning method +more explicable. Meanwhile, comparative experiments demonstrate that the +knowledge-driven module can accelerate the training of the proposed method and +achieve better results. Since magnetic inversion is an ill-pose task, SSKMI +proposed to constrain the inversion model by a guideline in the auxiliary loop. +The experimental results demonstrate that the proposed method is a reliable +magnetic inversion method with outstanding performance. + +
+
+ comment: 11 pages, 14 figures +
+
+
+
+
+ + ☆ Robustness Analysis of Continuous-Depth Models with Lagrangian + Techniques + + +
+ This paper presents, in a unified fashion, deterministic as well as +statistical Lagrangian-verification techniques. They formally quantify the +behavioral robustness of any time-continuous process, formulated as a +continuous-depth model. To this end, we review LRT-NG, SLR, and GoTube, +algorithms for constructing a tight reachtube, that is, an over-approximation +of the set of states reachable within a given time-horizon, and provide +guarantees for the reachtube bounds. We compare the usage of the variational +equations, associated to the system equations, the mean value theorem, and the +Lipschitz constants, in achieving deterministic and statistical guarantees. In +LRT-NG, the Lipschitz constant is used as a bloating factor of the initial +perturbation, to compute the radius of an ellipsoid in an optimal metric, which +over-approximates the set of reachable states. In SLR and GoTube, we get +statistical guarantees, by using the Lipschitz constants to compute local balls +around samples. These are needed to calculate the probability of having found +an upper bound, of the true maximum perturbation at every timestep. Our +experiments demonstrate the superior performance of Lagrangian techniques, when +compared to LRT, Flow*, and CAPD, and illustrate their use in the robustness +analysis of various continuous-depth models. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2107.08467 +
+
+
+
+
+ + ☆ Development and external validation of a lung cancer risk estimation + tool using gradient-boosting + + +
+ Lung cancer is a significant cause of mortality worldwide, emphasizing the +importance of early detection for improved survival rates. In this study, we +propose a machine learning (ML) tool trained on data from the PLCO Cancer +Screening Trial and validated on the NLST to estimate the likelihood of lung +cancer occurrence within five years. The study utilized two datasets, the PLCO +(n=55,161) and NLST (n=48,595), consisting of comprehensive information on risk +factors, clinical measurements, and outcomes related to lung cancer. Data +preprocessing involved removing patients who were not current or former smokers +and those who had died of causes unrelated to lung cancer. Additionally, a +focus was placed on mitigating bias caused by censored data. Feature selection, +hyper-parameter optimization, and model calibration were performed using +XGBoost, an ensemble learning algorithm that combines gradient boosting and +decision trees. The ML model was trained on the pre-processed PLCO dataset and +tested on the NLST dataset. The model incorporated features such as age, +gender, smoking history, medical diagnoses, and family history of lung cancer. +The model was well-calibrated (Brier score=0.044). ROC-AUC was 82% on the PLCO +dataset and 70% on the NLST dataset. PR-AUC was 29% and 11% respectively. When +compared to the USPSTF guidelines for lung cancer screening, our model provided +the same recall with a precision of 13.1% vs. 9.3% on the PLCO dataset and 3.2% +vs. 3.1% on the NLST dataset. The developed ML tool provides a freely available +web application for estimating the likelihood of developing lung cancer within +five years. By utilizing risk factors and clinical data, individuals can assess +their risk and make informed decisions regarding lung cancer screening. This +research contributes to the efforts in early detection and prevention +strategies, aiming to reduce lung cancer-related mortality rates. + +
+
+ comment: 14 pages, 4 figures, 4 tables, 1 Github repository, see + http://github.com/plbenveniste/LungCancerRisk +
+
+
+
+
+ + ☆ Unsupervised anomalies detection in IIoT edge devices networks using + federated learning + + +
+ In a connection of many IoT devices that each collect data, normally training +a machine learning model would involve transmitting the data to a central +server which requires strict privacy rules. However, some owners are reluctant +of availing their data out of the company due to data security concerns. +Federated learning(FL) as a distributed machine learning approach performs +training of a machine learning model on the device that gathered the data +itself. In this scenario, data is not share over the network for training +purpose. Fedavg as one of FL algorithms permits a model to be copied to +participating devices during a training session. The devices could be chosen at +random, and a device can be aborted. The resulting models are sent to the +coordinating server and then average models from the devices that finished +training. The process is repeated until a desired model accuracy is achieved. +By doing this, FL approach solves the privacy problem for IoT/ IIoT devices +that held sensitive data for the owners. In this paper, we leverage the +benefits of FL and implemented Fedavg algorithm on a recent dataset that +represent the modern IoT/ IIoT device networks. The results were almost the +same as the centralized machine learning approach. We also evaluated some +shortcomings of Fedavg such as unfairness that happens during the training when +struggling devices do not participate for every stage of training. This +inefficient training of local or global model could lead in a high number of +false alarms in intrusion detection systems for IoT/IIoT gadgets developed +using Fedavg. Hence, after evaluating the FedAv deep auto encoder with +centralized deep auto encoder ML, we further proposed and designed a Fair +Fedavg algorithm that will be evaluated in the future work. + +
+
+ comment: Accepted for PuBlication in machine learning journals +
+
+
+
+
+ + ☆ Data-driven decision-focused surrogate modeling + + +
+ We introduce the concept of decision-focused surrogate modeling for solving +computationally challenging nonlinear optimization problems in real-time +settings. The proposed data-driven framework seeks to learn a simpler, e.g. +convex, surrogate optimization model that is trained to minimize the decision +prediction error, which is defined as the difference between the optimal +solutions of the original and the surrogate optimization models. The learning +problem, formulated as a bilevel program, can be viewed as a data-driven +inverse optimization problem to which we apply a decomposition-based solution +algorithm from previous work. We validate our framework through numerical +experiments involving the optimization of common nonlinear chemical processes +such as chemical reactors, heat exchanger networks, and material blending +systems. We also present a detailed comparison of decision-focused surrogate +modeling with standard data-driven surrogate modeling methods and demonstrate +that our approach is significantly more data-efficient while producing simple +surrogate models with high decision prediction accuracy. + +
+
+
+
+
+ + ☆ A Probabilistic Fluctuation based Membership Inference Attack for + Generative Models + + +
+ Membership Inference Attack (MIA) identifies whether a record exists in a +machine learning model's training set by querying the model. MIAs on the +classic classification models have been well-studied, and recent works have +started to explore how to transplant MIA onto generative models. Our +investigation indicates that existing MIAs designed for generative models +mainly depend on the overfitting in target models. However, overfitting can be +avoided by employing various regularization techniques, whereas existing MIAs +demonstrate poor performance in practice. Unlike overfitting, memorization is +essential for deep learning models to attain optimal performance, making it a +more prevalent phenomenon. Memorization in generative models leads to an +increasing trend in the probability distribution of generating records around +the member record. Therefore, we propose a Probabilistic Fluctuation Assessing +Membership Inference Attack (PFAMI), a black-box MIA that infers memberships by +detecting these trends via analyzing the overall probabilistic fluctuations +around given records. We conduct extensive experiments across multiple +generative models and datasets, which demonstrate PFAMI can improve the attack +success rate (ASR) by about 27.9% when compared with the best baseline. + +
+
+
+
+
+ + ☆ Masking Strategies for Background Bias Removal in Computer Vision Models ICCV + + +
+ Models for fine-grained image classification tasks, where the difference +between some classes can be extremely subtle and the number of samples per +class tends to be low, are particularly prone to picking up background-related +biases and demand robust methods to handle potential examples with +out-of-distribution (OOD) backgrounds. To gain deeper insights into this +critical problem, our research investigates the impact of background-induced +bias on fine-grained image classification, evaluating standard backbone models +such as Convolutional Neural Network (CNN) and Vision Transformers (ViT). We +explore two masking strategies to mitigate background-induced bias: Early +masking, which removes background information at the (input) image level, and +late masking, which selectively masks high-level spatial features corresponding +to the background. Extensive experiments assess the behavior of CNN and ViT +models under different masking strategies, with a focus on their generalization +to OOD backgrounds. The obtained findings demonstrate that both proposed +strategies enhance OOD performance compared to the baseline models, with early +masking consistently exhibiting the best OOD performance. Notably, a ViT +variant employing GAP-Pooled Patch token-based classification combined with +early masking achieves the highest OOD robustness. + +
+
+ comment: Accepted at the 2023 IEEE/CVF International Conference on Computer + Vision Workshop (ICCVW) on Out Of Distribution Generalization in Computer + Vision (OOD-CV) +
+
+
+
+
+ + ☆ An Accelerated Block Proximal Framework with Adaptive Momentum for + Nonconvex and Nonsmooth Optimization + + +
+ We propose an accelerated block proximal linear framework with adaptive +momentum (ABPL$^+$) for nonconvex and nonsmooth optimization. We analyze the +potential causes of the extrapolation step failing in some algorithms, and +resolve this issue by enhancing the comparison process that evaluates the +trade-off between the proximal gradient step and the linear extrapolation step +in our algorithm. Furthermore, we extends our algorithm to any scenario +involving updating block variables with positive integers, allowing each cycle +to randomly shuffle the update order of the variable blocks. Additionally, +under mild assumptions, we prove that ABPL$^+$ can monotonically decrease the +function value without strictly restricting the extrapolation parameters and +step size, demonstrates the viability and effectiveness of updating these +blocks in a random order, and we also more obviously and intuitively +demonstrate that the derivative set of the sequence generated by our algorithm +is a critical point set. Moreover, we demonstrate the global convergence as +well as the linear and sublinear convergence rates of our algorithm by +utilizing the Kurdyka-Lojasiewicz (K{\L}) condition. To enhance the +effectiveness and flexibility of our algorithm, we also expand the study to the +imprecise version of our algorithm and construct an adaptive extrapolation +parameter strategy, which improving its overall performance. We apply our +algorithm to multiple non-negative matrix factorization with the $\ell_0$ norm, +nonnegative tensor decomposition with the $\ell_0$ norm, and perform extensive +numerical experiments to validate its effectiveness and efficiency. + +
+
+
+
+
+ + ☆ An Open-Source ML-Based Full-Stack Optimization Framework for Machine + Learning Accelerators + + +
+ Parameterizable machine learning (ML) accelerators are the product of recent +breakthroughs in ML. To fully enable their design space exploration (DSE), we +propose a physical-design-driven, learning-based prediction framework for +hardware-accelerated deep neural network (DNN) and non-DNN ML algorithms. It +adopts a unified approach that combines backend power, performance, and area +(PPA) analysis with frontend performance simulation, thereby achieving a +realistic estimation of both backend PPA and system metrics such as runtime and +energy. In addition, our framework includes a fully automated DSE technique, +which optimizes backend and system metrics through an automated search of +architectural and backend parameters. Experimental studies show that our +approach consistently predicts backend PPA and system metrics with an average +7% or less prediction error for the ASIC implementation of two deep learning +accelerator platforms, VTA and VeriGOOD-ML, in both a commercial 12 nm process +and a research-oriented 45 nm process. + +
+
+ comment: This is an extended version of our work titled "Physically Accurate + Learning-based Performance Prediction of Hardware-accelerated ML Algorithms" + published in MLCAD 2022 +
+
+
+
+
+ + ☆ Less is More -- Towards parsimonious multi-task models using structured + sparsity + + +
+ Group sparsity in Machine Learning (ML) encourages simpler, more +interpretable models with fewer active parameter groups. This work aims to +incorporate structured group sparsity into the shared parameters of a +Multi-Task Learning (MTL) framework, to develop parsimonious models that can +effectively address multiple tasks with fewer parameters while maintaining +comparable or superior performance to a dense model. Sparsifying the model +during training helps decrease the model's memory footprint, computation +requirements, and prediction time during inference. We use channel-wise l1/l2 +group sparsity in the shared layers of the Convolutional Neural Network (CNN). +This approach not only facilitates the elimination of extraneous groups +(channels) but also imposes a penalty on the weights, thereby enhancing the +learning of all tasks. We compare the outcomes of single-task and multi-task +experiments under group sparsity on two publicly available MTL datasets, NYU-v2 +and CelebAMask-HQ. We also investigate how changing the sparsification degree +impacts both the performance of the model and the sparsity of groups. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Generalized Continual Category Discovery + + +
+ Most of Continual Learning (CL) methods push the limit of supervised learning +settings, where an agent is expected to learn new labeled tasks and not forget +previous knowledge. However, these settings are not well aligned with real-life +scenarios, where a learning agent has access to a vast amount of unlabeled data +encompassing both novel (entirely unlabeled) classes and examples from known +classes. Drawing inspiration from Generalized Category Discovery (GCD), we +introduce a novel framework that relaxes this assumption. Precisely, in any +task, we allow for the existence of novel and known classes, and one must use +continual version of unsupervised learning methods to discover them. We call +this setting Generalized Continual Category Discovery (GCCD). It unifies CL and +GCD, bridging the gap between synthetic benchmarks and real-life scenarios. +With a series of experiments, we present that existing methods fail to +accumulate knowledge from subsequent tasks in which unlabeled samples of novel +classes are present. In light of these limitations, we propose a method that +incorporates both supervised and unsupervised signals and mitigates the +forgetting through the use of centroid adaptation. Our method surpasses strong +CL methods adopted for GCD techniques and presents a superior representation +learning performance. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Quantifying degeneracy in singular models via the learning coefficient + + +
+ Deep neural networks (DNN) are singular statistical models which exhibit +complex degeneracies. In this work, we illustrate how a quantity known as the +\emph{learning coefficient} introduced in singular learning theory quantifies +precisely the degree of degeneracy in deep neural networks. Importantly, we +will demonstrate that degeneracy in DNN cannot be accounted for by simply +counting the number of "flat" directions. We propose a computationally scalable +approximation of a localized version of the learning coefficient using +stochastic gradient Langevin dynamics. To validate our approach, we demonstrate +its accuracy in low-dimensional models with known theoretical values. +Importantly, the local learning coefficient can correctly recover the ordering +of degeneracy between various parameter regions of interest. An experiment on +MNIST shows the local learning coefficient can reveal the inductive bias of +stochastic opitmizers for more or less degenerate critical points. + +
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ Cached Operator Reordering: A Unified View for Fast GNN Training + + +
+ Graph Neural Networks (GNNs) are a powerful tool for handling structured +graph data and addressing tasks such as node classification, graph +classification, and clustering. However, the sparse nature of GNN computation +poses new challenges for performance optimization compared to traditional deep +neural networks. We address these challenges by providing a unified view of GNN +computation, I/O, and memory. By analyzing the computational graphs of the +Graph Convolutional Network (GCN) and Graph Attention (GAT) layers -- two +widely used GNN layers -- we propose alternative computation strategies. We +present adaptive operator reordering with caching, which achieves a speedup of +up to 2.43x for GCN compared to the current state-of-the-art. Furthermore, an +exploration of different caching schemes for GAT yields a speedup of up to +1.94x. The proposed optimizations save memory, are easily implemented across +various hardware platforms, and have the potential to alleviate performance +bottlenecks in training large-scale GNN models. + +
+
+
+
+
+ + ☆ Stabilizing RNN Gradients through Pre-training + + +
+ Numerous theories of learning suggest to prevent the gradient variance from +exponential growth with depth or time, to stabilize and improve training. +Typically, these analyses are conducted on feed-forward fully-connected neural +networks or single-layer recurrent neural networks, given their mathematical +tractability. In contrast, this study demonstrates that pre-training the +network to local stability can be effective whenever the architectures are too +complex for an analytical initialization. Furthermore, we extend known +stability theories to encompass a broader family of deep recurrent networks, +requiring minimal assumptions on data and parameter distribution, a theory that +we refer to as the Local Stability Condition (LSC). Our investigation reveals +that the classical Glorot, He, and Orthogonal initialization schemes satisfy +the LSC when applied to feed-forward fully-connected neural networks. However, +analysing deep recurrent networks, we identify a new additive source of +exponential explosion that emerges from counting gradient paths in a +rectangular grid in depth and time. We propose a new approach to mitigate this +issue, that consists on giving a weight of a half to the time and depth +contributions to the gradient, instead of the classical weight of one. Our +empirical results confirm that pre-training both feed-forward and recurrent +networks to fulfill the LSC often results in improved final performance across +models. This study contributes to the field by providing a means to stabilize +networks of any complexity. Our approach can be implemented as an additional +step before pre-training on large augmented datasets, and as an alternative to +finding stable initializations analytically. + +
+
+
+
+
+ + ☆ Identifying Reaction-Aware Driving Styles of Stochastic Model Predictive + Controlled Vehicles by Inverse Reinforcement Learning + + +
+ The driving style of an Autonomous Vehicle (AV) refers to how it behaves and +interacts with other AVs. In a multi-vehicle autonomous driving system, an AV +capable of identifying the driving styles of its nearby AVs can reliably +evaluate the risk of collisions and make more reasonable driving decisions. +However, there has not been a consistent definition of driving styles for an AV +in the literature, although it is considered that the driving style is encoded +in the AV's trajectories and can be identified using Maximum Entropy Inverse +Reinforcement Learning (ME-IRL) methods as a cost function. Nevertheless, an +important indicator of the driving style, i.e., how an AV reacts to its nearby +AVs, is not fully incorporated in the feature design of previous ME-IRL +methods. In this paper, we describe the driving style as a cost function of a +series of weighted features. We design additional novel features to capture the +AV's reaction-aware characteristics. Then, we identify the driving styles from +the demonstration trajectories generated by the Stochastic Model Predictive +Control (SMPC) using a modified ME-IRL method with our newly proposed features. +The proposed method is validated using MATLAB simulation and an off-the-shelf +experiment. + +
+
+
+
+
+ + ☆ InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4 + + +
+ Multimodal large language models acquire their instruction-following +capabilities through a two-stage training process: pre-training on image-text +pairs and fine-tuning on supervised vision-language instruction data. Recent +studies have shown that large language models can achieve satisfactory results +even with a limited amount of high-quality instruction-following data. In this +paper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset +comprising only 200 examples, amounting to approximately 6% of the +instruction-following data used in the alignment dataset for MiniGPT-4. We +first propose several metrics to access the quality of multimodal instruction +data. Based on these metrics, we present a simple and effective data selector +to automatically identify and filter low-quality vision-language data. By +employing this method, InstructionGPT-4 outperforms the original MiniGPT-4 on +various evaluations (e.g., visual question answering, GPT-4 preference). +Overall, our findings demonstrate that less but high-quality instruction tuning +data is efficient to enable multimodal large language models to generate better +output. + +
+
+
+
+
+ + ☆ Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable + Mixture-of-Expert Inference + + +
+ Large language models (LLMs) based on transformers have made significant +strides in recent years, the success of which is driven by scaling up their +model size. Despite their high algorithmic performance, the computational and +memory requirements of LLMs present unprecedented challenges. To tackle the +high compute requirements of LLMs, the Mixture-of-Experts (MoE) architecture +was introduced which is able to scale its model size without proportionally +scaling up its computational requirements. Unfortunately, MoE's high memory +demands and dynamic activation of sparse experts restrict its applicability to +real-world problems. Previous solutions that offload MoE's memory-hungry expert +parameters to CPU memory fall short because the latency to migrate activated +experts from CPU to GPU incurs high performance overhead. Our proposed +Pre-gated MoE system effectively tackles the compute and memory challenges of +conventional MoE architectures using our algorithm-system co-design. Pre-gated +MoE employs our novel pre-gating function which alleviates the dynamic nature +of sparse expert activation, allowing our proposed system to address the large +memory footprint of MoEs while also achieving high performance. We demonstrate +that Pre-gated MoE is able to improve performance, reduce GPU memory +consumption, while also maintaining the same level of model quality. These +features allow our Pre-gated MoE system to cost-effectively deploy large-scale +LLMs using just a single GPU with high performance. + +
+
+
+
+
+ + ☆ Ensembling Uncertainty Measures to Improve Safety of Black-Box + Classifiers ECAI23 + + +
+ Machine Learning (ML) algorithms that perform classification may predict the +wrong class, experiencing misclassifications. It is well-known that +misclassifications may have cascading effects on the encompassing system, +possibly resulting in critical failures. This paper proposes SPROUT, a Safety +wraPper thROugh ensembles of UncertainTy measures, which suspects +misclassifications by computing uncertainty measures on the inputs and outputs +of a black-box classifier. If a misclassification is detected, SPROUT blocks +the propagation of the output of the classifier to the encompassing system. The +resulting impact on safety is that SPROUT transforms erratic outputs +(misclassifications) into data omission failures, which can be easily managed +at the system level. SPROUT has a broad range of applications as it fits binary +and multi-class classification, comprising image and tabular datasets. We +experimentally show that SPROUT always identifies a huge fraction of the +misclassifications of supervised classifiers, and it is able to detect all +misclassifications in specific cases. SPROUT implementation contains +pre-trained wrappers, it is publicly available and ready to be deployed with +minimal effort. + +
+
+ comment: To appear at ECAI23 in October23 +
+
+
+
+
+ + ☆ HarvestNet: A Dataset for Detecting Smallholder Farming Activity Using + Harvest Piles and Remote Sensing + + +
+ Small farms contribute to a large share of the productive land in developing +countries. In regions such as sub-Saharan Africa, where 80% of farms are small +(under 2 ha in size), the task of mapping smallholder cropland is an important +part of tracking sustainability measures such as crop productivity. However, +the visually diverse and nuanced appearance of small farms has limited the +effectiveness of traditional approaches to cropland mapping. Here we introduce +a new approach based on the detection of harvest piles characteristic of many +smallholder systems throughout the world. We present HarvestNet, a dataset for +mapping the presence of farms in the Ethiopian regions of Tigray and Amhara +during 2020-2023, collected using expert knowledge and satellite images, +totaling 7k hand-labeled images and 2k ground collected labels. We also +benchmark a set of baselines including SOTA models in remote sensing with our +best models having around 80% classification performance on hand labelled data +and 90%, 98% accuracy on ground truth data for Tigray, Amhara respectively. We +also perform a visual comparison with a widely used pre-existing coverage map +and show that our model detects an extra 56,621 hectares of cropland in Tigray. +We conclude that remote sensing of harvest piles can contribute to more timely +and accurate cropland assessments in food insecure region. + +
+
+ comment: 18 pages, 22 figures +
+
+
+
+
+ + ☆ Manipulating Embeddings of Stable Diffusion Prompts + + +
+ Generative text-to-image models such as Stable Diffusion allow users to +generate images based on a textual description, the prompt. Changing the prompt +is still the primary means for the user to change a generated image as desired. +However, changing the image by reformulating the prompt remains a difficult +process of trial and error, which has led to the emergence of prompt +engineering as a new field of research. We propose and analyze methods to +change the embedding of a prompt directly instead of the prompt text. It allows +for more fine-grained and targeted control that takes into account user +intentions. Our approach treats the generative text-to-image model as a +continuous function and passes gradients between the image space and the prompt +embedding space. By addressing different user interaction problems, we can +apply this idea in three scenarios: (1) Optimization of a metric defined in +image space that could measure, for example, image style. (2) Assistance of +users in creative tasks by enabling them to navigate the image space along a +selection of directions of "near" prompt embeddings. (3) Changing the embedding +of the prompt to include information that the user has seen in a particular +seed but finds difficult to describe in the prompt. Our experiments demonstrate +the feasibility of the described methods. + +
+
+
+
+
+ + ☆ Sample Complexity of Robust Learning against Evasion Attacks + + +
+ It is becoming increasingly important to understand the vulnerability of +machine learning models to adversarial attacks. One of the fundamental problems +in adversarial machine learning is to quantify how much training data is needed +in the presence of evasion attacks, where data is corrupted at test time. In +this thesis, we work with the exact-in-the-ball notion of robustness and study +the feasibility of adversarially robust learning from the perspective of +learning theory, considering sample complexity. + We first explore the setting where the learner has access to random examples +only, and show that distributional assumptions are essential. We then focus on +learning problems with distributions on the input data that satisfy a Lipschitz +condition and show that robustly learning monotone conjunctions has sample +complexity at least exponential in the adversary's budget (the maximum number +of bits it can perturb on each input). However, if the adversary is restricted +to perturbing $O(\log n)$ bits, then one can robustly learn conjunctions and +decision lists w.r.t. log-Lipschitz distributions. + We then study learning models where the learner is given more power. We first +consider local membership queries, where the learner can query the label of +points near the training sample. We show that, under the uniform distribution, +the exponential dependence on the adversary's budget to robustly learn +conjunctions remains inevitable. We then introduce a local equivalence query +oracle, which returns whether the hypothesis and target concept agree in a +given region around a point in the training sample, and a counterexample if it +exists. We show that if the query radius is equal to the adversary's budget, we +can develop robust empirical risk minimization algorithms in the +distribution-free setting. We give general query complexity upper and lower +bounds, as well as for concrete concept classes. + +
+
+ comment: DPhil (PhD) Thesis - University of Oxford +
+
+
+
+
+ + ☆ Layer-wise Feedback Propagation + + +
+ In this paper, we present Layer-wise Feedback Propagation (LFP), a novel +training approach for neural-network-like predictors that utilizes +explainability, specifically Layer-wise Relevance Propagation(LRP), to assign +rewards to individual connections based on their respective contributions to +solving a given task. This differs from traditional gradient descent, which +updates parameters towards anestimated loss minimum. LFP distributes a reward +signal throughout the model without the need for gradient computations. It then +strengthens structures that receive positive feedback while reducingthe +influence of structures that receive negative feedback. We establish the +convergence of LFP theoretically and empirically, and demonstrate its +effectiveness in achieving comparable performance to gradient descent on +various models and datasets. Notably, LFP overcomes certain limitations +associated with gradient-based methods, such as reliance on meaningful +derivatives. We further investigate how the different LRP-rules can be extended +to LFP, what their effects are on training, as well as potential applications, +such as training models with no meaningful derivatives, e.g., step-function +activated Spiking Neural Networks (SNNs), or for transfer learning, to +efficiently utilize existing knowledge. + +
+
+
+
+
+ + ☆ A multiobjective continuation method to compute the regularization path + of deep neural networks + + +
+ Sparsity is a highly desired feature in deep neural networks (DNNs) since it +ensures numerical efficiency, improves the interpretability of models (due to +the smaller number of relevant features), and robustness. In machine learning +approaches based on linear models, it is well known that there exists a +connecting path between the sparsest solution in terms of the $\ell^1$ norm +(i.e., zero weights) and the non-regularized solution, which is called the +regularization path. Very recently, there was a first attempt to extend the +concept of regularization paths to DNNs by means of treating the empirical loss +and sparsity ($\ell^1$ norm) as two conflicting criteria and solving the +resulting multiobjective optimization problem. However, due to the +non-smoothness of the $\ell^1$ norm and the high number of parameters, this +approach is not very efficient from a computational perspective. To overcome +this limitation, we present an algorithm that allows for the approximation of +the entire Pareto front for the above-mentioned objectives in a very efficient +manner. We present numerical examples using both deterministic and stochastic +gradients. We furthermore demonstrate that knowledge of the regularization path +allows for a well-generalizing network parametrization. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ IncreLoRA: Incremental Parameter Allocation Method for + Parameter-Efficient Fine-tuning + + +
+ With the increasing size of pre-trained language models (PLMs), fine-tuning +all the parameters in the model is not efficient, especially when there are a +large number of downstream tasks, which incur significant training and storage +costs. Many parameter-efficient fine-tuning (PEFT) approaches have been +proposed, among which, Low-Rank Adaptation (LoRA) is a representative approach +that injects trainable rank decomposition matrices into every target module. +Yet LoRA ignores the importance of parameters in different modules. To address +this problem, many works have been proposed to prune the parameters of LoRA. +However, under limited training conditions, the upper bound of the rank of the +pruned parameter matrix is still affected by the preset values. We, therefore, +propose IncreLoRA, an incremental parameter allocation method that adaptively +adds trainable parameters during training based on the importance scores of +each module. This approach is different from the pruning method as it is not +limited by the initial number of training parameters, and each parameter matrix +has a higher rank upper bound for the same training overhead. We conduct +extensive experiments on GLUE to demonstrate the effectiveness of IncreLoRA. +The results show that our method owns higher parameter efficiency, especially +when under the low-resource settings where our method significantly outperforms +the baselines. Our code is publicly available. + +
+
+
+
+
+ + ☆ CACTUS: a Comprehensive Abstraction and Classification Tool for + Uncovering Structures + + +
+ The availability of large data sets is providing an impetus for driving +current artificial intelligent developments. There are, however, challenges for +developing solutions with small data sets due to practical and cost-effective +deployment and the opacity of deep learning models. The Comprehensive +Abstraction and Classification Tool for Uncovering Structures called CACTUS is +presented for improved secure analytics by effectively employing explainable +artificial intelligence. It provides additional support for categorical +attributes, preserving their original meaning, optimising memory usage, and +speeding up the computation through parallelisation. It shows to the user the +frequency of the attributes in each class and ranks them by their +discriminative power. Its performance is assessed by application to the +Wisconsin diagnostic breast cancer and Thyroid0387 data sets. + +
+
+
+
+
+ + ☆ Prompt-Based Length Controlled Generation with Reinforcement Learning + + +
+ Recently, large language models (LLMs) like ChatGPT and GPT-4 have attracted +great attention given their surprising improvement and performance. Length +controlled generation of LLMs emerges as an important topic, which also enables +users to fully leverage the capability of LLMs in more real-world scenarios +like generating a proper answer or essay of a desired length. In addition, the +autoregressive generation in LLMs is extremely time-consuming, while the +ability of controlling this generated length can arbitrarily reduce the +inference cost by limiting the length, and thus satisfy different needs. +Therefore, we aim to propose a prompt-based length control method to achieve +this length controlled generation, which can also be widely applied in +GPT-style LLMs. In particular, we adopt reinforcement learning with the reward +signal given by either trainable or rule-based reward model, which further +affects the generation of LLMs via rewarding a pre-defined target length. +Experiments show that our method significantly improves the accuracy of +prompt-based length control for summarization task on popular datasets like +CNNDM and NYT. We believe this length-controllable ability can provide more +potentials towards the era of LLMs. + +
+
+
+
+
+ + ☆ A Scale-Invariant Task Balancing Approach for Multi-Task Learning + + +
+ Multi-task learning (MTL), a learning paradigm to learn multiple related +tasks simultaneously, has achieved great success in various fields. However, +task-balancing remains a significant challenge in MTL, with the disparity in +loss/gradient scales often leading to performance compromises. In this paper, +we propose a Scale-Invariant Multi-Task Learning (SI-MTL) method to alleviate +the task-balancing problem from both loss and gradient perspectives. +Specifically, SI-MTL contains a logarithm transformation which is performed on +all task losses to ensure scale-invariant at the loss level, and a gradient +balancing method, SI-G, which normalizes all task gradients to the same +magnitude as the maximum gradient norm. Extensive experiments conducted on +several benchmark datasets consistently demonstrate the effectiveness of SI-G +and the state-of-the-art performance of SI-MTL. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Bias-Aware Minimisation: Understanding and Mitigating Estimator Bias in + Private SGD + + +
+ Differentially private SGD (DP-SGD) holds the promise of enabling the safe +and responsible application of machine learning to sensitive datasets. However, +DP-SGD only provides a biased, noisy estimate of a mini-batch gradient. This +renders optimisation steps less effective and limits model utility as a result. +With this work, we show a connection between per-sample gradient norms and the +estimation bias of the private gradient oracle used in DP-SGD. Here, we propose +Bias-Aware Minimisation (BAM) that allows for the provable reduction of private +gradient estimator bias. We show how to efficiently compute quantities needed +for BAM to scale to large neural networks and highlight similarities to closely +related methods such as Sharpness-Aware Minimisation. Finally, we provide +empirical evidence that BAM not only reduces bias but also substantially +improves privacy-utility trade-offs on the CIFAR-10, CIFAR-100, and ImageNet-32 +datasets. + +
+
+ comment: Accepted to the 2023 Theory and Practice of Differential Privacy + (TPDP) Workshop +
+
+
+
+
+ + ☆ MKL-$L_{0/1}$-SVM + + +
+ This paper presents a Multiple Kernel Learning (abbreviated as MKL) framework +for the Support Vector Machine (SVM) with the $(0, 1)$ loss function. Some +first-order optimality conditions are given and then exploited to develop a +fast ADMM solver to deal with the nonconvex and nonsmooth optimization problem. +Extensive numerical experiments on synthetic and real datasets show that the +performance of our MKL-$L_{0/1}$-SVM is comparable with the one of the leading +approaches called SimpleMKL developed by Rakotomamonjy, Bach, Canu, and +Grandvalet [Journal of Machine Learning Research, vol. 9, pp. 2491-2521, 2008]. + +
+
+ comment: 25 pages in the JMLR template, 4 figures, and 2 tables. arXiv admin + note: substantial text overlap with arXiv:2303.04445 +
+
+
+
+
+ + ☆ Quantum-Noise-driven Generative Diffusion Models + + +
+ Generative models realized with machine learning techniques are powerful +tools to infer complex and unknown data distributions from a finite number of +training samples in order to produce new synthetic data. Diffusion models are +an emerging framework that have recently overcome the performance of the +generative adversarial networks in creating synthetic text and high-quality +images. Here, we propose and discuss the quantum generalization of diffusion +models, i.e., three quantum-noise-driven generative diffusion models that could +be experimentally tested on real quantum systems. The idea is to harness unique +quantum features, in particular the non-trivial interplay among coherence, +entanglement and noise that the currently available noisy quantum processors do +unavoidably suffer from, in order to overcome the main computational burdens of +classical diffusion models during inference. Hence, we suggest to exploit +quantum noise not as an issue to be detected and solved but instead as a very +remarkably beneficial key ingredient to generate much more complex probability +distributions that would be difficult or even impossible to express +classically, and from which a quantum processor might sample more efficiently +than a classical one. Therefore, our results are expected to pave the way for +new quantum-inspired or quantum-based generative diffusion algorithms +addressing more powerfully classical tasks as data generation/prediction with +widespread real-world applications ranging from climate forecasting to +neuroscience, from traffic flow analysis to financial forecasting. + +
+
+ comment: 13 pages, 2 figures +
+
+
+
+
+ + ☆ Neural oscillators for magnetic hysteresis modeling + + +
+ Hysteresis is a ubiquitous phenomenon in science and engineering; its +modeling and identification are crucial for understanding and optimizing the +behavior of various systems. We develop an ordinary differential equation-based +recurrent neural network (RNN) approach to model and quantify the hysteresis, +which manifests itself in sequentiality and history-dependence. Our neural +oscillator, HystRNN, draws inspiration from coupled-oscillatory RNN and +phenomenological hysteresis models to update the hidden states. The performance +of HystRNN is evaluated to predict generalized scenarios, involving first-order +reversal curves and minor loops. The findings show the ability of HystRNN to +generalize its behavior to previously untrained regions, an essential feature +that hysteresis models must have. This research highlights the advantage of +neural oscillators over the traditional RNN-based methods in capturing complex +hysteresis patterns in magnetic materials, where traditional rate-dependent +methods are inadequate to capture intrinsic nonlinearity. + +
+
+
+
+
+ + ☆ On Uniformly Optimal Algorithms for Best Arm Identification in Two-Armed + Bandits with Fixed Budget + + +
+ We study the problem of best-arm identification with fixed budget in +stochastic two-arm bandits with Bernoulli rewards. We prove that surprisingly, +there is no algorithm that (i) performs as well as the algorithm sampling each +arm equally (this algorithm is referred to as the {\it uniform sampling} +algorithm) on all instances, and that (ii) strictly outperforms this algorithm +on at least one instance. In short, there is no algorithm better than the +uniform sampling algorithm. Towards this result, we introduce the natural class +of {\it consistent} and {\it stable} algorithms, and show that any algorithm +that performs as well as the uniform sampling algorithm on all instances +belongs to this class. The proof is completed by deriving a lower bound on the +error rate satisfied by any consistent and stable algorithm, and by showing +that the uniform sampling algorithm matches this lower bound. Our results +provide a solution to the two open problems presented in \cite{qin2022open}. + +
+
+
+
+
+ + ☆ Relational Concept Based Models + + +
+ The design of interpretable deep learning models working in relational +domains poses an open challenge: interpretable deep learning methods, such as +Concept-Based Models (CBMs), are not designed to solve relational problems, +while relational models are not as interpretable as CBMs. To address this +problem, we propose Relational Concept-Based Models, a family of relational +deep learning methods providing interpretable task predictions. Our +experiments, ranging from image classification to link prediction in knowledge +graphs, show that relational CBMs (i) match generalization performance of +existing relational black-boxes (as opposed to non-relational CBMs), (ii) +support the generation of quantified concept-based explanations, (iii) +effectively respond to test-time interventions, and (iv) withstand demanding +settings including out-of-distribution scenarios, limited training data +regimes, and scarce concept supervisions. + +
+
+
+
+
+ + ☆ Will More Expressive Graph Neural Networks do Better on Generative + Tasks? + + +
+ Graph generation poses a significant challenge as it involves predicting a +complete graph with multiple nodes and edges based on simply a given label. +This task also carries fundamental importance to numerous real-world +applications, including de-novo drug and molecular design. In recent years, +several successful methods have emerged in the field of graph generation. +However, these approaches suffer from two significant shortcomings: (1) the +underlying Graph Neural Network (GNN) architectures used in these methods are +often underexplored; and (2) these methods are often evaluated on only a +limited number of metrics. To fill this gap, we investigate the expressiveness +of GNNs under the context of the molecular graph generation task, by replacing +the underlying GNNs of graph generative models with more expressive GNNs. +Specifically, we analyse the performance of six GNNs in two different +generative frameworks (GCPN and GraphAF), on six different molecular generative +objectives on the ZINC-250k dataset. Through our extensive experiments, we +demonstrate that advanced GNNs can indeed improve the performance of GCPN and +GraphAF on molecular generation tasks, but GNN expressiveness is not a +necessary condition for a good GNN-based generative model. Moreover, we show +that GCPN and GraphAF with advanced GNNs can achieve state-of-the-art results +across 17 other non-GNN-based graph generative approaches, such as variational +autoencoders and Bayesian optimisation models, on the proposed molecular +generative objectives (DRD2, Median1, Median2), which are important metrics for +de-novo molecular design. + +
+
+
+
+
+ + ☆ Approximating Score-based Explanation Techniques Using Conformal + Regression + + +
+ Score-based explainable machine-learning techniques are often used to +understand the logic behind black-box models. However, such explanation +techniques are often computationally expensive, which limits their application +in time-critical contexts. Therefore, we propose and investigate the use of +computationally less costly regression models for approximating the output of +score-based explanation techniques, such as SHAP. Moreover, validity guarantees +for the approximated values are provided by the employed inductive conformal +prediction framework. We propose several non-conformity measures designed to +take the difficulty of approximating the explanations into account while +keeping the computational cost low. We present results from a large-scale +empirical investigation, in which the approximate explanations generated by our +proposed models are evaluated with respect to efficiency (interval size). The +results indicate that the proposed method can significantly improve execution +time compared to the fast version of SHAP, TreeSHAP. The results also suggest +that the proposed method can produce tight intervals, while providing validity +guarantees. Moreover, the proposed approach allows for comparing explanations +of different approximation methods and selecting a method based on how +informative (tight) are the predicted intervals. + +
+
+ comment: 20 pages, 14 figures, The 12th Symposium on Conformal and + Probabilistic Prediction with Applications (COPA 2023) +
+
+
+
+
+ + ☆ EVE: Efficient Vision-Language Pre-training with Masked Prediction and + Modality-Aware MoE + + +
+ Building scalable vision-language models to learn from diverse, multimodal +data remains an open challenge. In this paper, we introduce an Efficient +Vision-languagE foundation model, namely EVE, which is one unified multimodal +Transformer pre-trained solely by one unified pre-training task. Specifically, +EVE encodes both vision and language within a shared Transformer network +integrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which +capture modality-specific information by selectively switching to different +experts. To unify pre-training tasks of vision and language, EVE performs +masked signal modeling on image-text pairs to reconstruct masked signals, i.e., +image pixels and text tokens, given visible signals. This simple yet effective +pre-training objective accelerates training by 3.5x compared to the model +pre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing +to the combination of the unified architecture and pre-training task, EVE is +easy to scale up, enabling better downstream performance with fewer resources +and faster training speed. Despite its simplicity, EVE achieves +state-of-the-art performance on various vision-language downstream tasks, +including visual question answering, visual reasoning, and image-text +retrieval. + +
+
+
+
+
+ + ☆ Anisotropic Hybrid Networks for liver tumor segmentation with + uncertainty quantification MICCAI + + +
+ The burden of liver tumors is important, ranking as the fourth leading cause +of cancer mortality. In case of hepatocellular carcinoma (HCC), the delineation +of liver and tumor on contrast-enhanced magnetic resonance imaging (CE-MRI) is +performed to guide the treatment strategy. As this task is time-consuming, +needs high expertise and could be subject to inter-observer variability there +is a strong need for automatic tools. However, challenges arise from the lack +of available training data, as well as the high variability in terms of image +resolution and MRI sequence. In this work we propose to compare two different +pipelines based on anisotropic models to obtain the segmentation of the liver +and tumors. The first pipeline corresponds to a baseline multi-class model that +performs the simultaneous segmentation of the liver and tumor classes. In the +second approach, we train two distinct binary models, one segmenting the liver +only and the other the tumors. Our results show that both pipelines exhibit +different strengths and weaknesses. Moreover we propose an uncertainty +quantification strategy allowing the identification of potential false positive +tumor lesions. Both solutions were submitted to the MICCAI 2023 Atlas challenge +regarding liver and tumor segmentation. + +
+
+ comment: Accepted for presentation at MICCAI Workshop on 2nd + Resource-Efficient Medical Image Analysis (REMIA) +
+
+
+
+
+ + ☆ Maintaining Plasticity via Regenerative Regularization + + +
+ In continual learning, plasticity refers to the ability of an agent to +quickly adapt to new information. Neural networks are known to lose plasticity +when processing non-stationary data streams. In this paper, we propose L2 Init, +a very simple approach for maintaining plasticity by incorporating in the loss +function L2 regularization toward initial parameters. This is very similar to +standard L2 regularization (L2), the only difference being that L2 regularizes +toward the origin. L2 Init is simple to implement and requires selecting only a +single hyper-parameter. The motivation for this method is the same as that of +methods that reset neurons or parameter values. Intuitively, when recent losses +are insensitive to particular parameters, these parameters drift toward their +initial values. This prepares parameters to adapt quickly to new tasks. On +simple problems representative of different types of nonstationarity in +continual learning, we demonstrate that L2 Init consistently mitigates +plasticity loss. We additionally find that our regularization term reduces +parameter magnitudes and maintains a high effective feature rank. + +
+
+
+
+
+ + ☆ When MiniBatch SGD Meets SplitFed Learning:Convergence Analysis and + Performance Evaluation + + +
+ Federated learning (FL) enables collaborative model training across +distributed clients (e.g., edge devices) without sharing raw data. Yet, FL can +be computationally expensive as the clients need to train the entire model +multiple times. SplitFed learning (SFL) is a recent distributed approach that +alleviates computation workload at the client device by splitting the model at +a cut layer into two parts, where clients only need to train part of the model. +However, SFL still suffers from the \textit{client drift} problem when clients' +data are highly non-IID. To address this issue, we propose MiniBatch-SFL. This +algorithm incorporates MiniBatch SGD into SFL, where the clients train the +client-side model in an FL fashion while the server trains the server-side +model similar to MiniBatch SGD. We analyze the convergence of MiniBatch-SFL and +show that the bound of the expected loss can be obtained by analyzing the +expected server-side and client-side model updates, respectively. The +server-side updates do not depend on the non-IID degree of the clients' +datasets and can potentially mitigate client drift. However, the client-side +model relies on the non-IID degree and can be optimized by properly choosing +the cut layer. Perhaps counter-intuitive, our empirical result shows that a +latter position of the cut layer leads to a smaller average gradient divergence +and a better algorithm performance. Moreover, numerical results show that +MiniBatch-SFL achieves higher accuracy than conventional SFL and FL. The +accuracy improvement can be up to 24.1\% and 17.1\% with highly non-IID data, +respectively. + +
+
+
+
+
+ + ☆ Multi-scale Transformer Pyramid Networks for Multivariate Time Series + Forecasting + + +
+ Multivariate Time Series (MTS) forecasting involves modeling temporal +dependencies within historical records. Transformers have demonstrated +remarkable performance in MTS forecasting due to their capability to capture +long-term dependencies. However, prior work has been confined to modeling +temporal dependencies at either a fixed scale or multiple scales that +exponentially increase (most with base 2). This limitation hinders their +effectiveness in capturing diverse seasonalities, such as hourly and daily +patterns. In this paper, we introduce a dimension invariant embedding technique +that captures short-term temporal dependencies and projects MTS data into a +higher-dimensional space, while preserving the dimensions of time steps and +variables in MTS data. Furthermore, we present a novel Multi-scale Transformer +Pyramid Network (MTPNet), specifically designed to effectively capture temporal +dependencies at multiple unconstrained scales. The predictions are inferred +from multi-scale latent representations obtained from transformers at various +scales. Extensive experiments on nine benchmark datasets demonstrate that the +proposed MTPNet outperforms recent state-of-the-art methods. + +
+
+
+
+
+ + ☆ RamseyRL: A Framework for Intelligent Ramsey Number Counterexample + Searching AAAI2024 + + +
+ The Ramsey number is the minimum number of nodes, $n = R(s, t)$, such that +all undirected simple graphs of order $n$, contain a clique of order $s$, or an +independent set of order $t$. This paper explores the application of a best +first search algorithm and reinforcement learning (RL) techniques to find +counterexamples to specific Ramsey numbers. We incrementally improve over prior +search methods such as random search by introducing a graph vectorization and +deep neural network (DNN)-based heuristic, which gauge the likelihood of a +graph being a counterexample. The paper also proposes algorithmic optimizations +to confine a polynomial search runtime. This paper does not aim to present new +counterexamples but rather introduces and evaluates a framework supporting +Ramsey counterexample exploration using other heuristics. Code and methods are +made available through a PyPI package and GitHub repository. + +
+
+ comment: 8 pages, 4 figures, submitted to AAAI2024 +
+
+
+
+
+ + ☆ Audio Generation with Multiple Conditional Diffusion Model AAAI 2024 + + +
+ Text-based audio generation models have limitations as they cannot encompass +all the information in audio, leading to restricted controllability when +relying solely on text. To address this issue, we propose a novel model that +enhances the controllability of existing pre-trained text-to-audio models by +incorporating additional conditions including content (timestamp) and style +(pitch contour and energy contour) as supplements to the text. This approach +achieves fine-grained control over the temporal order, pitch, and energy of +generated audio. To preserve the diversity of generation, we employ a trainable +control condition encoder that is enhanced by a large language model and a +trainable Fusion-Net to encode and fuse the additional conditions while keeping +the weights of the pre-trained text-to-audio model frozen. Due to the lack of +suitable datasets and evaluation metrics, we consolidate existing datasets into +a new dataset comprising the audio and corresponding conditions and use a +series of evaluation metrics to evaluate the controllability performance. +Experimental results demonstrate that our model successfully achieves +fine-grained control to accomplish controllable audio generation. Audio samples +and our dataset are publicly available at +https://conditionaudiogen.github.io/conditionaudiogen/ + +
+
+ comment: Submitted to AAAI 2024 +
+
+
+
+
+ + ☆ Retail Demand Forecasting: A Comparative Study for Multivariate Time + Series + + +
+ Accurate demand forecasting in the retail industry is a critical determinant +of financial performance and supply chain efficiency. As global markets become +increasingly interconnected, businesses are turning towards advanced prediction +models to gain a competitive edge. However, existing literature mostly focuses +on historical sales data and ignores the vital influence of macroeconomic +conditions on consumer spending behavior. In this study, we bridge this gap by +enriching time series data of customer demand with macroeconomic variables, +such as the Consumer Price Index (CPI), Index of Consumer Sentiment (ICS), and +unemployment rates. Leveraging this comprehensive dataset, we develop and +compare various regression and machine learning models to predict retail demand +accurately. + +
+
+
+
+
+ + ☆ System Identification for Continuous-time Linear Dynamical Systems + + +
+ The problem of system identification for the Kalman filter, relying on the +expectation-maximization (EM) procedure to learn the underlying parameters of a +dynamical system, has largely been studied assuming that observations are +sampled at equally-spaced time points. However, in many applications this is a +restrictive and unrealistic assumption. This paper addresses system +identification for the continuous-discrete filter, with the aim of generalizing +learning for the Kalman filter by relying on a solution to a continuous-time +It\^o stochastic differential equation (SDE) for the latent state and +covariance dynamics. We introduce a novel two-filter, analytical form for the +posterior with a Bayesian derivation, which yields analytical updates which do +not require the forward-pass to be pre-computed. Using this analytical and +efficient computation of the posterior, we provide an EM procedure which +estimates the parameters of the SDE, naturally incorporating irregularly +sampled measurements. Generalizing the learning of latent linear dynamical +systems (LDS) to continuous-time may extend the use of the hybrid Kalman filter +to data which is not regularly sampled or has intermittent missing values, and +can extend the power of non-linear system identification methods such as +switching LDS (SLDS), which rely on EM for the linear discrete-time Kalman +filter as a sub-unit for learning locally linearized behavior of a non-linear +system. We apply the method by learning the parameters of a latent, +multivariate Fokker-Planck SDE representing a toggle-switch genetic circuit +using biologically realistic parameters, and compare the efficacy of learning +relative to the discrete-time Kalman filter as the step-size irregularity and +spectral-radius of the dynamics-matrix increases. + +
+
+ comment: 32 pages, 3 figures +
+
+
+
+
+ + ☆ Dynamic landslide susceptibility mapping over recent three decades to + uncover variations in landslide causes in subtropical urban mountainous areas + + +
+ Landslide susceptibility assessment (LSA) is of paramount importance in +mitigating landslide risks. Recently, there has been a surge in the utilization +of data-driven methods for predicting landslide susceptibility due to the +growing availability of aerial and satellite data. Nonetheless, the rapid +oscillations within the landslide-inducing environment (LIE), primarily due to +significant changes in external triggers such as rainfall, pose difficulties +for contemporary data-driven LSA methodologies to accommodate LIEs over diverse +timespans. This study presents dynamic landslide susceptibility mapping that +simply employs multiple predictive models for annual LSA. In practice, this +will inevitably encounter small sample problems due to the limited number of +landslide samples in certain years. Another concern arises owing to the +majority of the existing LSA approaches train black-box models to fit distinct +datasets, yet often failing in generalization and providing comprehensive +explanations concerning the interactions between input features and +predictions. Accordingly, we proposed to meta-learn representations with fast +adaptation ability using a few samples and gradient updates; and apply SHAP for +each model interpretation and landslide feature permutation. Additionally, we +applied MT-InSAR for LSA result enhancement and validation. The chosen study +area is Lantau Island, Hong Kong, where we conducted a comprehensive dynamic +LSA spanning from 1992 to 2019. The model interpretation results demonstrate +that the primary factors responsible for triggering landslides in Lantau Island +are terrain slope and extreme rainfall. The results also indicate that the +variation in landslide causes can be primarily attributed to extreme rainfall +events, which result from global climate change, and the implementation of the +Landslip Prevention and Mitigation Programme (LPMitP) by the Hong Kong +government. + +
+
+
+
+
+ + ☆ Solving Elliptic Optimal Control Problems using Physics Informed Neural + Networks + + +
+ In this work, we present and analyze a numerical solver for optimal control +problems (without / with box constraint) for linear and semilinear second-order +elliptic problems. The approach is based on a coupled system derived from the +first-order optimality system of the optimal control problem, and applies +physics informed neural networks (PINNs) to solve the coupled system. We +present an error analysis of the numerical scheme, and provide $L^2(\Omega)$ +error bounds on the state, control and adjoint state in terms of deep neural +network parameters (e.g., depth, width, and parameter bounds) and the number of +sampling points in the domain and on the boundary. The main tools in the +analysis include offset Rademacher complexity and boundedness and Lipschitz +continuity of neural network functions. We present several numerical examples +to illustrate the approach and compare it with three existing approaches. + +
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ☆ Diverse Policies Converge in Reward-free Markov Decision Processe + + +
+ Reinforcement learning has achieved great success in many decision-making +tasks, and traditional reinforcement learning algorithms are mainly designed +for obtaining a single optimal solution. However, recent works show the +importance of developing diverse policies, which makes it an emerging research +topic. Despite the variety of diversity reinforcement learning algorithms that +have emerged, none of them theoretically answer the question of how the +algorithm converges and how efficient the algorithm is. In this paper, we +provide a unified diversity reinforcement learning framework and investigate +the convergence of training diverse policies. Under such a framework, we also +propose a provably efficient diversity reinforcement learning algorithm. +Finally, we verify the effectiveness of our method through numerical +experiments. + +
+
+
+
+
+ + ☆ Audio Difference Captioning Utilizing Similarity-Discrepancy + Disentanglement + + +
+ We proposed Audio Difference Captioning (ADC) as a new extension task of +audio captioning for describing the semantic differences between input pairs of +similar but slightly different audio clips. The ADC solves the problem that +conventional audio captioning sometimes generates similar captions for similar +audio clips, failing to describe the difference in content. We also propose a +cross-attention-concentrated transformer encoder to extract differences by +comparing a pair of audio clips and a similarity-discrepancy disentanglement to +emphasize the difference in the latent space. To evaluate the proposed methods, +we built an AudioDiffCaps dataset consisting of pairs of similar but slightly +different audio clips with human-annotated descriptions of their differences. +The experiment with the AudioDiffCaps dataset showed that the proposed methods +solve the ADC task effectively and improve the attention weights to extract the +difference by visualizing them in the transformer encoder. + +
+
+ comment: Accepted to DCASE2023 Workshop +
+
+
+
+
+ + ☆ Addressing Selection Bias in Computerized Adaptive Testing: A User-Wise + Aggregate Influence Function Approach CIKM 2023 + + +
+ Computerized Adaptive Testing (CAT) is a widely used, efficient test mode +that adapts to the examinee's proficiency level in the test domain. CAT +requires pre-trained item profiles, for CAT iteratively assesses the student +real-time based on the registered items' profiles, and selects the next item to +administer using candidate items' profiles. However, obtaining such item +profiles is a costly process that involves gathering a large, dense +item-response data, then training a diagnostic model on the collected data. In +this paper, we explore the possibility of leveraging response data collected in +the CAT service. We first show that this poses a unique challenge due to the +inherent selection bias introduced by CAT, i.e., more proficient students will +receive harder questions. Indeed, when naively training the diagnostic model +using CAT response data, we observe that item profiles deviate significantly +from the ground-truth. To tackle the selection bias issue, we propose the +user-wise aggregate influence function method. Our intuition is to filter out +users whose response data is heavily biased in an aggregate manner, as judged +by how much perturbation the added data will introduce during parameter +estimation. This way, we may enhance the performance of CAT while introducing +minimal bias to the item profiles. We provide extensive experiments to +demonstrate the superiority of our proposed method based on the three public +datasets and one dataset that contains real-world CAT response data. + +
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ☆ Utilizing Admissible Bounds for Heuristic Learning + + +
+ While learning a heuristic function for forward search algorithms with modern +machine learning techniques has been gaining interest in recent years, there +has been little theoretical understanding of \emph{what} they should learn, +\emph{how} to train them, and \emph{why} we do so. This lack of understanding +leads to various literature performing an ad-hoc selection of datasets +(suboptimal vs optimal costs or admissible vs inadmissible heuristics) and +optimization metrics (e.g., squared vs absolute errors). Moreover, due to the +lack of admissibility of the resulting trained heuristics, little focus has +been put on the role of admissibility \emph{during} learning. This paper +articulates the role of admissible heuristics in supervised heuristic learning +using them as parameters of Truncated Gaussian distributions, which tightens +the hypothesis space compared to ordinary Gaussian distributions. We argue that +this mathematical model faithfully follows the principle of maximum entropy and +empirically show that, as a result, it yields more accurate heuristics and +converges faster during training. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ Rethinking Data Perturbation and Model Stabilization for Semi-supervised + Medical Image Segmentation + + +
+ Studies on semi-supervised medical image segmentation (SSMIS) have seen fast +progress recently. Due to the limited labelled data, SSMIS methods mainly focus +on effectively leveraging unlabeled data to enhance the segmentation +performance. However, despite their promising performance, current +state-of-the-art methods often prioritize integrating complex techniques and +loss terms rather than addressing the core challenges of semi-supervised +scenarios directly. We argue that the key to SSMIS lies in generating +substantial and appropriate prediction disagreement on unlabeled data. To this +end, we emphasize the crutiality of data perturbation and model stabilization +in semi-supervised segmentation, and propose a simple yet effective approach to +boost SSMIS performance significantly, dubbed DPMS. Specifically, we first +revisit SSMIS from three distinct perspectives: the data, the model, and the +loss, and conduct a comprehensive study of corresponding strategies to examine +their effectiveness. Based on these examinations, we then propose DPMS, which +adopts a plain teacher-student framework with a standard supervised loss and +unsupervised consistency loss. To produce appropriate prediction disagreements, +DPMS perturbs the unlabeled data via strong augmentations to enlarge prediction +disagreements considerably. On the other hand, using EMA teacher when strong +augmentation is applied does not necessarily improve performance. DPMS further +utilizes a forwarding-twice and momentum updating strategies for normalization +statistics to stabilize the training on unlabeled data effectively. Despite its +simplicity, DPMS can obtain new state-of-the-art performance on the public 2D +ACDC and 3D LA datasets across various semi-supervised settings, e.g. obtaining +a remarkable 22.62% improvement against previous SOTA on ACDC with 5% labels. + +
+
+ comment: Code and logs are available at https://github.com/ZhenZHAO/DPMS +
+
+
+
+
+ + ☆ Shape-conditioned 3D Molecule Generation via Equivariant Diffusion + Models + + +
+ Ligand-based drug design aims to identify novel drug candidates of similar +shapes with known active molecules. In this paper, we formulated an in silico +shape-conditioned molecule generation problem to generate 3D molecule +structures conditioned on the shape of a given molecule. To address this +problem, we developed a translation- and rotation-equivariant shape-guided +generative model ShapeMol. ShapeMol consists of an equivariant shape encoder +that maps molecular surface shapes into latent embeddings, and an equivariant +diffusion model that generates 3D molecules based on these embeddings. +Experimental results show that ShapeMol can generate novel, diverse, drug-like +molecules that retain 3D molecular shapes similar to the given shape condition. +These results demonstrate the potential of ShapeMol in designing drug +candidates of desired 3D shapes binding to protein target pockets. + +
+
+
+
+
+ + ☆ Adversarial Training Using Feedback Loops + + +
+ Deep neural networks (DNN) have found wide applicability in numerous fields +due to their ability to accurately learn very complex input-output relations. +Despite their accuracy and extensive use, DNNs are highly susceptible to +adversarial attacks due to limited generalizability. For future progress in the +field, it is essential to build DNNs that are robust to any kind of +perturbations to the data points. In the past, many techniques have been +proposed to robustify DNNs using first-order derivative information of the +network. + This paper proposes a new robustification approach based on control theory. A +neural network architecture that incorporates feedback control, named Feedback +Neural Networks, is proposed. The controller is itself a neural network, which +is trained using regular and adversarial data such as to stabilize the system +outputs. The novel adversarial training approach based on the feedback control +architecture is called Feedback Looped Adversarial Training (FLAT). Numerical +results on standard test problems empirically show that our FLAT method is more +effective than the state-of-the-art to guard against adversarial attacks. + +
+
+
+
+
+ + ☆ SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal + Targets ICCV 2023 + + +
+ Scene understanding using multi-modal data is necessary in many applications, +e.g., autonomous navigation. To achieve this in a variety of situations, +existing models must be able to adapt to shifting data distributions without +arduous data annotation. Current approaches assume that the source data is +available during adaptation and that the source consists of paired multi-modal +data. Both these assumptions may be problematic for many applications. Source +data may not be available due to privacy, security, or economic concerns. +Assuming the existence of paired multi-modal data for training also entails +significant data collection costs and fails to take advantage of widely +available freely distributed pre-trained uni-modal models. In this work, we +relax both of these assumptions by addressing the problem of adapting a set of +models trained independently on uni-modal data to a target domain consisting of +unlabeled multi-modal data, without having access to the original source +dataset. Our proposed approach solves this problem through a switching +framework which automatically chooses between two complementary methods of +cross-modal pseudo-label fusion -- agreement filtering and entropy weighting -- +based on the estimated domain gap. We demonstrate our work on the semantic +segmentation problem. Experiments across seven challenging adaptation scenarios +verify the efficacy of our approach, achieving results comparable to, and in +some cases outperforming, methods which assume access to source data. Our +method achieves an improvement in mIoU of up to 12% over competing baselines. +Our code is publicly available at https://github.com/csimo005/SUMMIT. + +
+
+ comment: 12 pages, 5 figures, 9 tables, ICCV 2023 +
+
+
+
+
+ + ☆ Cabrita: closing the gap for foreign languages + + +
+ The strategy of training the model from scratch in a specific language or +domain serves two essential purposes: i) enhancing performance in the +particular linguistic or domain context, and ii) ensuring effective +tokenization. The main limitation inherent to this approach lies in the +associated cost, which can reach six to seven-digit dollar values, depending on +the model size and the number of parameters involved. + The main solution to overcome the cost challenge is to rely on available +pre-trained models, which, despite recent advancements such as the LLaMA and +LLaMA-2 models, still demonstrate inefficiency for certain specific domain +problems or prove ineffective in scenarios involving conversational memory +resources, given the large number of tokens required to represent text. + To overcome this issue, we present a methodology named Cabrita, which, as our +research demonstrates, successfully addresses the performance and efficient +tokenization problem, all at an affordable cost. We believe that this +methodology can be applied to any transformer-like architecture model. To +validate the study, we conducted continuous pre-training exclusively using +Portuguese text on a 3-billion-parameter model known as OpenLLaMA, resulting in +a model named openCabrita 3B. The openCabrita 3B also features a new tokenizer +that results in a significant reduction in the number of tokens required to +represent the text. In our assessment, for few-shot learning tasks, we achieved +similar results with this 3B model compared to a traditional continuous +pre-training approach as well as to 7B models English pre-trained models. + +
+
+ comment: 9 pages, 1 figure +
+
+
+
+
+ + ☆ Integrating Large Language Models into the Debugging C Compiler for + generating contextual error explanations + + +
+ This paper introduces a method for Large Language Models (LLM) to produce +enhanced compiler error explanations, in simple language, within our Debugging +C Compiler (DCC). It is well documented that compiler error messages have been +known to present a barrier for novices learning how to program. Although our +initial use of DCC in introductory programming (CS1) has been instrumental in +teaching C to novice programmers by providing safeguards to commonly occurring +errors and translating the usually cryptic compiler error messages at both +compile- and run-time, we proposed that incorporating LLM-generated +explanations would further enhance the learning experience for novice +programmers. Through an expert evaluation, we observed that LLM-generated +explanations for compiler errors were conceptually accurate in 90% of +compile-time errors, and 75% of run-time errors. Additionally, the new DCC-help +tool has been increasingly adopted by students, with an average of 1047 unique +runs per week, demonstrating a promising initial assessment of using LLMs to +complement compiler output to enhance programming education for beginners. We +release our tool as open-source to the community. + +
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ KinSPEAK: Improving speech recognition for Kinyarwanda via + semi-supervised learning methods + + +
+ Despite recent availability of large transcribed Kinyarwanda speech data, +achieving robust speech recognition for Kinyarwanda is still challenging. In +this work, we show that using self-supervised pre-training, following a simple +curriculum schedule during fine-tuning and using semi-supervised learning to +leverage large unlabelled speech data significantly improve speech recognition +performance for Kinyarwanda. Our approach focuses on using public domain data +only. A new studio-quality speech dataset is collected from a public website, +then used to train a clean baseline model. The clean baseline model is then +used to rank examples from a more diverse and noisy public dataset, defining a +simple curriculum training schedule. Finally, we apply semi-supervised learning +to label and learn from large unlabelled data in four successive generations. +Our final model achieves 3.2% word error rate (WER) on the new dataset and +15.9% WER on Mozilla Common Voice benchmark, which is state-of-the-art to the +best of our knowledge. Our experiments also indicate that using syllabic rather +than character-based tokenization results in better speech recognition +performance for Kinyarwanda. + +
+
+ comment: 9 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Finding the Perfect Fit: Applying Regression Models to ClimateBench v1.0 + + +
+ Climate projections using data driven machine learning models acting as +emulators, is one of the prevailing areas of research to enable policy makers +make informed decisions. Use of machine learning emulators as surrogates for +computationally heavy GCM simulators reduces time and carbon footprints. In +this direction, ClimateBench [1] is a recently curated benchmarking dataset for +evaluating the performance of machine learning emulators designed for climate +data. Recent studies have reported that despite being considered fundamental, +regression models offer several advantages pertaining to climate emulations. In +particular, by leveraging the kernel trick, regression models can capture +complex relationships and improve their predictive capabilities. This study +focuses on evaluating non-linear regression models using the aforementioned +dataset. Specifically, we compare the emulation capabilities of three +non-linear regression models. Among them, Gaussian Process Regressor +demonstrates the best-in-class performance against standard evaluation metrics +used for climate field emulation studies. However, Gaussian Process Regression +suffers from being computational resource hungry in terms of space and time +complexity. Alternatively, Support Vector and Kernel Ridge models also deliver +competitive results and but there are certain trade-offs to be addressed. +Additionally, we are actively investigating the performance of composite +kernels and techniques such as variational inference to further enhance the +performance of the regression models and effectively model complex non-linear +patterns, including phenomena like precipitation. + +
+
+
+
+
+ + ☆ A deep reinforcement learning approach for real-time demand-responsive + railway rescheduling to mitigate station overcrowding using mobile data + + +
+ Real-time railway rescheduling is a timely and flexible technique to +automatically alter the operation schedule in response to time-varying +conditions. Current research lacks data-driven approaches that capture +real-time passenger mobility during railway disruptions, relying mostly on +OD-based data and model-based methods for estimating demands of trains. +Meanwhile, the schedule-updating principles for a long-term disruption overlook +the uneven distribution of demand over time. To fill this gap, this paper +proposes a demand-responsive approach by inferring real-world passenger +mobility from mobile data (MD) to facilitate real-time rescheduling. Unlike +network-level approaches, this paper focuses on a heavy-demand station upstream +of the disrupted area. The objective is to reschedule all trains on multiple +routes passing through this target station, which have been affected by a +severe emergency event such as a natural disaster. Particular attention should +be given to avoiding the accumulation of overcrowded passengers at this +station, to prevent additional accidents arising from overcrowding. This +research addresses the challenges associated with this scenario, including the +dynamics of arriving and leaving of passengers, station overcrowding, rolling +stock shortage, open-ended disruption duration, integrated rescheduling on +multiple routes, and delays due to detours. A deep reinforcement learning (DRL) +framework is proposed to determine the optimal rescheduled timetable, route +stops, and rolling stock allocation, while considering real-time demand +satisfaction, station overcrowding, train capacity utilization, and headway +safety. + +
+
+ comment: 36 pages,16 figures +
+
+
+
+
+ + ☆ SEA: Shareable and Explainable Attribution for Query-based Black-box + Attacks + + +
+ Machine Learning (ML) systems are vulnerable to adversarial examples, +particularly those from query-based black-box attacks. Despite various efforts +to detect and prevent such attacks, there is a need for a more comprehensive +approach to logging, analyzing, and sharing evidence of attacks. While classic +security benefits from well-established forensics and intelligence sharing, +Machine Learning is yet to find a way to profile its attackers and share +information about them. In response, this paper introduces SEA, a novel ML +security system to characterize black-box attacks on ML systems for forensic +purposes and to facilitate human-explainable intelligence sharing. SEA +leverages the Hidden Markov Models framework to attribute the observed query +sequence to known attacks. It thus understands the attack's progression rather +than just focusing on the final adversarial examples. Our evaluations reveal +that SEA is effective at attack attribution, even on their second occurrence, +and is robust to adaptive strategies designed to evade forensics analysis. +Interestingly, SEA's explanations of the attack behavior allow us even to +fingerprint specific minor implementation bugs in attack libraries. For +example, we discover that the SignOPT and Square attacks implementation in ART +v1.14 sends over 50% specific zero difference queries. We thoroughly evaluate +SEA on a variety of settings and demonstrate that it can recognize the same +attack's second occurrence with 90+% Top-1 and 95+% Top-3 accuracy. + +
+
+
+
+
+ + ☆ ${\rm E}(3)$-Equivariant Actor-Critic Methods for Cooperative + Multi-Agent Reinforcement Learning + + +
+ Identification and analysis of symmetrical patterns in the natural world have +led to significant discoveries across various scientific fields, such as the +formulation of gravitational laws in physics and advancements in the study of +chemical structures. In this paper, we focus on exploiting Euclidean symmetries +inherent in certain cooperative multi-agent reinforcement learning (MARL) +problems and prevalent in many applications. We begin by formally +characterizing a subclass of Markov games with a general notion of symmetries +that admits the existence of symmetric optimal values and policies. Motivated +by these properties, we design neural network architectures with symmetric +constraints embedded as an inductive bias for multi-agent actor-critic methods. +This inductive bias results in superior performance in various cooperative MARL +benchmarks and impressive generalization capabilities such as zero-shot +learning and transfer learning in unseen scenarios with repeated symmetric +patterns. The code is available at: https://github.com/dchen48/E3AC. + +
+
+
+
+
+ + ☆ A Survey for Federated Learning Evaluations: Goals and Measures + + +
+ Evaluation is a systematic approach to assessing how well a system achieves +its intended purpose. Federated learning (FL) is a novel paradigm for +privacy-preserving machine learning that allows multiple parties to +collaboratively train models without sharing sensitive data. However, +evaluating FL is challenging due to its interdisciplinary nature and diverse +goals, such as utility, efficiency, and security. In this survey, we first +review the major evaluation goals adopted in the existing studies and then +explore the evaluation metrics used for each goal. We also introduce FedEval, +an open-source platform that provides a standardized and comprehensive +evaluation framework for FL algorithms in terms of their utility, efficiency, +and security. Finally, we discuss several challenges and future research +directions for FL evaluation. + +
+
+
+
+
+ + ☆ A Benchmark Study on Calibration + + +
+ Deep neural networks are increasingly utilized in various machine learning +tasks. However, as these models grow in complexity, they often face calibration +issues, despite enhanced prediction accuracy. Many studies have endeavored to +improve calibration performance through data preprocessing, the use of specific +loss functions, and training frameworks. Yet, investigations into calibration +properties have been somewhat overlooked. Our study leverages the Neural +Architecture Search (NAS) search space, offering an exhaustive model +architecture space for thorough calibration properties exploration. We +specifically create a model calibration dataset. This dataset evaluates 90 +bin-based and 12 additional calibration measurements across 117,702 unique +neural networks within the widely employed NATS-Bench search space. Our +analysis aims to answer several longstanding questions in the field, using our +proposed dataset: (i) Can model calibration be generalized across different +tasks? (ii) Can robustness be used as a calibration measurement? (iii) How +reliable are calibration metrics? (iv) Does a post-hoc calibration method +affect all models uniformly? (v) How does calibration interact with accuracy? +(vi) What is the impact of bin size on calibration measurement? (vii) Which +architectural designs are beneficial for calibration? Additionally, our study +bridges an existing gap by exploring calibration within NAS. By providing this +dataset, we enable further research into NAS calibration. As far as we are +aware, our research represents the first large-scale investigation into +calibration properties and the premier study of calibration issues within NAS. + +
+
+ comment: 39 pages, 35 figures +
+
+
+
+
+ + ☆ Zero-delay Consistent Signal Reconstruction from Streamed Multivariate + Time Series + + +
+ Digitalizing real-world analog signals typically involves sampling in time +and discretizing in amplitude. Subsequent signal reconstructions inevitably +incur an error that depends on the amplitude resolution and the temporal +density of the acquired samples. From an implementation viewpoint, consistent +signal reconstruction methods have proven a profitable error-rate decay as the +sampling rate increases. Despite that, these results are obtained under offline +settings. Therefore, a research gap exists regarding methods for consistent +signal reconstruction from data streams. This paper presents a method that +consistently reconstructs streamed multivariate time series of quantization +intervals under a zero-delay response requirement. On the other hand, previous +work has shown that the temporal dependencies within univariate time series can +be exploited to reduce the roughness of zero-delay signal reconstructions. This +work shows that the spatiotemporal dependencies within multivariate time series +can also be exploited to achieve improved results. Specifically, the +spatiotemporal dependencies of the multivariate time series are learned, with +the assistance of a recurrent neural network, to reduce the roughness of the +signal reconstruction on average while ensuring consistency. Our experiments +show that our proposed method achieves a favorable error-rate decay with the +sampling rate compared to a similar but non-consistent reconstruction. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ PFL-GAN: When Client Heterogeneity Meets Generative Models in + Personalized Federated Learning + + +
+ Recent advances of generative learning models are accompanied by the growing +interest in federated learning (FL) based on generative adversarial network +(GAN) models. In the context of FL, GAN can capture the underlying client data +structure, and regenerate samples resembling the original data distribution +without compromising the private raw data. Although most existing GAN-based FL +works focus on training a global model, Personalized FL (PFL) sometimes can be +more effective in view of client data heterogeneity in terms of distinct data +sample distributions, feature spaces, and labels. To cope with client +heterogeneity in GAN-based FL, we propose a novel GAN sharing and aggregation +strategy for PFL. The proposed PFL-GAN addresses the client heterogeneity in +different scenarios. More specially, we first learn the similarity among +clients and then develop an weighted collaborative data aggregation. The +empirical results through the rigorous experimentation on several well-known +datasets demonstrate the effectiveness of PFL-GAN. + +
+
+
+
+
+ + ☆ Augmenting medical image classifiers with synthetic data from latent + diffusion models + + +
+ While hundreds of artificial intelligence (AI) algorithms are now approved or +cleared by the US Food and Drugs Administration (FDA), many studies have shown +inconsistent generalization or latent bias, particularly for underrepresented +populations. Some have proposed that generative AI could reduce the need for +real data, but its utility in model development remains unclear. Skin disease +serves as a useful case study in synthetic image generation due to the +diversity of disease appearance, particularly across the protected attribute of +skin tone. Here we show that latent diffusion models can scalably generate +images of skin disease and that augmenting model training with these data +improves performance in data-limited settings. These performance gains saturate +at synthetic-to-real image ratios above 10:1 and are substantially smaller than +the gains obtained from adding real images. As part of our analysis, we +generate and analyze a new dataset of 458,920 synthetic images produced using +several generation strategies. Our results suggest that synthetic data could +serve as a force-multiplier for model development, but the collection of +diverse real-world data remains the most important step to improve medical AI +algorithms. + +
+
+
+
+
+ + ☆ An Intentional Forgetting-Driven Self-Healing Method For Deep + Reinforcement Learning Systems + + +
+ Deep reinforcement learning (DRL) is increasingly applied in large-scale +productions like Netflix and Facebook. As with most data-driven systems, DRL +systems can exhibit undesirable behaviors due to environmental drifts, which +often occur in constantly-changing production settings. Continual Learning (CL) +is the inherent self-healing approach for adapting the DRL agent in response to +the environment's conditions shifts. However, successive shifts of considerable +magnitude may cause the production environment to drift from its original +state. Recent studies have shown that these environmental drifts tend to drive +CL into long, or even unsuccessful, healing cycles, which arise from +inefficiencies such as catastrophic forgetting, warm-starting failure, and slow +convergence. In this paper, we propose Dr. DRL, an effective self-healing +approach for DRL systems that integrates a novel mechanism of intentional +forgetting into vanilla CL to overcome its main issues. Dr. DRL deliberately +erases the DRL system's minor behaviors to systematically prioritize the +adaptation of the key problem-solving skills. Using well-established DRL +algorithms, Dr. DRL is compared with vanilla CL on various drifted +environments. Dr. DRL is able to reduce, on average, the healing time and +fine-tuning episodes by, respectively, 18.74% and 17.72%. Dr. DRL successfully +helps agents to adapt to 19.63% of drifted environments left unsolved by +vanilla CL while maintaining and even enhancing by up to 45% the obtained +rewards for drifted environments that are resolved by both approaches. + +
+
+ comment: Accepted for publication in The 38th IEEE/ACM International + Conference on Automated Software Engineering (ASE 2023) +
+
+
+
+
+ + ☆ TAI-GAN: Temporally and Anatomically Informed GAN for early-to-late + frame conversion in dynamic cardiac PET motion correction MICCAI + + +
+ The rapid tracer kinetics of rubidium-82 ($^{82}$Rb) and high variation of +cross-frame distribution in dynamic cardiac positron emission tomography (PET) +raise significant challenges for inter-frame motion correction, particularly +for the early frames where conventional intensity-based image registration +techniques are not applicable. Alternatively, a promising approach utilizes +generative methods to handle the tracer distribution changes to assist existing +registration methods. To improve frame-wise registration and parametric +quantification, we propose a Temporally and Anatomically Informed Generative +Adversarial Network (TAI-GAN) to transform the early frames into the late +reference frame using an all-to-one mapping. Specifically, a feature-wise +linear modulation layer encodes channel-wise parameters generated from temporal +tracer kinetics information, and rough cardiac segmentations with local shifts +serve as the anatomical information. We validated our proposed method on a +clinical $^{82}$Rb PET dataset and found that our TAI-GAN can produce converted +early frames with high image quality, comparable to the real reference frames. +After TAI-GAN conversion, motion estimation accuracy and clinical myocardial +blood flow (MBF) quantification were improved compared to using the original +frames. Our code is published at https://github.com/gxq1998/TAI-GAN. + +
+
+ comment: Accepted by Simulation and Synthesis in Medical Imaging (SASHIMI + 2023, MICCAI workshop), preprint version +
+
+
+
+
+ + ☆ BaDExpert: Extracting Backdoor Functionality for Accurate Backdoor Input + Detection + + +
+ We present a novel defense, against backdoor attacks on Deep Neural Networks +(DNNs), wherein adversaries covertly implant malicious behaviors (backdoors) +into DNNs. Our defense falls within the category of post-development defenses +that operate independently of how the model was generated. The proposed defense +is built upon a novel reverse engineering approach that can directly extract +backdoor functionality of a given backdoored model to a backdoor expert model. +The approach is straightforward -- finetuning the backdoored model over a small +set of intentionally mislabeled clean samples, such that it unlearns the normal +functionality while still preserving the backdoor functionality, and thus +resulting in a model (dubbed a backdoor expert model) that can only recognize +backdoor inputs. Based on the extracted backdoor expert model, we show the +feasibility of devising highly accurate backdoor input detectors that filter +out the backdoor inputs during model inference. Further augmented by an +ensemble strategy with a finetuned auxiliary model, our defense, BaDExpert +(Backdoor Input Detection with Backdoor Expert), effectively mitigates 16 SOTA +backdoor attacks while minimally impacting clean utility. The effectiveness of +BaDExpert has been verified on multiple datasets (CIFAR10, GTSRB and ImageNet) +across various model architectures (ResNet, VGG, MobileNetV2 and Vision +Transformer). + +
+
+
+
+
+ + ☆ Deploying Deep Reinforcement Learning Systems: A Taxonomy of Challenges + + +
+ Deep reinforcement learning (DRL), leveraging Deep Learning (DL) in +reinforcement learning, has shown significant potential in achieving +human-level autonomy in a wide range of domains, including robotics, computer +vision, and computer games. This potential justifies the enthusiasm and growing +interest in DRL in both academia and industry. However, the community currently +focuses mostly on the development phase of DRL systems, with little attention +devoted to DRL deployment. In this paper, we propose an empirical study on +Stack Overflow (SO), the most popular Q&A forum for developers, to uncover and +understand the challenges practitioners faced when deploying DRL systems. +Specifically, we categorized relevant SO posts by deployment platforms: +server/cloud, mobile/embedded system, browser, and game engine. After filtering +and manual analysis, we examined 357 SO posts about DRL deployment, +investigated the current state, and identified the challenges related to +deploying DRL systems. Then, we investigate the prevalence and difficulty of +these challenges. Results show that the general interest in DRL deployment is +growing, confirming the study's relevance and importance. Results also show +that DRL deployment is more difficult than other DRL issues. Additionally, we +built a taxonomy of 31 unique challenges in deploying DRL to different +platforms. On all platforms, RL environment-related challenges are the most +popular, and communication-related challenges are the most difficult among +practitioners. We hope our study inspires future research and helps the +community overcome the most common and difficult challenges practitioners face +when deploying DRL systems. + +
+
+ comment: Accepted for publication in The International Conference on Software + Maintenance and Evolution (ICSME 2023) +
+
+
+
+
+ + ☆ Evolution of ESG-focused DLT Research: An NLP Analysis of the Literature + + +
+ Distributed Ledger Technologies (DLTs) have rapidly evolved, necessitating +comprehensive insights into their diverse components. However, a systematic +literature review that emphasizes the Environmental, Sustainability, and +Governance (ESG) components of DLT remains lacking. To bridge this gap, we +selected 107 seed papers to build a citation network of 63,083 references and +refined it to a corpus of 24,539 publications for analysis. Then, we labeled +the named entities in 46 papers according to twelve top-level categories +derived from an established technology taxonomy and enhanced the taxonomy by +pinpointing DLT's ESG elements. Leveraging transformer-based language models, +we fine-tuned a pre-trained language model for a Named Entity Recognition (NER) +task using our labeled dataset. We used our fine-tuned language model to +distill the corpus to 505 key papers, facilitating a literature review via +named entities and temporal graph analysis on DLT evolution in the context of +ESG. Our contributions are a methodology to conduct a machine learning-driven +systematic literature review in the DLT field, placing a special emphasis on +ESG aspects. Furthermore, we present a first-of-its-kind NER dataset, composed +of 54,808 named entities, designed for DLT and ESG-related explorations. + +
+
+
+
+
+ + ☆ Machine learning in parameter estimation of nonlinear systems + + +
+ Accurately estimating parameters in complex nonlinear systems is crucial +across scientific and engineering fields. We present a novel approach for +parameter estimation using a neural network with the Huber loss function. This +method taps into deep learning's abilities to uncover parameters governing +intricate behaviors in nonlinear equations. We validate our approach using +synthetic data and predefined functions that model system dynamics. By training +the neural network with noisy time series data, it fine-tunes the Huber loss +function to converge to accurate parameters. We apply our method to damped +oscillators, Van der Pol oscillators, Lotka-Volterra systems, and Lorenz +systems under multiplicative noise. The trained neural network accurately +estimates parameters, evident from closely matching latent dynamics. Comparing +true and estimated trajectories visually reinforces our method's precision and +robustness. Our study underscores the Huber loss-guided neural network as a +versatile tool for parameter estimation, effectively uncovering complex +relationships in nonlinear systems. The method navigates noise and uncertainty +adeptly, showcasing its adaptability to real-world challenges. + +
+
+ comment: 23 pages, 7 figures, +
+
+
+
+
+ + ☆ FOSA: Full Information Maximum Likelihood (FIML) Optimized + Self-Attention Imputation for Missing Data + + +
+ In data imputation, effectively addressing missing values is pivotal, +especially in intricate datasets. This paper delves into the FIML Optimized +Self-attention (FOSA) framework, an innovative approach that amalgamates the +strengths of Full Information Maximum Likelihood (FIML) estimation with the +capabilities of self-attention neural networks. Our methodology commences with +an initial estimation of missing values via FIML, subsequently refining these +estimates by leveraging the self-attention mechanism. Our comprehensive +experiments on both simulated and real-world datasets underscore FOSA's +pronounced advantages over traditional FIML techniques, encapsulating facets of +accuracy, computational efficiency, and adaptability to diverse data +structures. Intriguingly, even in scenarios where the Structural Equation Model +(SEM) might be mis-specified, leading to suboptimal FIML estimates, the robust +architecture of FOSA's self-attention component adeptly rectifies and optimizes +the imputation outcomes. Our empirical tests reveal that FOSA consistently +delivers commendable predictions, even in the face of up to 40% random +missingness, highlighting its robustness and potential for wide-scale +applications in data imputation. + +
+
+ comment: The source code for the experiments is publicly available at: + https://github.com/oudeng/FOSA/ +
+
+
+
+
+ + ☆ Open-set Face Recognition with Neural Ensemble, Maximal Entropy Loss and + Feature Augmentation + + +
+ Open-set face recognition refers to a scenario in which biometric systems +have incomplete knowledge of all existing subjects. Therefore, they are +expected to prevent face samples of unregistered subjects from being identified +as previously enrolled identities. This watchlist context adds an arduous +requirement that calls for the dismissal of irrelevant faces by focusing mainly +on subjects of interest. As a response, this work introduces a novel method +that associates an ensemble of compact neural networks with a margin-based cost +function that explores additional samples. Supplementary negative samples can +be obtained from external databases or synthetically built at the +representation level in training time with a new mix-up feature augmentation +approach. Deep neural networks pre-trained on large face datasets serve as the +preliminary feature extraction module. We carry out experiments on well-known +LFW and IJB-C datasets where results show that the approach is able to boost +closed and open-set identification rates. + +
+
+
+
+
+ + ♻ ☆ Tryage: Real-time, intelligent Routing of User Prompts to Large Language + Models + + +
+ The introduction of the transformer architecture and the self-attention +mechanism has led to an explosive production of language models trained on +specific downstream tasks and data domains. With over 200, 000 models in the +Hugging Face ecosystem, users grapple with selecting and optimizing models to +suit multifaceted workflows and data domains while addressing computational, +security, and recency concerns. There is an urgent need for machine learning +frameworks that can eliminate the burden of model selection and customization +and unleash the incredible power of the vast emerging model library for end +users. Here, we propose a context-aware routing system, Tryage, that leverages +a language model router for optimal selection of expert models from a model +library based on analysis of individual input prompts. Inspired by the thalamic +router in the brain, Tryage employs a perceptive router to predict down-stream +model performance on prompts and, then, makes a routing decision using an +objective function that integrates performance predictions with user goals and +constraints that are incorporated through flags (e.g., model size, model +recency). Tryage allows users to explore a Pareto front and automatically +trade-off between task accuracy and secondary goals including minimization of +model size, recency, security, verbosity, and readability. Across heterogeneous +data sets that include code, text, clinical data, and patents, the Tryage +framework surpasses Gorilla and GPT3.5 turbo in dynamic model selection +identifying the optimal model with an accuracy of 50.9% , compared to 23.6% by +GPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how +routing models can be applied to program and control the behavior of +multi-model LLM systems to maximize efficient use of the expanding and evolving +language model ecosystem. + +
+
+
+
+
+ + ♻ ☆ Towards Interactive Reinforcement Learning with Intrinsic Feedback + + +
+ Reinforcement learning (RL) and brain-computer interfaces (BCI) have +experienced significant growth over the past decade. With rising interest in +human-in-the-loop (HITL), incorporating human input with RL algorithms has +given rise to the sub-field of interactive RL. Adjacently, the field of BCI has +long been interested in extracting informative brain signals from neural +activity for use in human-computer interactions. A key link between these +fields lies in the interpretation of neural activity as feedback such that +interactive RL approaches can be employed. We denote this new and emerging +medium of feedback as intrinsic feedback. Despite intrinsic feedback's ability +to be conveyed automatically and even unconsciously, proper exploration +surrounding this key link has largely gone unaddressed by both communities. +Thus, to help facilitate a deeper understanding and a more effective +utilization, we provide a tutorial-style review covering the motivations, +approaches, and open problems of intrinsic feedback and its foundational +concepts. + +
+
+ comment: Name change and vast rewrites of the paper +
+
+
+
+
+ + ♻ ☆ The Common Intuition to Transfer Learning Can Win or Lose: Case Studies + for Linear Regression + + +
+ We study a fundamental transfer learning process from source to target linear +regression tasks, including overparameterized settings where there are more +learned parameters than data samples. The target task learning is addressed by +using its training data together with the parameters previously computed for +the source task. We define a transfer learning approach to the target task as a +linear regression optimization with a regularization on the distance between +the to-be-learned target parameters and the already-learned source parameters. +We analytically characterize the generalization performance of our transfer +learning approach and demonstrate its ability to resolve the peak in +generalization errors in double descent phenomena of the minimum L2-norm +solution to linear regression. Moreover, we show that for sufficiently related +tasks, the optimally tuned transfer learning approach can outperform the +optimally tuned ridge regression method, even when the true parameter vector +conforms to an isotropic Gaussian prior distribution. Namely, we demonstrate +that transfer learning can beat the minimum mean square error (MMSE) solution +of the independent target task. Our results emphasize the ability of transfer +learning to extend the solution space to the target task and, by that, to have +an improved MMSE solution. We formulate the linear MMSE solution to our +transfer learning setting and point out its key differences from the common +design philosophy to transfer learning. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Selective Labeling for More Effective Semi-Supervised + Learning ECCV 2022 + + +
+ Given an unlabeled dataset and an annotation budget, we study how to +selectively label a fixed number of instances so that semi-supervised learning +(SSL) on such a partially labeled dataset is most effective. We focus on +selecting the right data to label, in addition to usual SSL's propagating +labels from labeled data to the rest unlabeled data. This instance selection +task is challenging, as without any labeled data we do not know what the +objective of learning should be. Intuitively, no matter what the downstream +task is, instances to be labeled must be representative and diverse: The former +would facilitate label propagation to unlabeled data, whereas the latter would +ensure coverage of the entire dataset. We capture this idea by selecting +cluster prototypes, either in a pretrained feature space, or along with feature +optimization, both without labels. Our unsupervised selective labeling +consistently improves SSL methods over state-of-the-art active learning given +labeled data, by 8 to 25 times in label efficiency. For example, it boosts +FixMatch by 10% (14%) in accuracy on CIFAR-10 (ImageNet-1K) with 0.08% (0.2%) +labeled data, demonstrating that small computation spent on selecting what data +to label brings significant gain especially under a low annotation budget. Our +work sets a new standard for practical and efficient SSL. + +
+
+ comment: Accepted by ECCV 2022; Fixed a few typos +
+
+
+
+
+ + ♻ ☆ Towards Top-Down Automated Development in Limited Scopes: A + Neuro-Symbolic Framework from Expressibles to Executables + + +
+ Deep code generation is a topic of deep learning for software engineering +(DL4SE), which adopts neural models to generate code for the intended +functions. Since end-to-end neural methods lack domain knowledge and software +hierarchy awareness, they tend to perform poorly w.r.t project-level tasks. To +systematically explore the potential improvements of code generation, we let it +participate in the whole top-down development from \emph{expressibles} to +\emph{executables}, which is possible in limited scopes. In the process, it +benefits from massive samples, features, and knowledge. As the foundation, we +suggest building a taxonomy on code data, namely code taxonomy, leveraging the +categorization of code information. Moreover, we introduce a three-layer +semantic pyramid (SP) to associate text data and code data. It identifies the +information of different abstraction levels, and thus introduces the domain +knowledge on development and reveals the hierarchy of software. Furthermore, we +propose a semantic pyramid framework (SPF) as the approach, focusing on +software of high modularity and low complexity. SPF divides the code generation +process into stages and reserves spots for potential interactions. In addition, +we conceived preliminary applications in software development to confirm the +neuro-symbolic framework. + +
+
+ comment: 5 pages, 3 figures, 2 tables, accepted by ESEC/FSE 2023, the + camera-ready version +
+
+
+
+
+ + ♻ ☆ Emergent segmentation from participation dynamics and multi-learner + retraining + + +
+ The choice to participate in a data-driven service, often made on the basis +of quality of that service, influences the ability of the service to learn and +improve. We study the participation and retraining dynamics that arise when +both the learners and sub-populations of users are \emph{risk-reducing}, which +cover a broad class of updates including gradient descent, multiplicative +weights, etc. Suppose, for example, that individuals choose to spend their time +amongst social media platforms proportionally to how well each platform works +for them. Each platform also gathers data about its active users, which it uses +to update parameters with a gradient step. For this example and for our general +class of dynamics, we show that the only asymptotically stable equilibria are +segmented, with sub-populations allocated to a single learner. Under mild +assumptions, the utilitarian social optimum is a stable equilibrium. In +contrast to previous work, which shows that repeated risk minimization can +result in representation disparity and high overall loss for a single learner +\citep{hashimoto2018fairness,miller2021outside}, we find that repeated myopic +updates with multiple learners lead to better outcomes. We illustrate the +phenomena via a simulated example initialized from real data. + +
+
+
+
+
+ + ♻ ☆ ProtoBandit: Efficient Prototype Selection via Multi-Armed Bandits + + +
+ In this work, we propose a multi-armed bandit-based framework for identifying +a compact set of informative data instances (i.e., the prototypes) from a +source dataset $S$ that best represents a given target set $T$. Prototypical +examples of a given dataset offer interpretable insights into the underlying +data distribution and assist in example-based reasoning, thereby influencing +every sphere of human decision-making. Current state-of-the-art prototype +selection approaches require $O(|S||T|)$ similarity comparisons between source +and target data points, which becomes prohibitively expensive for large-scale +settings. We propose to mitigate this limitation by employing stochastic greedy +search in the space of prototypical examples and multi-armed bandits for +reducing the number of similarity comparisons. Our randomized algorithm, +ProtoBandit, identifies a set of $k$ prototypes incurring $O(k^3|S|)$ +similarity comparisons, which is independent of the size of the target set. An +interesting outcome of our analysis is for the $k$-medoids clustering problem +$T = S$ setting) in which we show that our algorithm ProtoBandit approximates +the BUILD step solution of the partitioning around medoids (PAM) method in +$O(k^3|S|)$ complexity. Empirically, we observe that ProtoBandit reduces the +number of similarity computation calls by several orders of magnitudes +($100-1000$ times) while obtaining solutions similar in quality to those from +state-of-the-art approaches. + +
+
+ comment: Erratum corrected +
+
+
+
+
+ + ♻ ☆ A Survey on Dataset Distillation: Approaches, Applications and Future + Directions + + +
+ Dataset distillation is attracting more attention in machine learning as +training sets continue to grow and the cost of training state-of-the-art models +becomes increasingly high. By synthesizing datasets with high information +density, dataset distillation offers a range of potential applications, +including support for continual learning, neural architecture search, and +privacy protection. Despite recent advances, we lack a holistic understanding +of the approaches and applications. Our survey aims to bridge this gap by first +proposing a taxonomy of dataset distillation, characterizing existing +approaches, and then systematically reviewing the data modalities, and related +applications. In addition, we summarize the challenges and discuss future +directions for this field of research. + +
+
+
+
+
+ + ♻ ☆ Learning Interpretable Dynamics from Images of a Freely Rotating 3D + Rigid Body + + +
+ In many real-world settings, image observations of freely rotating 3D rigid +bodies, such as satellites, may be available when low-dimensional measurements +are not. However, the high-dimensionality of image data precludes the use of +classical estimation techniques to learn the dynamics and a lack of +interpretability reduces the usefulness of standard deep learning methods. In +this work, we present a physics-informed neural network model to estimate and +predict 3D rotational dynamics from image sequences. We achieve this using a +multi-stage prediction pipeline that maps individual images to a latent +representation homeomorphic to $\mathbf{SO}(3)$, computes angular velocities +from latent pairs, and predicts future latent states using the Hamiltonian +equations of motion with a learned representation of the Hamiltonian. We +demonstrate the efficacy of our approach on a new rotating rigid-body dataset +with sequences of rotating cubes and rectangular prisms with uniform and +non-uniform density. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ AudioFormer: Audio Transformer learns audio feature representations from + discrete acoustic codes + + +
+ We propose a method named AudioFormer,which learns audio feature +representations through the acquisition of discrete acoustic codes and +subsequently fine-tunes them for audio classification tasks. Initially,we +introduce a novel perspective by considering the audio classification task as a +form of natural language understanding (NLU). Leveraging an existing neural +audio codec model,we generate discrete acoustic codes and utilize them to train +a masked language model (MLM),thereby obtaining audio feature representations. +Furthermore,we pioneer the integration of a Multi-Positive sample Contrastive +(MPC) learning approach. This method enables the learning of joint +representations among multiple discrete acoustic codes within the same audio +input. In our experiments,we treat discrete acoustic codes as textual data and +train a masked language model using a cloze-like methodology,ultimately +deriving high-quality audio representations. Notably,the MPC learning technique +effectively captures collaborative representations among distinct positive +samples. Our research outcomes demonstrate that AudioFormer attains +significantly improved performance compared to prevailing monomodal audio +classification models across multiple datasets,and even outperforms +audio-visual multimodal classification models on select datasets. +Specifically,our approach achieves remarkable results on datasets including +AudioSet (2M,20K),and FSD50K,with performance scores of 53.9,45.1,and +65.6,respectively. We have openly shared both the code and models: +https://github.com/LZH-0225/AudioFormer.git. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Domain Specific Question Answering Over Knowledge Graphs Using Logical + Programming and Large Language Models + + +
+ Answering questions over domain-specific graphs requires a tailored approach +due to the limited number of relations and the specific nature of the domain. +Our approach integrates classic logical programming languages into large +language models (LLMs), enabling the utilization of logical reasoning +capabilities to tackle the KGQA task. By representing the questions as Prolog +queries, which are readable and near close to natural language in +representation, we facilitate the generation of programmatically derived +answers. To validate the effectiveness of our approach, we evaluate it using a +well-known benchmark dataset, MetaQA. Our experimental results demonstrate that +our method achieves accurate identification of correct answer entities for all +test questions, even when trained on a small fraction of annotated data. +Overall, our work presents a promising approach to addressing question +answering over domain-specific graphs, offering an explainable and robust +solution by incorporating logical programming languages. + +
+
+
+
+
+ + ♻ ☆ On the link between generative semi-supervised learning and generative + open-set recognition + + +
+ This study investigates the relationship between semi-supervised learning +(SSL, which is training off partially labelled datasets) and open-set +recognition (OSR, which is classification with simultaneous novelty detection) +under the context of generative adversarial networks (GANs). Although no +previous study has formally linked SSL and OSR, their respective methods share +striking similarities. Specifically, SSL-GANs and OSR-GANs require their +generators to produce 'bad-looking' samples which are used to regularise their +classifier networks. We hypothesise that the definitions of bad-looking samples +in SSL and OSR represents the same concept and realises the same goal. More +formally, bad-looking samples lie in the complementary space, which is the area +between and around the boundaries of the labelled categories within the +classifier's embedding space. By regularising a classifier with samples in the +complementary space, classifiers achieve improved generalisation for SSL and +also generalise the open space for OSR. To test this hypothesis, we compare a +foundational SSL-GAN with the state-of-the-art OSR-GAN under the same SSL-OSR +experimental conditions. Our results find that SSL-GANs achieve near identical +results to OSR-GANs, proving the SSL-OSR link. Subsequently, to further this +new research path, we compare several SSL-GANs various SSL-OSR setups which +this first benchmark results. A combined framework of SSL-OSR certainly +improves the practicality and cost-efficiency of classifier training, and so +further theoretical and application studies are also discussed. + +
+
+
+
+
+ + ♻ ☆ Physics-informed neural networks with unknown measurement noise + + +
+ Physics-informed neural networks (PINNs) constitute a flexible approach to +both finding solutions and identifying parameters of partial differential +equations. Most works on the topic assume noiseless data, or data contaminated +by weak Gaussian noise. We show that the standard PINN framework breaks down in +case of non-Gaussian noise. We give a way of resolving this fundamental issue +and we propose to jointly train an energy-based model (EBM) to learn the +correct noise distribution. We illustrate the improved performance of our +approach using multiple examples. + +
+
+
+
+
+ + ♻ ☆ Randomized Coordinate Subgradient Method for Nonsmooth Composite + Optimization + + +
+ Coordinate-type subgradient methods for addressing nonsmooth optimization +problems are relatively underexplored due to the set-valued nature of the +subdifferential. In this work, our study focuses on nonsmooth composite +optimization problems, encompassing a wide class of convex and weakly convex +(nonconvex nonsmooth) problems. By utilizing the chain rule of the composite +structure properly, we introduce the Randomized Coordinate Subgradient method +(RCS) for tackling this problem class. To the best of our knowledge, this is +the first coordinate subgradient method for solving general nonsmooth composite +optimization problems. In theory, we consider the linearly bounded subgradients +assumption for the objective function, which is more general than the +traditional Lipschitz continuity assumption, to account for practical +scenarios. We then conduct convergence analysis for RCS in both convex and +weakly convex cases based on this generalized Lipschitz-type assumption. +Specifically, we establish the $\widetilde{\mathcal{O}}$$(1/\sqrt{k})$ +convergence rate in expectation and the $\tilde o(1/\sqrt{k})$ almost sure +asymptotic convergence rate in terms of the suboptimality gap when $f$ is +convex. For the case when $f$ is weakly convex and its subdifferential +satisfies the global metric subregularity property, we derive the +$\mathcal{O}(\varepsilon^{-4})$ iteration complexity in expectation. We also +establish an asymptotic convergence result. To justify the global metric +subregularity property utilized in the analysis, we establish this error bound +condition for the concrete (real-valued) robust phase retrieval problem. We +also provide a convergence lemma and the relationship between the global metric +subregularity properties of a weakly convex function and its Moreau envelope. +Finally, we conduct several experiments to demonstrate the possible superiority +of RCS over the subgradient method. + +
+
+
+
+
+ + ♻ ☆ Knowledge-Aware Federated Active Learning with Non-IID Data ICCV23 + + +
+ Federated learning enables multiple decentralized clients to learn +collaboratively without sharing the local training data. However, the expensive +annotation cost to acquire data labels on local clients remains an obstacle in +utilizing local data. In this paper, we propose a federated active learning +paradigm to efficiently learn a global model with limited annotation budget +while protecting data privacy in a decentralized learning way. The main +challenge faced by federated active learning is the mismatch between the active +sampling goal of the global model on the server and that of the asynchronous +local clients. This becomes even more significant when data is distributed +non-IID across local clients. To address the aforementioned challenge, we +propose Knowledge-Aware Federated Active Learning (KAFAL), which consists of +Knowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory +Federated Update (KCFU). KSAS is a novel active sampling method tailored for +the federated active learning problem. It deals with the mismatch challenge by +sampling actively based on the discrepancies between local and global models. +KSAS intensifies specialized knowledge in local clients, ensuring the sampled +data to be informative for both the local clients and the global model. KCFU, +in the meantime, deals with the client heterogeneity caused by limited data and +non-IID data distributions. It compensates for each client's ability in weak +classes by the assistance of the global model. Extensive experiments and +analyses are conducted to show the superiority of KSAS over the +state-of-the-art active learning methods and the efficiency of KCFU under the +federated active learning framework. + +
+
+ comment: 14 pages, 12 figures, ICCV23 +
+
+
+
+
+ + ♻ ☆ Exact Manifold Gaussian Variational Bayes + + +
+ We propose an optimization algorithm for Variational Inference (VI) in +complex models. Our approach relies on natural gradient updates where the +variational space is a Riemann manifold. We develop an efficient algorithm for +Gaussian Variational Inference that implicitly satisfies the positive definite +constraint on the variational covariance matrix. Our Exact manifold Gaussian +Variational Bayes (EMGVB) provides exact but simple update rules and is +straightforward to implement. Due to its black-box nature, EMGVB stands as a +ready-to-use solution for VI in complex models. Over five datasets, we +empirically validate our feasible approach on different statistical, +econometric, and deep learning models, discussing its performance with respect +to baseline methods. + +
+
+
+
+
+ + ♻ ☆ Estimating Driver Personality Traits from On-Road Driving Data + + +
+ This paper focuses on the estimation of a driver's psychological +characteristics using driving data for driving assistance systems. Driving +assistance systems that support drivers by adapting individual psychological +characteristics can provide appropriate feedback and prevent traffic accidents. +As a first step toward implementing such adaptive assistance systems, this +research aims to develop a model to estimate drivers' psychological +characteristics, such as cognitive function, psychological driving style, and +workload sensitivity, from on-road driving behavioral data using machine +learning and deep learning techniques. We also investigated the relationship +between driving behavior and various cognitive functions, including the Trail +Making Test (TMT) and Useful Field of View (UFOV) test, through regression +modeling. The proposed method focuses on road type information and captures +various durations of time-series data observed from driving behaviors. First, +we segment the driving time-series data into two road types, namely, arterial +roads and intersections, to consider driving situations. Second, we further +segment data into many sequences of various durations. Third, statistics are +calculated from each sequence. Finally, these statistics are used as input +features of machine learning models to estimate psychological characteristics. +The experimental results show that our model can estimate a driver's cognitive +function, namely, the TMT~(B) and UFOV test scores, with Pearson correlation +coefficients $r$ of 0.579 and 0.708, respectively. Some characteristics, such +as psychological driving style and workload sensitivity, are estimated with +high accuracy, but whether various duration segmentation improves accuracy +depends on the characteristics, and it is not effective for all +characteristics. + +
+
+
+
+
+ + ♻ ☆ Comparison of Machine Learning Methods for Assigning Software Issues to + Team Members + + +
+ Software issues contain units of work to fix, improve, or create new threads +during the development and facilitate communication among the team members. +Assigning an issue to the most relevant team member and determining a category +of an issue is a tedious and challenging task. Wrong classifications cause +delays and rework in the project and trouble among the team members. This paper +proposes a set of carefully curated linguistic features for shallow machine +learning methods and compares the performance of shallow and ensemble methods +with deep language models. Unlike the state-of-the-art, we assign issues to +four roles (designer, developer, tester, and leader) rather than to specific +individuals or teams to contribute to the generality of our solution. We also +consider the level of experience of the developers to reflect the industrial +practices in our solution formulation. We collect and annotate five industrial +data sets from one of the top three global television producers to evaluate our +proposal and compare it with deep language models. Our data sets contain 5324 +issues in total. We show that an ensemble classifier of shallow techniques +achieves 0.92 for issue assignment in accuracy which is statistically +comparable to the state-of-the-art deep language models. The contributions +include the public sharing of five annotated industrial issue data sets, the +development of a clear and comprehensive feature set, the introduction of a +novel label set, and the validation of the efficacy of an ensemble classifier +of shallow machine learning techniques. + +
+
+
+
+
+ + ♻ ☆ Deletion and Insertion Tests in Regression Models + + +
+ A basic task in explainable AI (XAI) is to identify the most important +features behind a prediction made by a black box function $f$. The insertion +and deletion tests of Petsiuk et al. (2018) can be used to judge the quality of +algorithms that rank pixels from most to least important for a classification. +Motivated by regression problems we establish a formula for their area under +the curve (AUC) criteria in terms of certain main effects and interactions in +an anchored decomposition of $f$. We find an expression for the expected value +of the AUC under a random ordering of inputs to $f$ and propose an alternative +area above a straight line for the regression setting. We use this criterion to +compare feature importances computed by integrated gradients (IG) to those +computed by Kernel SHAP (KS) as well as LIME, DeepLIFT, vanilla gradient and +input$\times$gradient methods. KS has the best overall performance in two +datasets we consider but it is very expensive to compute. We find that IG is +nearly as good as KS while being much faster. Our comparison problems include +some binary inputs that pose a challenge to IG because it must use values +between the possible variable levels and so we consider ways to handle binary +variables in IG. We show that sorting variables by their Shapley value does not +necessarily give the optimal ordering for an insertion-deletion test. It will +however do that for monotone functions of additive models, such as logistic +regression. + +
+
+
+
+
+ + ♻ ☆ Dirac signal processing of higher-order topological signals + + +
+ Higher-order networks can sustain topological signals which are variables +associated not only to the nodes, but also to the links, to the triangles and +in general to the higher dimensional simplices of simplicial complexes. These +topological signals can describe a large variety of real systems including +currents in the ocean, synaptic currents between neurons and biological +transportation networks. In real scenarios topological signal data might be +noisy and an important task is to process these signals by improving their +signal to noise ratio. So far topological signals are typically processed +independently of each other. For instance, node signals are processed +independently of link signals, and algorithms that can enforce a consistent +processing of topological signals across different dimensions are largely +lacking. Here we propose Dirac signal processing, an adaptive, unsupervised +signal processing algorithm that learns to jointly filter topological signals +supported on nodes, links and triangles of simplicial complexes in a consistent +way. The proposed Dirac signal processing algorithm is formulated in terms of +the discrete Dirac operator which can be interpreted as "square root" of a +higher-order Hodge Laplacian. We discuss in detail the properties of the Dirac +operator including its spectrum and the chirality of its eigenvectors and we +adopt this operator to formulate Dirac signal processing that can filter noisy +signals defined on nodes, links and triangles of simplicial complexes. We test +our algorithms on noisy synthetic data and noisy data of drifters in the ocean +and find that the algorithm can learn to efficiently reconstruct the true +signals outperforming algorithms based exclusively on the Hodge Laplacian. + +
+
+ comment: (26 pages, 12 figures) +
+
+
+
+
+ + ♻ ☆ MARLlib: A Scalable and Efficient Multi-agent Reinforcement Learning + Library + + +
+ A significant challenge facing researchers in the area of multi-agent +reinforcement learning (MARL) pertains to the identification of a library that +can offer fast and compatible development for multi-agent tasks and algorithm +combinations, while obviating the need to consider compatibility issues. In +this paper, we present MARLlib, a library designed to address the +aforementioned challenge by leveraging three key mechanisms: 1) a standardized +multi-agent environment wrapper, 2) an agent-level algorithm implementation, +and 3) a flexible policy mapping strategy. By utilizing these mechanisms, +MARLlib can effectively disentangle the intertwined nature of the multi-agent +task and the learning process of the algorithm, with the ability to +automatically alter the training strategy based on the current task's +attributes. The MARLlib library's source code is publicly accessible on GitHub: +\url{https://github.com/Replicable-MARL/MARLlib}. + +
+
+
+
+
+ + ♻ ☆ Riemannian Hamiltonian methods for min-max optimization on manifolds + + +
+ In this paper, we study min-max optimization problems on Riemannian +manifolds. We introduce a Riemannian Hamiltonian function, minimization of +which serves as a proxy for solving the original min-max problems. Under the +Riemannian Polyak--{\L}ojasiewicz condition on the Hamiltonian function, its +minimizer corresponds to the desired min-max saddle point. We also provide +cases where this condition is satisfied. For geodesic-bilinear optimization in +particular, solving the proxy problem leads to the correct search direction +towards global optimality, which becomes challenging with the min-max +formulation. To minimize the Hamiltonian function, we propose Riemannian +Hamiltonian methods (RHM) and present their convergence analyses. We extend RHM +to include consensus regularization and to the stochastic setting. We +illustrate the efficacy of the proposed RHM in applications such as subspace +robust Wasserstein distance, robust training of neural networks, and generative +adversarial networks. + +
+
+
+
+
+ + ♻ ☆ Minimalist Traffic Prediction: Linear Layer Is All You Need + + +
+ Traffic prediction is essential for the progression of Intelligent +Transportation Systems (ITS) and the vision of smart cities. While +Spatial-Temporal Graph Neural Networks (STGNNs) have shown promise in this +domain by leveraging Graph Neural Networks (GNNs) integrated with either RNNs +or Transformers, they present challenges such as computational complexity, +gradient issues, and resource-intensiveness. This paper addresses these +challenges, advocating for three main solutions: a node-embedding approach, +time series decomposition, and periodicity learning. We introduce STLinear, a +minimalist model architecture designed for optimized efficiency and +performance. Unlike traditional STGNNs, STlinear operates fully locally, +avoiding inter-node data exchanges, and relies exclusively on linear layers, +drastically cutting computational demands. Our empirical studies on real-world +datasets confirm STLinear's prowess, matching or exceeding the accuracy of +leading STGNNs, but with significantly reduced complexity and computation +overhead (more than 95% reduction in MACs per epoch compared to +state-of-the-art STGNN baseline published in 2023). In summary, STLinear +emerges as a potent, efficient alternative to conventional STGNNs, with +profound implications for the future of ITS and smart city initiatives. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ UTRNet: High-Resolution Urdu Text Recognition In Printed Documents ICDAR 2023 + + +
+ In this paper, we propose a novel approach to address the challenges of +printed Urdu text recognition using high-resolution, multi-scale semantic +feature extraction. Our proposed UTRNet architecture, a hybrid CNN-RNN model, +demonstrates state-of-the-art performance on benchmark datasets. To address the +limitations of previous works, which struggle to generalize to the intricacies +of the Urdu script and the lack of sufficient annotated real-world data, we +have introduced the UTRSet-Real, a large-scale annotated real-world dataset +comprising over 11,000 lines and UTRSet-Synth, a synthetic dataset with 20,000 +lines closely resembling real-world and made corrections to the ground truth of +the existing IIITH dataset, making it a more reliable resource for future +research. We also provide UrduDoc, a benchmark dataset for Urdu text line +detection in scanned documents. Additionally, we have developed an online tool +for end-to-end Urdu OCR from printed documents by integrating UTRNet with a +text detection model. Our work not only addresses the current limitations of +Urdu OCR but also paves the way for future research in this area and +facilitates the continued advancement of Urdu OCR technology. The project page +with source code, datasets, annotations, trained models, and online tool is +available at abdur75648.github.io/UTRNet. + +
+
+ comment: Accepted at The 17th International Conference on Document Analysis + and Recognition (ICDAR 2023) +
+
+
+
+
+ + ♻ ☆ Forward-Backward Reasoning in Large Language Models for Verification + + +
+ Chain-of-Though (CoT) prompting has shown promising performance in various +reasoning tasks. Recently, Self-Consistency \citep{wang2023selfconsistency} +proposes to sample a diverse set of reasoning chains which may lead to +different answers while the answer that receives the most votes is selected. In +this paper, we propose a novel method to use backward reasoning in verifying +candidate answers. We mask a token in the question by ${\bf x}$ and ask the LLM +to predict the masked token when a candidate answer is provided by \textit{a +simple template}, i.e., "\textit{\textbf{If we know the answer of the above +question is \{a candidate answer\}, what is the value of unknown variable ${\bf +x}$?}}" Intuitively, the LLM is expected to predict the masked token +successfully if the provided candidate answer is correct. We further propose +FOBAR to combine forward and backward reasoning for estimating the probability +of candidate answers. We conduct extensive experiments on six data sets and +three LLMs. Experimental results demonstrate that FOBAR achieves +state-of-the-art performance on various reasoning benchmarks. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Learning to Generalize towards Unseen Domains via a Content-Aware Style + Invariant Model for Disease Detection from Chest X-rays + + +
+ Performance degradation due to source domain mismatch is a longstanding +challenge in deep learning-based medical image analysis, particularly for chest +X-rays (CXRs). Several methods (e.g., adversarial training, multi-domain +mixups) have been proposed to extract domain-invariant high-level features to +address this domain shift. However, these methods do not explicitly regularize +the content and style characteristics of the extracted domain-invariant +features. Recent studies have demonstrated that CNN models exhibit a strong +bias toward styles (e.g., uninformative textures) rather than content (e.g., +shape), in stark contrast to the human-vision system. Radiologists tend to +learn visual cues from CXRs and thus perform well across multiple domains. +Therefore, in medical imaging for pathology diagnosis from CXR images, models +should extract domain-invariant features that are style-invariant and +content-biased. Motivated by this, we employ the novel style randomization +modules (SRMs) at both image and feature levels that work together +hierarchically to create rich style perturbed features on the fly while keeping +the content intact. In addition, we leverage consistency regularizations +between global semantic features and predicted probability distributions, +respectively, for with and without style perturbed versions of the same CXR +image to tweak the model's sensitivity toward content markers for accurate +predictions. Extensive experiments with three large-scale thoracic disease +datasets, i.e., CheXpert, MIMIC-CXR, and BRAX, demonstrate that our proposed +framework is more robust in the presence of domain shift and achieves +state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ Phase-shifted Adversarial Training UAI 2023 + + +
+ Adversarial training has been considered an imperative component for safely +deploying neural network-based applications to the real world. To achieve +stronger robustness, existing methods primarily focus on how to generate strong +attacks by increasing the number of update steps, regularizing the models with +the smoothed loss function, and injecting the randomness into the attack. +Instead, we analyze the behavior of adversarial training through the lens of +response frequency. We empirically discover that adversarial training causes +neural networks to have low convergence to high-frequency information, +resulting in highly oscillated predictions near each data. To learn +high-frequency contents efficiently and effectively, we first prove that a +universal phenomenon of frequency principle, i.e., \textit{lower frequencies +are learned first}, still holds in adversarial training. Based on that, we +propose phase-shifted adversarial training (PhaseAT) in which the model learns +high-frequency components by shifting these frequencies to the low-frequency +range where the fast convergence occurs. For evaluations, we conduct the +experiments on CIFAR-10 and ImageNet with the adaptive attack carefully +designed for reliable evaluation. Comprehensive results show that PhaseAT +significantly improves the convergence for high-frequency information. This +results in improved adversarial robustness by enabling the model to have +smoothed predictions near each data. + +
+
+ comment: Conference on Uncertainty in Artificial Intelligence, 2023 (UAI 2023) +
+
+
+
+
+ + ♻ ☆ A Rigorous Uncertainty-Aware Quantification Framework Is Essential for + Reproducible and Replicable Machine Learning Workflows + + +
+ The ability to replicate predictions by machine learning (ML) or artificial +intelligence (AI) models and results in scientific workflows that incorporate +such ML/AI predictions is driven by numerous factors. An uncertainty-aware +metric that can quantitatively assess the reproducibility of quantities of +interest (QoI) would contribute to the trustworthiness of results obtained from +scientific workflows involving ML/AI models. In this article, we discuss how +uncertainty quantification (UQ) in a Bayesian paradigm can provide a general +and rigorous framework for quantifying reproducibility for complex scientific +workflows. Such as framework has the potential to fill a critical gap that +currently exists in ML/AI for scientific workflows, as it will enable +researchers to determine the impact of ML/AI model prediction variability on +the predictive outcomes of ML/AI-powered workflows. We expect that the +envisioned framework will contribute to the design of more reproducible and +trustworthy workflows for diverse scientific applications, and ultimately, +accelerate scientific discoveries. + +
+
+
+
+
+ + ♻ ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning aims to capture comprehensive information +from multiple views of a shared context. Recent works intuitively apply +contrastive learning to different views in a pairwise manner, which is still +scalable: view-specific noise is not filtered in learning view-shared +representations; the fake negative pairs, where the negative terms are actually +within the same class as the positive, and the real negative pairs are +coequally treated; evenly measuring the similarities between terms might +interfere with optimization. Importantly, few works study the theoretical +framework of generalized self-supervised multi-view learning, especially for +more than two views. To this end, we rethink the existing multi-view learning +paradigm from the perspective of information theory and then propose a novel +information theoretical framework for generalized multi-view learning. Guided +by it, we build a multi-view coding method with a three-tier progressive +architecture, namely Information theory-guided hierarchical Progressive +Multi-view Coding (IPMC). In the distribution-tier, IPMC aligns the +distribution between views to reduce view-specific noise. In the set-tier, IPMC +constructs self-adjusted contrasting pools, which are adaptively modified by a +view filter. Lastly, in the instance-tier, we adopt a designed unified loss to +learn representations and reduce the gradient interference. Theoretically and +empirically, we demonstrate the superiority of IPMC over state-of-the-art +methods. + +
+
+ comment: This paper is accepted by the jourcal of Neural Networks (Elsevier) + by 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344 +
+
+
+
+
+ + ♻ ☆ Designing an attack-defense game: how to increase robustness of + financial transaction models via a competition + + +
+ Given the escalating risks of malicious attacks in the finance sector and the +consequential severe damage, a thorough understanding of adversarial strategies +and robust defense mechanisms for machine learning models is critical. The +threat becomes even more severe with the increased adoption in banks more +accurate, but potentially fragile neural networks. We aim to investigate the +current state and dynamics of adversarial attacks and defenses for neural +network models that use sequential financial data as the input. + To achieve this goal, we have designed a competition that allows realistic +and detailed investigation of problems in modern financial transaction data. +The participants compete directly against each other, so possible attacks and +defenses are examined in close-to-real-life conditions. Our main contributions +are the analysis of the competition dynamics that answers the questions on how +important it is to conceal a model from malicious users, how long does it take +to break it, and what techniques one should use to make it more robust, and +introduction additional way to attack models or increase their robustness. + Our analysis continues with a meta-study on the used approaches with their +power, numerical experiments, and accompanied ablations studies. We show that +the developed attacks and defenses outperform existing alternatives from the +literature while being practical in terms of execution, proving the validity of +the competition as a tool for uncovering vulnerabilities of machine learning +models and mitigating them in various domains. + +
+
+
+
+
+ + ♻ ☆ Task Relation-aware Continual User Representation Learning KDD 2023 + + +
+ User modeling, which learns to represent users into a low-dimensional +representation space based on their past behaviors, got a surge of interest +from the industry for providing personalized services to users. Previous +efforts in user modeling mainly focus on learning a task-specific user +representation that is designed for a single task. However, since learning +task-specific user representations for every task is infeasible, recent studies +introduce the concept of universal user representation, which is a more +generalized representation of a user that is relevant to a variety of tasks. +Despite their effectiveness, existing approaches for learning universal user +representations are impractical in real-world applications due to the data +requirement, catastrophic forgetting and the limited learning capability for +continually added tasks. In this paper, we propose a novel continual user +representation learning method, called TERACON, whose learning capability is +not limited as the number of learned tasks increases while capturing the +relationship between the tasks. The main idea is to introduce an embedding for +each task, i.e., task embedding, which is utilized to generate task-specific +soft masks that not only allow the entire model parameters to be updated until +the end of training sequence, but also facilitate the relationship between the +tasks to be captured. Moreover, we introduce a novel knowledge retention module +with pseudo-labeling strategy that successfully alleviates the long-standing +problem of continual learning, i.e., catastrophic forgetting. Extensive +experiments on public and proprietary real-world datasets demonstrate the +superiority and practicality of TERACON. Our code is available at +https://github.com/Sein-Kim/TERACON. + +
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ Regret-Based Optimization for Robust Reinforcement Learning + + +
+ Deep Reinforcement Learning (DRL) policies have been shown to be vulnerable +to small adversarial noise in observations. Such adversarial noise can have +disastrous consequences in safety-critical environments. For instance, a +self-driving car receiving adversarially perturbed sensory observations about +nearby signs (e.g., a stop sign physically altered to be perceived as a speed +limit sign) or objects (e.g., cars altered to be recognized as trees) can be +fatal. Existing approaches for making RL algorithms robust to an +observation-perturbing adversary have focused on reactive approaches that +iteratively improve against adversarial examples generated at each iteration. +While such approaches have been shown to provide improvements over regular RL +methods, they are reactive and can fare significantly worse if certain +categories of adversarial examples are not generated during training. To that +end, we pursue a more proactive approach that relies on directly optimizing a +well-studied robustness measure, regret instead of expected value. We provide a +principled approach that minimizes maximum regret over a "neighborhood" of +observations to the received "observation". Our regret criterion can be used to +modify existing value- and policy-based Deep RL methods. We demonstrate that +our approaches provide a significant improvement in performance across a wide +variety of benchmarks against leading approaches for robust Deep RL. + +
+
+
+
+
+ + ♻ ☆ BallGAN: 3D-aware Image Synthesis with a Spherical Background ICCV 2023 + + +
+ 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be +rendered in arbitrary perspectives to produce images. Although previous methods +produce realistic images, they suffer from unstable training or degenerate +solutions where the 3D geometry is unnatural. We hypothesize that the 3D +geometry is underdetermined due to the insufficient constraint, i.e., being +classified as real image to the discriminator is not enough. To solve this +problem, we propose to approximate the background as a spherical surface and +represent a scene as a union of the foreground placed in the sphere and the +thin spherical background. It reduces the degree of freedom in the background +field. Accordingly, we modify the volume rendering equation and incorporate +dedicated constraints to design a novel 3D-aware GAN framework named BallGAN. +BallGAN has multiple advantages as follows. 1) It produces more reasonable 3D +geometry; the images of a scene across different viewpoints have better +photometric consistency and fidelity than the state-of-the-art methods. 2) The +training becomes much more stable. 3) The foreground can be separately rendered +on top of different arbitrary backgrounds. + +
+
+ comment: ICCV 2023, Project Page: https://minjung-s.github.io/ballgan +
+
+
+
+
+ + ♻ ☆ Self-consistency for open-ended generations + + +
+ Large Language Models (LLMs) can exhibit considerable variation in the +quality of their sampled outputs. Reranking and selecting the best generation +from the sampled set is a popular way of obtaining strong gains in generation +quality. In this paper, we present a novel approach for reranking LLM +generations. Unlike other techniques that might involve additional inferences +or training a specialized reranker, our approach relies on easy to compute +pairwise statistics between the generations that have minimal compute overhead. +We show that our approach can be formalized as an extension of self-consistency +and analyze its performance in that framework, theoretically as well as via +simulations. We show strong improvements for selecting the best $k$ generations +for code generation tasks as well as robust improvements for best generation +for the tasks of autoformalization, and summarization. While our approach only +assumes black-box access to LLMs, we show that additional access to token +probabilities can improve performance even further. + +
+
+
+
+
+ + ♻ ☆ Deep Residual Error and Bag-of-Tricks Learning for Gravitational Wave + Surrogate Modeling + + +
+ Deep learning methods have been employed in gravitational-wave astronomy to +accelerate the construction of surrogate waveforms for the inspiral of +spin-aligned black hole binaries, among other applications. We face the +challenge of modeling the residual error of an artificial neural network that +models the coefficients of the surrogate waveform expansion (especially those +of the phase of the waveform) which we demonstrate has sufficient structure to +be learnable by a second network. Adding this second network, we were able to +reduce the maximum mismatch for waveforms in a validation set by 13.4 times. We +also explored several other ideas for improving the accuracy of the surrogate +model, such as the exploitation of similarities between waveforms, the +augmentation of the training set, the dissection of the input space, using +dedicated networks per output coefficient and output augmentation. In several +cases, small improvements can be observed, but the most significant improvement +still comes from the addition of a second network that models the residual +error. Since the residual error for more general surrogate waveform models +(when e.g., eccentricity is included) may also have a specific structure, one +can expect our method to be applicable to cases where the gain in accuracy +could lead to significant gains in computational time. + +
+
+
+
+
+ + ♻ ☆ Foundation Model-oriented Robustness: Robust Image Model Evaluation with + Pretrained Models + + +
+ Machine learning has demonstrated remarkable performance over finite +datasets, yet whether the scores over the fixed benchmarks can sufficiently +indicate the model's performance in the real world is still in discussion. In +reality, an ideal robust model will probably behave similarly to the oracle +(e.g., the human users), thus a good evaluation protocol is probably to +evaluate the models' behaviors in comparison to the oracle. In this paper, we +introduce a new robustness measurement that directly measures the image +classification model's performance compared with a surrogate oracle (i.e., a +foundation model). Besides, we design a simple method that can accomplish the +evaluation beyond the scope of the benchmarks. Our method extends the image +datasets with new samples that are sufficiently perturbed to be distinct from +the ones in the original sets, but are still bounded within the same +image-label structure the original test image represents, constrained by a +foundation model pretrained with a large amount of samples. As a result, our +new method will offer us a new way to evaluate the models' robustness +performance, free of limitations of fixed benchmarks or constrained +perturbations, although scoped by the power of the oracle. In addition to the +evaluation results, we also leverage our generated data to understand the +behaviors of the model and our new evaluation strategies. + +
+
+
+
+
+ + ♻ ☆ Temporal Saliency Detection Towards Explainable Transformer-based + Timeseries Forecasting + + +
+ Despite the notable advancements in numerous Transformer-based models, the +task of long multi-horizon time series forecasting remains a persistent +challenge, especially towards explainability. Focusing on commonly used +saliency maps in explaining DNN in general, our quest is to build +attention-based architecture that can automatically encode saliency-related +temporal patterns by establishing connections with appropriate attention heads. +Hence, this paper introduces Temporal Saliency Detection (TSD), an effective +approach that builds upon the attention mechanism and applies it to +multi-horizon time series prediction. While our proposed architecture adheres +to the general encoder-decoder structure, it undergoes a significant renovation +in the encoder component, wherein we incorporate a series of information +contracting and expanding blocks inspired by the U-Net style architecture. The +TSD approach facilitates the multiresolution analysis of saliency patterns by +condensing multi-heads, thereby progressively enhancing the forecasting of +complex time series data. Empirical evaluations illustrate the superiority of +our proposed approach compared to other models across multiple standard +benchmark datasets in diverse far-horizon forecasting settings. The initial TSD +achieves substantial relative improvements of 31% and 46% over several models +in the context of multivariate and univariate prediction. We believe the +comprehensive investigations presented in this study will offer valuable +insights and benefits to future research endeavors. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Federated Learning in Big Model Era: Domain-Specific Multimodal Large + Models + + +
+ Multimodal data, which can comprehensively perceive and recognize the +physical world, has become an essential path towards general artificial +intelligence. However, multimodal large models trained on public datasets often +underperform in specific industrial domains. This paper proposes a multimodal +federated learning framework that enables multiple enterprises to utilize +private domain data to collaboratively train large models for vertical domains, +achieving intelligent services across scenarios. The authors discuss in-depth +the strategic transformation of federated learning in terms of intelligence +foundation and objectives in the era of big model, as well as the new +challenges faced in heterogeneous data, model aggregation, performance and cost +trade-off, data privacy, and incentive mechanism. The paper elaborates a case +study of leading enterprises contributing multimodal data and expert knowledge +to city safety operation management , including distributed deployment and +efficient coordination of the federated learning platform, technical +innovations on data quality improvement based on large model capabilities and +efficient joint fine-tuning approaches. Preliminary experiments show that +enterprises can enhance and accumulate intelligent capabilities through +multimodal model federated learning, thereby jointly creating an smart city +model that provides high-quality intelligent services covering energy +infrastructure safety, residential community security, and urban operation +management. The established federated learning cooperation ecosystem is +expected to further aggregate industry, academia, and research resources, +realize large models in multiple vertical domains, and promote the large-scale +industrial application of artificial intelligence and cutting-edge research on +multimodal federated learning. + +
+
+
+
+
+ + ♻ ☆ Traffic Forecasting on New Roads Unseen in the Training Data Using + Spatial Contrastive Pre-Training ECML + + +
+ New roads are being constructed all the time. However, the capabilities of +previous deep forecasting models to generalize to new roads not seen in the +training data (unseen roads) are rarely explored. In this paper, we introduce a +novel setup called a spatio-temporal (ST) split to evaluate the models' +capabilities to generalize to unseen roads. In this setup, the models are +trained on data from a sample of roads, but tested on roads not seen in the +training data. Moreover, we also present a novel framework called Spatial +Contrastive Pre-Training (SCPT) where we introduce a spatial encoder module to +extract latent features from unseen roads during inference time. This spatial +encoder is pre-trained using contrastive learning. During inference, the +spatial encoder only requires two days of traffic data on the new roads and +does not require any re-training. We also show that the output from the spatial +encoder can be used effectively to infer latent node embeddings on unseen roads +during inference time. The SCPT framework also incorporates a new layer, named +the spatially gated addition (SGA) layer, to effectively combine the latent +features from the output of the spatial encoder to existing backbones. +Additionally, since there is limited data on the unseen roads, we argue that it +is better to decouple traffic signals to trivial-to-capture periodic signals +and difficult-to-capture Markovian signals, and for the spatial encoder to only +learn the Markovian signals. Finally, we empirically evaluated SCPT using the +ST split setup on four real-world datasets. The results showed that adding SCPT +to a backbone consistently improves forecasting performance on unseen roads. +More importantly, the improvements are greater when forecasting further into +the future. The codes are available on GitHub: +https://github.com/cruiseresearchgroup/forecasting-on-new-roads . + +
+
+ comment: 25 pages including reference, an additional 3 pages of appendix, 8 + figures. ECML PKDD 2023 Journal track special issue: Data Mining and + Knowledge Discovery (DAMI) +
+
+
+
+
+ + ♻ ☆ A Structured Span Selector NAACL 2022 + + +
+ Many natural language processing tasks, e.g., coreference resolution and +semantic role labeling, require selecting text spans and making decisions about +them. A typical approach to such tasks is to score all possible spans and +greedily select spans for task-specific downstream processing. This approach, +however, does not incorporate any inductive bias about what sort of spans ought +to be selected, e.g., that selected spans tend to be syntactic constituents. In +this paper, we propose a novel grammar-based structured span selection model +which learns to make use of the partial span-level annotation provided for such +problems. Compared to previous approaches, our approach gets rid of the +heuristic greedy span selection scheme, allowing us to model the downstream +task on an optimal set of spans. We evaluate our model on two popular span +prediction tasks: coreference resolution and semantic role labeling. We show +empirical improvements on both. + +
+
+ comment: NAACL 2022 camera-ready +
+
+
+
+
+ + ♻ ☆ Pruning Deep Neural Networks from a Sparsity Perspective ICLR 2023 + + +
+ In recent years, deep network pruning has attracted significant attention in +order to enable the rapid deployment of AI into small devices with computation +and memory constraints. Pruning is often achieved by dropping redundant +weights, neurons, or layers of a deep network while attempting to retain a +comparable test performance. Many deep pruning algorithms have been proposed +with impressive empirical success. However, existing approaches lack a +quantifiable measure to estimate the compressibility of a sub-network during +each pruning iteration and thus may under-prune or over-prune the model. In +this work, we propose PQ Index (PQI) to measure the potential compressibility +of deep neural networks and use this to develop a Sparsity-informed Adaptive +Pruning (SAP) algorithm. Our extensive experiments corroborate the hypothesis +that for a generic pruning procedure, PQI decreases first when a large model is +being effectively regularized and then increases when its compressibility +reaches a limit that appears to correspond to the beginning of underfitting. +Subsequently, PQI decreases again when the model collapse and significant +deterioration in the performance of the model start to occur. Additionally, our +experiments demonstrate that the proposed adaptive pruning algorithm with +proper choice of hyper-parameters is superior to the iterative pruning +algorithms such as the lottery ticket-based pruning methods, in terms of both +compression efficiency and robustness. + +
+
+ comment: ICLR 2023 +
+
+
+
+
+ + ♻ ☆ AdaTerm: Adaptive T-Distribution Estimated Robust Moments for + Noise-Robust Stochastic Gradient Optimization + + +
+ With the increasing practicality of deep learning applications, practitioners +are inevitably faced with datasets corrupted by noise from various sources such +as measurement errors, mislabeling, and estimated surrogate inputs/outputs that +can adversely impact the optimization results. It is a common practice to +improve the optimization algorithm's robustness to noise, since this algorithm +is ultimately in charge of updating the network parameters. Previous studies +revealed that the first-order moment used in Adam-like stochastic gradient +descent optimizers can be modified based on the Student's t-distribution. While +this modification led to noise-resistant updates, the other associated +statistics remained unchanged, resulting in inconsistencies in the assumed +models. In this paper, we propose AdaTerm, a novel approach that incorporates +the Student's t-distribution to derive not only the first-order moment but also +all the associated statistics. This provides a unified treatment of the +optimization process, offering a comprehensive framework under the statistical +model of the t-distribution for the first time. The proposed approach offers +several advantages over previously proposed approaches, including reduced +hyperparameters and improved robustness and adaptability. This noise-adaptive +behavior contributes to AdaTerm's exceptional learning performance, as +demonstrated through various optimization problems with different and/or +unknown noise ratios. Furthermore, we introduce a new technique for deriving a +theoretical regret bound without relying on AMSGrad, providing a valuable +contribution to the field + +
+
+ comment: 27 pages; Final version accepted by Elsevier Neurocomputing Journal + (2023-08; https://doi.org/10.1016/j.neucom.2023.126692) +
+
+
+
+
+ + ♻ ☆ An ML approach to resolution of singularities ICML + + +
+ The solution set of a system of polynomial equations typically contains +ill-behaved, singular points. Resolution is a fundamental process in geometry +in which we replace singular points with smooth points, while keeping the rest +of the solution set unchanged. Resolutions are not unique: the usual way to +describe them involves repeatedly performing a fundamental operation known as +"blowing-up", and the complexity of the resolution highly depends on certain +choices. The process can be translated into various versions of a 2-player +game, the so-called Hironaka game, and a winning strategy for the first player +provides a solution to the resolution problem. In this paper we introduce a new +approach to the Hironaka game that uses reinforcement learning agents to find +optimal resolutions of singularities. In certain domains, the trained model +outperforms state-of-the-art selection heuristics in total number of polynomial +additions performed, which provides a proof-of-concept that recent developments +in machine learning have the potential to improve performance of algorithms in +symbolic computation. + +
+
+ comment: To appear in Proceedings of the 40th International Conference on + Machine Learning TAG Workshop (ICML-TAG 2023) +
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge +generative models and have sparked a revolutionary impact on various aspects of +human life. However, the practical implementation of these models has also +exposed inherent risks, highlighting their dual nature and raising concerns +regarding their trustworthiness. Despite the abundance of literature on this +subject, a comprehensive survey specifically delving into the intersection of +large-scale generative models and their trustworthiness remains largely absent. +To bridge this gap, This paper investigates both the long-standing and emerging +threats associated with these models across four fundamental dimensions: +privacy, security, fairness, and responsibility. In this way, we construct an +extensive map outlining the trustworthiness of these models, while also +providing practical recommendations and identifying future directions. These +efforts are crucial for promoting the trustworthy deployment of these models, +ultimately benefiting society as a whole. + +
+
+ comment: Draft Version +
+
+
+
+
+ + ♻ ☆ Stability of Aggregation Graph Neural Networks + + +
+ In this paper we study the stability properties of aggregation graph neural +networks (Agg-GNNs) considering perturbations of the underlying graph. An +Agg-GNN is a hybrid architecture where information is defined on the nodes of a +graph, but it is processed block-wise by Euclidean CNNs on the nodes after +several diffusions on the graph shift operator. We derive stability bounds for +the mapping operator associated to a generic Agg-GNN, and we specify conditions +under which such operators can be stable to deformations. We prove that the +stability bounds are defined by the properties of the filters in the first +layer of the CNN that acts on each node. Additionally, we show that there is a +close relationship between the number of aggregations, the filter's +selectivity, and the size of the stability constants. We also conclude that in +Agg-GNNs the selectivity of the mapping operators is tied to the properties of +the filters only in the first layer of the CNN stage. This shows a substantial +difference with respect to the stability properties of selection GNNs, where +the selectivity of the filters in all layers is constrained by their stability. +We provide numerical evidence corroborating the results derived, testing the +behavior of Agg-GNNs in real life application scenarios considering +perturbations of different magnitude. + +
+
+
+
+
+ + ♻ ☆ On the Choice of Perception Loss Function for Learned Video Compression + + +
+ We study causal, low-latency, sequential video compression when the output is +subjected to both a mean squared-error (MSE) distortion loss as well as a +perception loss to target realism. Motivated by prior approaches, we consider +two different perception loss functions (PLFs). The first, PLF-JD, considers +the joint distribution (JD) of all the video frames up to the current one, +while the second metric, PLF-FMD, considers the framewise marginal +distributions (FMD) between the source and reconstruction. Using information +theoretic analysis and deep-learning based experiments, we demonstrate that the +choice of PLF can have a significant effect on the reconstruction, especially +at low-bit rates. In particular, while the reconstruction based on PLF-JD can +better preserve the temporal correlation across frames, it also imposes a +significant penalty in distortion compared to PLF-FMD and further makes it more +difficult to recover from errors made in the earlier output frames. Although +the choice of PLF decisively affects reconstruction quality, we also +demonstrate that it may not be essential to commit to a particular PLF during +encoding and the choice of PLF can be delegated to the decoder. In particular, +encoded representations generated by training a system to minimize the MSE +(without requiring either PLF) can be {\em near universal} and can generate +close to optimal reconstructions for either choice of PLF at the decoder. We +validate our results using (one-shot) information-theoretic analysis, detailed +study of the rate-distortion-perception tradeoff of the Gauss-Markov source +model as well as deep-learning based experiments on moving MNIST and KTH +datasets. + +
+
+
+
+
+ + ♻ ☆ Self-supervised learning based general laboratory progress pretrained + model for cardiovascular event detection + + +
+ The inherent nature of patient data poses several challenges. Prevalent cases +amass substantial longitudinal data owing to their patient volume and +consistent follow-ups, however, longitudinal laboratory data are renowned for +their irregularity, temporality, absenteeism, and sparsity; In contrast, +recruitment for rare or specific cases is often constrained due to their +limited patient size and episodic observations. This study employed +self-supervised learning (SSL) to pretrain a generalized laboratory progress +(GLP) model that captures the overall progression of six common laboratory +markers in prevalent cardiovascular cases, with the intention of transferring +this knowledge to aid in the detection of specific cardiovascular event. GLP +implemented a two-stage training approach, leveraging the information embedded +within interpolated data and amplify the performance of SSL. After GLP +pretraining, it is transferred for TVR detection. The proposed two-stage +training improved the performance of pure SSL, and the transferability of GLP +exhibited distinctiveness. After GLP processing, the classification exhibited a +notable enhancement, with averaged accuracy rising from 0.63 to 0.90. All +evaluated metrics demonstrated substantial superiority (p < 0.01) compared to +prior GLP processing. Our study effectively engages in translational +engineering by transferring patient progression of cardiovascular laboratory +parameters from one patient group to another, transcending the limitations of +data availability. The transferability of disease progression optimized the +strategies of examinations and treatments, and improves patient prognosis while +using commonly available laboratory parameters. The potential for expanding +this approach to encompass other diseases holds great promise. + +
+
+ comment: published in IEEE Journal of Translational Engineering in Health & + Medicine +
+
+
+
+
+ + ♻ ☆ Graphon Pooling for Reducing Dimensionality of Signals and Convolutional + Operators on Graphs + + +
+ In this paper we propose a pooling approach for convolutional information +processing on graphs relying on the theory of graphons and limits of dense +graph sequences. We present three methods that exploit the induced graphon +representation of graphs and graph signals on partitions of [0, 1]2 in the +graphon space. As a result we derive low dimensional representations of the +convolutional operators, while a dimensionality reduction of the signals is +achieved by simple local interpolation of functions in L2([0, 1]). We prove +that those low dimensional representations constitute a convergent sequence of +graphs and graph signals, respectively. The methods proposed and the +theoretical guarantees that we provide show that the reduced graphs and signals +inherit spectral-structural properties of the original quantities. We evaluate +our approach with a set of numerical experiments performed on graph neural +networks (GNNs) that rely on graphon pooling. We observe that graphon pooling +performs significantly better than other approaches proposed in the literature +when dimensionality reduction ratios between layers are large. We also observe +that when graphon pooling is used we have, in general, less overfitting and +lower computational cost. + +
+
+
+
+
+ + ♻ ☆ Group Equality in Adaptive Submodular Maximization + + +
+ In this paper, we study the classic submodular maximization problem subject +to a group equality constraint under both non-adaptive and adaptive settings. +It has been shown that the utility function of many machine learning +applications, including data summarization, influence maximization in social +networks, and personalized recommendation, satisfies the property of +submodularity. Hence, maximizing a submodular function subject to various +constraints can be found at the heart of many of those applications. On a high +level, submodular maximization aims to select a group of most representative +items (e.g., data points). However, the design of most existing algorithms does +not incorporate the fairness constraint, leading to under- or +over-representation of some particular groups. This motivates us to study the +submodular maximization problem with group equality, where we aim to select a +group of items to maximize a (possibly non-monotone) submodular utility +function subject to a group equality constraint. To this end, we develop the +first constant-factor approximation algorithm for this problem. The design of +our algorithm is robust enough to be extended to solving the submodular +maximization problem under a more complicated adaptive setting. Moreover, we +further extend our study to incorporating a global cardinality constraint and +other fairness notations. + +
+
+ comment: This paper has been accepted by INFORMS Journal on Computing +
+
+
+
+
+ + ♻ ☆ Measuring Equality in Machine Learning Security Defenses: A Case Study + in Speech Recognition + + +
+ Over the past decade, the machine learning security community has developed a +myriad of defenses for evasion attacks. An understudied question in that +community is: for whom do these defenses defend? This work considers common +approaches to defending learned systems and how security defenses result in +performance inequities across different sub-populations. We outline appropriate +parity metrics for analysis and begin to answer this question through empirical +results of the fairness implications of machine learning security methods. We +find that many methods that have been proposed can cause direct harm, like +false rejection and unequal benefits from robustness training. The framework we +propose for measuring defense equality can be applied to robustly trained +models, preprocessing-based defenses, and rejection methods. We identify a set +of datasets with a user-centered application and a reasonable computational +cost suitable for case studies in measuring the equality of defenses. In our +case study of speech command recognition, we show how such adversarial training +and augmentation have non-equal but complex protections for social subgroups +across gender, accent, and age in relation to user coverage. We present a +comparison of equality between two rejection-based defenses: randomized +smoothing and neural rejection, finding randomized smoothing more equitable due +to the sampling mechanism for minority groups. This represents the first work +examining the disparity in the adversarial robustness in the speech domain and +the fairness evaluation of rejection-based defenses. + +
+
+ comment: Accepted to AISec'23 +
+
+
+
+
+ + ♻ ☆ Test-Time Adaptation for Visual Document Understanding + + +
+ For visual document understanding (VDU), self-supervised pretraining has been +shown to successfully generate transferable representations, yet, effective +adaptation of such representations to distribution shifts at test-time remains +to be an unexplored area. We propose DocTTA, a novel test-time adaptation +method for documents, that does source-free domain adaptation using unlabeled +target document data. DocTTA leverages cross-modality self-supervised learning +via masked visual language modeling, as well as pseudo labeling to adapt models +learned on a \textit{source} domain to an unlabeled \textit{target} domain at +test time. We introduce new benchmarks using existing public datasets for +various VDU tasks, including entity recognition, key-value extraction, and +document visual question answering. DocTTA shows significant improvements on +these compared to the source model performance, up to 1.89\% in (F1 score), +3.43\% (F1 score), and 17.68\% (ANLS score), respectively. Our benchmark +datasets are available at \url{https://saynaebrahimi.github.io/DocTTA.html}. + +
+
+ comment: Accepted at TMLR 2023 +
+
+
+
+
+ + ♻ ☆ Convergence of the Backward Deep BSDE Method with Applications to + Optimal Stopping Problems + + +
+ The optimal stopping problem is one of the core problems in financial +markets, with broad applications such as pricing American and Bermudan options. +The deep BSDE method [Han, Jentzen and E, PNAS, 115(34):8505-8510, 2018] has +shown great power in solving high-dimensional forward-backward stochastic +differential equations (FBSDEs), and inspired many applications. However, the +method solves backward stochastic differential equations (BSDEs) in a forward +manner, which can not be used for optimal stopping problems that in general +require running BSDE backwardly. To overcome this difficulty, a recent paper +[Wang, Chen, Sudjianto, Liu and Shen, arXiv:1807.06622, 2018] proposed the +backward deep BSDE method to solve the optimal stopping problem. In this paper, +we provide the rigorous theory for the backward deep BSDE method. Specifically, +1. We derive the a posteriori error estimation, i.e., the error of the +numerical solution can be bounded by the training loss function; and; 2. We +give an upper bound of the loss function, which can be sufficiently small +subject to universal approximations. We give two numerical examples, which +present consistent performance with the proved theory. + +
+
+
+
+
+ + ♻ ☆ Quantized Radio Map Estimation Using Tensor and Deep Generative Models + + +
+ Spectrum cartography (SC), also known as radio map estimation (RME), aims at +crafting multi-domain (e.g., frequency and space) radio power propagation maps +from limited sensor measurements. While early methods often lacked theoretical +support, recent works have demonstrated that radio maps can be provably +recovered using low-dimensional models -- such as the block-term tensor +decomposition (BTD) model and certain deep generative models (DGMs) -- of the +high-dimensional multi-domain radio signals. However, these existing provable +SC approaches assume that sensors send real-valued (full-resolution) +measurements to the fusion center, which is unrealistic. This work puts forth a +quantized SC framework that generalizes the BTD and DGM-based SC to scenarios +where heavily quantized sensor measurements are used. A maximum likelihood +estimation (MLE)-based SC framework under a Gaussian quantizer is proposed. +Recoverability of the radio map using the MLE criterion are characterized under +realistic conditions, e.g., imperfect radio map modeling and noisy +measurements. Simulations and real-data experiments are used to showcase the +effectiveness of the proposed approach. + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Conformal Prediction Regions for Time Series using Linear + Complementarity Programming + + +
+ Conformal prediction is a statistical tool for producing prediction regions +of machine learning models that are valid with high probability. However, +applying conformal prediction to time series data leads to conservative +prediction regions. In fact, to obtain prediction regions over $T$ time steps +with confidence $1-\delta$, {previous works require that each individual +prediction region is valid} with confidence $1-\delta/T$. We propose an +optimization-based method for reducing this conservatism to enable long horizon +planning and verification when using learning-enabled time series predictors. +Instead of considering prediction errors individually at each time step, we +consider a parameterized prediction error over multiple time steps. By +optimizing the parameters over an additional dataset, we find prediction +regions that are not conservative. We show that this problem can be cast as a +mixed integer linear complementarity program (MILCP), which we then relax into +a linear complementarity program (LCP). Additionally, we prove that the relaxed +LP has the same optimal cost as the original MILCP. Finally, we demonstrate the +efficacy of our method on case studies using pedestrian trajectory predictors +and F16 fighter jet altitude predictors. + +
+
+
+
+
+ + ♻ ☆ BadVFL: Backdoor Attacks in Vertical Federated Learning + + +
+ Federated learning (FL) enables multiple parties to collaboratively train a +machine learning model without sharing their data; rather, they train their own +model locally and send updates to a central server for aggregation. Depending +on how the data is distributed among the participants, FL can be classified +into Horizontal (HFL) and Vertical (VFL). In VFL, the participants share the +same set of training instances but only host a different and non-overlapping +subset of the whole feature space. Whereas in HFL, each participant shares the +same set of features while the training set is split into locally owned +training data subsets. + VFL is increasingly used in applications like financial fraud detection; +nonetheless, very little work has analyzed its security. In this paper, we +focus on robustness in VFL, in particular, on backdoor attacks, whereby an +adversary attempts to manipulate the aggregate model during the training +process to trigger misclassifications. Performing backdoor attacks in VFL is +more challenging than in HFL, as the adversary i) does not have access to the +labels during training and ii) cannot change the labels as she only has access +to the feature embeddings. We present a first-of-its-kind clean-label backdoor +attack in VFL, which consists of two phases: a label inference and a backdoor +phase. We demonstrate the effectiveness of the attack on three different +datasets, investigate the factors involved in its success, and discuss +countermeasures to mitigate its impact. + +
+
+ comment: Accepted for publication at the 45th IEEE Symposium on Security & + Privacy (S&P 2024). Please cite accordingly +
+
+
+
+
+ + ♻ ☆ BagPipe: Accelerating Deep Recommendation Model Training + + +
+ Deep learning based recommendation models (DLRM) are widely used in several +business critical applications. Training such recommendation models efficiently +is challenging because they contain billions of embedding-based parameters, +leading to significant overheads from embedding access. By profiling existing +systems for DLRM training, we observe that around 75\% of the iteration time is +spent on embedding access and model synchronization. Our key insight in this +paper is that embedding access has a specific structure which can be used to +accelerate training. We observe that embedding accesses are heavily skewed, +with around 1\% of embeddings representing more than 92\% of total accesses. +Further, we observe that during offline training we can lookahead at future +batches to determine exactly which embeddings will be needed at what iteration +in the future. Based on these insights, we develop Bagpipe, a system for +training deep recommendation models that uses caching and prefetching to +overlap remote embedding accesses with the computation. We design an Oracle +Cacher, a new component that uses a lookahead algorithm to generate optimal +cache update decisions while providing strong consistency guarantees against +staleness. We also design a logically replicated, physically partitioned cache +and show that our design can reduce synchronization overheads in a distributed +setting. Finally, we propose a disaggregated system architecture and show that +our design can enable low-overhead fault tolerance. Our experiments using three +datasets and four models show that Bagpipe provides a speed up of up to 5.6x +compared to state of the art baselines, while providing the same convergence +and reproducibility guarantees as synchronous training. + +
+
+
+
+
+ + ♻ ☆ Minimum intrinsic dimension scaling for entropic optimal transport + + +
+ Motivated by the manifold hypothesis, which states that data with a high +extrinsic dimension may yet have a low intrinsic dimension, we develop refined +statistical bounds for entropic optimal transport that are sensitive to the +intrinsic dimension of the data. Our bounds involve a robust notion of +intrinsic dimension, measured at only a single distance scale depending on the +regularization parameter, and show that it is only the minimum of these +single-scale intrinsic dimensions which governs the rate of convergence. We +call this the Minimum Intrinsic Dimension scaling (MID scaling) phenomenon, and +establish MID scaling with no assumptions on the data distributions so long as +the cost is bounded and Lipschitz, and for various entropic optimal transport +quantities beyond just values, with stronger analogs when one distribution is +supported on a manifold. Our results significantly advance the theoretical +state of the art by showing that MID scaling is a generic phenomenon, and +provide the first rigorous interpretation of the statistical effect of entropic +regularization as a distance scale. + +
+
+ comment: 53 pages +
+
+
+
+
+ + ♻ ☆ PruMUX: Augmenting Data Multiplexing with Model Compression ACL 2023 + + +
+ As language models increase in size by the day, methods for efficient +inference are critical to leveraging their capabilities for various +applications. Prior work has investigated techniques like model pruning, +knowledge distillation, and data multiplexing to increase model throughput +without sacrificing accuracy. In this paper, we combine two such methods -- +structured pruning and data multiplexing -- to compound the speedup gains +obtained by either method. Our approach, PruMUX, obtains up to 7.5-29.5X +throughput improvement over BERT-base model with accuracy threshold from 80% to +74%. We further study various combinations of parameters (such as sparsity and +multiplexing factor) in the two techniques to provide a comprehensive analysis +of the tradeoff between accuracy and throughput in the resulting models. We +then propose Auto-PruMUX, a meta-level model that can predict the +high-performance parameters for pruning and multiplexing given a desired +accuracy loss budget, providing a practical method to leverage the combination +effectively. + +
+
+ comment: Published at Findings of the Association for Computational + Linguistics (ACL 2023) +
+
+
+
+
+ + ♻ ☆ Wasserstein Geodesic Generator for Conditional Distributions + + +
+ Generating samples given a specific label requires estimating conditional +distributions. We derive a tractable upper bound of the Wasserstein distance +between conditional distributions to lay the theoretical groundwork to learn +conditional distributions. Based on this result, we propose a novel conditional +generation algorithm where conditional distributions are fully characterized by +a metric space defined by a statistical distance. We employ optimal transport +theory to propose the Wasserstein geodesic generator, a new conditional +generator that learns the Wasserstein geodesic. The proposed method learns both +conditional distributions for observed domains and optimal transport maps +between them. The conditional distributions given unobserved intermediate +domains are on the Wasserstein geodesic between conditional distributions given +two observed domain labels. Experiments on face images with light conditions as +domain labels demonstrate the efficacy of the proposed method. + +
+
+
+
+
+ + ♻ ☆ Multi-fidelity Fourier Neural Operator for Fast Modeling of Large-Scale + Geological Carbon Storage + + +
+ Deep learning-based surrogate models have been widely applied in geological +carbon storage (GCS) problems to accelerate the prediction of reservoir +pressure and CO2 plume migration. Large amounts of data from physics-based +numerical simulators are required to train a model to accurately predict the +complex physical behaviors associated with this process. In practice, the +available training data are always limited in large-scale 3D problems due to +the high computational cost. Therefore, we propose to use a multi-fidelity +Fourier Neural Operator to solve large-scale GCS problems with more affordable +multi-fidelity training datasets. The Fourier Neural Operator has a desirable +grid-invariant property, which simplifies the transfer learning procedure +between datasets with different discretization. We first test the model +efficacy on a GCS reservoir model being discretized into 110k grid cells. The +multi-fidelity model can predict with accuracy comparable to a high-fidelity +model trained with the same amount of high-fidelity data with 81% less data +generation costs. We further test the generalizability of the multi-fidelity +model on a same reservoir model with a finer discretization of 1 million grid +cells. This case was made more challenging by employing high-fidelity and +low-fidelity datasets generated by different geostatistical models and +reservoir simulators. We observe that the multi-fidelity FNO model can predict +pressure fields with reasonable accuracy even when the high-fidelity data are +extremely limited. + +
+
+
+
+
+ + ♻ ☆ Conditional expectation using compactification operators + + +
+ The separate tasks of denoising, least squares expectation, and manifold +learning can often be posed in a common setting of finding the conditional +expectations arising from a product of two random variables. This paper focuses +on this more general problem and describes an operator theoretic approach to +estimating the conditional expectation. Kernel integral operators are used as a +compactification tool, to set up the estimation problem as a linear inverse +problem in a reproducing kernel Hilbert space. This equation is shown to have +solutions that allow numerical approximation, thus guaranteeing the convergence +of data-driven implementations. The overall technique is easy to implement, and +their successful application to some real-world problems are also shown. + +
+
+
+
+
+ + ♻ ☆ LANISTR: Multimodal Learning from Structured and Unstructured Data + + +
+ Multimodal large-scale pretraining has shown impressive performance for +unstructured data including language, image, audio, and video. However, a +prevalent real-world scenario involves the combination of structured data types +(tabular, time-series) with unstructured data which has so far been +understudied. To bridge this gap, we propose LANISTR, an attention-based +framework to learn from LANguage, Image, and STRuctured data. The core of +LANISTR's methodology is rooted in \textit{masking-based} training applied +across both unimodal and multimodal levels. In particular, we introduce a new +similarity-based multimodal masking loss that enables it to learn cross-modal +relations from large-scale multimodal data with missing modalities. On two +real-world datastes, MIMIC-IV (healthcare) and Amazon Product Review (retail), +LANISTR demonstrates remarkable absolute improvements of 6.6\% (AUROC) and up +to 14\% (accuracy) when fine-tuned on 0.1\% and 0.01\% of labeled data, +respectively, compared to the state-of-the-art alternatives. Notably, these +improvements are observed even in the presence of considerable missingness +ratios of 35.7\% and 99.8\%, in the respective datasets. + +
+
+
+
+
+ + ♻ ☆ Pareto Invariant Representation Learning for Multimedia Recommendation ACM MM 2023 + + +
+ Multimedia recommendation involves personalized ranking tasks, where +multimedia content is usually represented using a generic encoder. However, +these generic representations introduce spurious correlations that fail to +reveal users' true preferences. Existing works attempt to alleviate this +problem by learning invariant representations, but overlook the balance between +independent and identically distributed (IID) and out-of-distribution (OOD) +generalization. In this paper, we propose a framework called Pareto Invariant +Representation Learning (PaInvRL) to mitigate the impact of spurious +correlations from an IID-OOD multi-objective optimization perspective, by +learning invariant representations (intrinsic factors that attract user +attention) and variant representations (other factors) simultaneously. +Specifically, PaInvRL includes three iteratively executed modules: (i) +heterogeneous identification module, which identifies the heterogeneous +environments to reflect distributional shifts for user-item interactions; (ii) +invariant mask generation module, which learns invariant masks based on the +Pareto-optimal solutions that minimize the adaptive weighted Invariant Risk +Minimization (IRM) and Empirical Risk (ERM) losses; (iii) convert module, which +generates both variant representations and item-invariant representations for +training a multi-modal recommendation model that mitigates spurious +correlations and balances the generalization performance within and cross the +environmental distributions. We compare the proposed PaInvRL with +state-of-the-art recommendation models on three public multimedia +recommendation datasets (Movielens, Tiktok, and Kwai), and the experimental +results validate the effectiveness of PaInvRL for both within- and +cross-environmental learning. + +
+
+ comment: ACM MM 2023 full paper +
+
+
+
+
+ + ♻ ☆ StableDR: Stabilized Doubly Robust Learning for Recommendation on Data + Missing Not at Random ICLR 23 + + +
+ In recommender systems, users always choose the favorite items to rate, which +leads to data missing not at random and poses a great challenge for unbiased +evaluation and learning of prediction models. Currently, the doubly robust (DR) +methods have been widely studied and demonstrate superior performance. However, +in this paper, we show that DR methods are unstable and have unbounded bias, +variance, and generalization bounds to extremely small propensities. Moreover, +the fact that DR relies more on extrapolation will lead to suboptimal +performance. To address the above limitations while retaining double +robustness, we propose a stabilized doubly robust (StableDR) learning approach +with a weaker reliance on extrapolation. Theoretical analysis shows that +StableDR has bounded bias, variance, and generalization error bound +simultaneously under inaccurate imputed errors and arbitrarily small +propensities. In addition, we propose a novel learning approach for StableDR +that updates the imputation, propensity, and prediction models cyclically, +achieving more stable and accurate predictions. Extensive experiments show that +our approaches significantly outperform the existing methods. + +
+
+ comment: ICLR 23 +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ Aparecium: Revealing Secrets from Physical Photographs + + +
+ Watermarking is a crucial tool for safeguarding copyrights and can serve as a +more aesthetically pleasing alternative to QR codes. In recent years, +watermarking methods based on deep learning have proved superior robustness +against complex physical distortions than traditional watermarking methods. +However, they have certain limitations that render them less effective in +practice. For instance, current solutions necessitate physical photographs to +be rectangular for accurate localization, cannot handle physical bending or +folding, and require the hidden area to be completely captured at a close +distance and small angle. To overcome these challenges, we propose a novel deep +watermarking framework dubbed \textit{Aparecium}. Specifically, we preprocess +secrets (i.e., watermarks) into a pattern and then embed it into the cover +image, which is symmetrical to the final decoding-then-extracting process. To +capture the watermarked region from complex physical scenarios, a locator is +also introduced. Besides, we adopt a three-stage training strategy for training +convergence. Extensive experiments demonstrate that \textit{Aparecium} is not +only robust against different digital distortions, but also can resist various +physical distortions, such as screen-shooting and printing-shooting, even in +severe cases including different shapes, curvature, folding, incompleteness, +long distances, and big angles while maintaining high visual quality. +Furthermore, some ablation studies are also conducted to verify our design. + +
+
+
+
+
+ + ☆ CgT-GAN: CLIP-guided Text GAN for Image Captioning ACM MM 2023 + + +
+ The large-scale visual-language pre-trained model, Contrastive Language-Image +Pre-training (CLIP), has significantly improved image captioning for scenarios +without human-annotated image-caption pairs. Recent advanced CLIP-based image +captioning without human annotations follows a text-only training paradigm, +i.e., reconstructing text from shared embedding space. Nevertheless, these +approaches are limited by the training/inference gap or huge storage +requirements for text embeddings. Given that it is trivial to obtain images in +the real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates +images into the training process to enable the model to "see" real visual +modality. Particularly, we use adversarial training to teach CgT-GAN to mimic +the phrases of an external text corpus and CLIP-based reward to provide +semantic guidance. The caption generator is jointly rewarded based on the +caption naturalness to human language calculated from the GAN's discriminator +and the semantic guidance reward computed by the CLIP-based reward module. In +addition to the cosine similarity as the semantic guidance reward (i.e., +CLIP-cos), we further introduce a novel semantic guidance reward called +CLIP-agg, which aligns the generated caption with a weighted text embedding by +attentively aggregating the entire corpus. Experimental results on three +subtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms +state-of-the-art methods significantly across all metrics. Code is available at +https://github.com/Lihr747/CgtGAN. + +
+
+ comment: Accepted at ACM MM 2023 +
+
+
+
+
+ + ☆ EVE: Efficient Vision-Language Pre-training with Masked Prediction and + Modality-Aware MoE + + +
+ Building scalable vision-language models to learn from diverse, multimodal +data remains an open challenge. In this paper, we introduce an Efficient +Vision-languagE foundation model, namely EVE, which is one unified multimodal +Transformer pre-trained solely by one unified pre-training task. Specifically, +EVE encodes both vision and language within a shared Transformer network +integrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which +capture modality-specific information by selectively switching to different +experts. To unify pre-training tasks of vision and language, EVE performs +masked signal modeling on image-text pairs to reconstruct masked signals, i.e., +image pixels and text tokens, given visible signals. This simple yet effective +pre-training objective accelerates training by 3.5x compared to the model +pre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing +to the combination of the unified architecture and pre-training task, EVE is +easy to scale up, enabling better downstream performance with fewer resources +and faster training speed. Despite its simplicity, EVE achieves +state-of-the-art performance on various vision-language downstream tasks, +including visual question answering, visual reasoning, and image-text +retrieval. + +
+
+
+
+
+ + ☆ With a Little Help from your own Past: Prototypical Memory Networks for + Image Captioning ICCV 2023 + + +
+ Image captioning, like many tasks involving vision and language, currently +relies on Transformer-based architectures for extracting the semantics in an +image and translating it into linguistically coherent descriptions. Although +successful, the attention operator only considers a weighted summation of +projections of the current input sample, therefore ignoring the relevant +semantic information which can come from the joint observation of other +samples. In this paper, we devise a network which can perform attention over +activations obtained while processing other training samples, through a +prototypical memory model. Our memory models the distribution of past keys and +values through the definition of prototype vectors which are both +discriminative and compact. Experimentally, we assess the performance of the +proposed model on the COCO dataset, in comparison with carefully designed +baselines and state-of-the-art approaches, and by investigating the role of +each of the proposed components. We demonstrate that our proposal can increase +the performance of an encoder-decoder Transformer by 3.7 CIDEr points both when +training in cross-entropy only and when fine-tuning with self-critical sequence +training. Source code and trained models are available at: +https://github.com/aimagelab/PMA-Net. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ AdVerb: Visually Guided Audio Dereverberation ICCV 2023 + + +
+ We present AdVerb, a novel audio-visual dereverberation framework that uses +visual cues in addition to the reverberant sound to estimate clean audio. +Although audio-only dereverberation is a well-studied problem, our approach +incorporates the complementary visual modality to perform audio +dereverberation. Given an image of the environment where the reverberated sound +signal has been recorded, AdVerb employs a novel geometry-aware cross-modal +transformer architecture that captures scene geometry and audio-visual +cross-modal relationship to generate a complex ideal ratio mask, which, when +applied to the reverberant audio predicts the clean sound. The effectiveness of +our method is demonstrated through extensive quantitative and qualitative +evaluations. Our approach significantly outperforms traditional audio-only and +audio-visual baselines on three downstream tasks: speech enhancement, speech +recognition, and speaker verification, with relative improvements in the range +of 18% - 82% on the LibriSpeech test-clean set. We also achieve highly +satisfactory RT60 error scores on the AVSpeech dataset. + +
+
+ comment: Accepted at ICCV 2023. For project page, see + https://gamma.umd.edu/researchdirections/speech/adverb +
+
+
+
+
+ + ♻ ☆ Multimodal Garment Designer: Human-Centric Latent Diffusion Models for + Fashion Image Editing ICCV 2023 + + +
+ Fashion illustration is used by designers to communicate their vision and to +bring the design idea from conceptualization to realization, showing how +clothes interact with the human body. In this context, computer vision can thus +be used to improve the fashion design process. Differently from previous works +that mainly focused on the virtual try-on of garments, we propose the task of +multimodal-conditioned fashion image editing, guiding the generation of +human-centric fashion images by following multimodal prompts, such as text, +human body poses, and garment sketches. We tackle this problem by proposing a +new architecture based on latent diffusion models, an approach that has not +been used before in the fashion domain. Given the lack of existing datasets +suitable for the task, we also extend two existing fashion datasets, namely +Dress Code and VITON-HD, with multimodal annotations collected in a +semi-automatic manner. Experimental results on these new datasets demonstrate +the effectiveness of our proposal, both in terms of realism and coherence with +the given multimodal inputs. Source code and collected multimodal annotations +are publicly available at: +https://github.com/aimagelab/multimodal-garment-designer. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ VoxBlink: X-Large Speaker Verification Dataset on Camera ICASSP2024 + + +
+ In this paper, we contribute a novel and extensive dataset for speaker +verification, which contains noisy 38k identities/1.45M utterances (VoxBlink) +and relatively cleaned 18k identities/1.02M (VoxBlink-Clean) utterances for +training. Firstly, we accumulate a 60K+ users' list with their avatars and +download their short videos on YouTube. We then established an automatic and +scalable pipeline to extract relevant speech and video segments from these +videos. To our knowledge, the VoxBlink dataset is one of the largest speaker +recognition datasets available. Secondly, we conduct a series of experiments +based on different backbones trained on a mix of the VoxCeleb2 and the +VoxBlink-Clean. Our findings highlight a notable performance improvement, +ranging from 13% to 30%, across different backbone architectures upon +integrating our dataset for training. The dataset will be made publicly +available shortly. + +
+
+ comment: submit to ICASSP2024 +
+
+
+
+
+ + ♻ ☆ A Tale of Two Graphs: Freezing and Denoising Graph Structures for + Multimodal Recommendation + + +
+ Multimodal recommender systems utilizing multimodal features (e.g., images +and textual descriptions) typically show better recommendation accuracy than +general recommendation models based solely on user-item interactions. +Generally, prior work fuses multimodal features into item ID embeddings to +enrich item representations, thus failing to capture the latent semantic +item-item structures. In this context, LATTICE proposes to learn the latent +structure between items explicitly and achieves state-of-the-art performance +for multimodal recommendations. However, we argue the latent graph structure +learning of LATTICE is both inefficient and unnecessary. Experimentally, we +demonstrate that freezing its item-item structure before training can also +achieve competitive performance. Based on this finding, we propose a simple yet +effective model, dubbed as FREEDOM, that FREEzes the item-item graph and +DenOises the user-item interaction graph simultaneously for Multimodal +recommendation. Theoretically, we examine the design of FREEDOM through a graph +spectral perspective and demonstrate that it possesses a tighter upper bound on +the graph spectrum. In denoising the user-item interaction graph, we devise a +degree-sensitive edge pruning method, which rejects possibly noisy edges with a +high probability when sampling the graph. We evaluate the proposed model on +three real-world datasets and show that FREEDOM can significantly outperform +current strongest baselines. Compared with LATTICE, FREEDOM achieves an average +improvement of 19.07% in recommendation accuracy while reducing its memory cost +up to 6$\times$ on large graphs. The source code is available at: +https://github.com/enoche/FREEDOM. + +
+
+ comment: Accepted to ACM Multimedia (MM) 2023 +
+
+
+
+
+ + ♻ ☆ Display object alignment may influence location recall in unexpected + ways + + +
+ There is a presumption in human-computer interaction that laying out menus +and most other material in neat rows and columns helps users get work done. The +rule has been so implicit in the field of design as to allow for no debate. +However, the idea that perfect collinearity benefits creates an advantage for +both either search and or recall has rarely been tested. Drawing from separate +branches of cognitive literature, we tested a minimal brainstorming interface +with either aligned or eccentrically arranged layouts on 96 college students. +Incidental exact recall of recently worked locations improved in the eccentric +condition. And in both conditions there were frequent near-miss recall errors +to neighboring aligned objects and groups of objects. Further analysis found +only marginal performance advantages specifically for females with the +eccentric design. However, NASA-TLX subjective measures showed that in +eccentric, females reported higher performance, less effort, and yet also +higher frustration; while males reported lower performance with about the same +effort, and lower frustration. + +
+
+ comment: superseded by arXiv:2308.12201 +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`